nir: Rewrite and merge 16bit tex folding pass with 16bit image folding pass.
Allow folding constants/undef sources by sharing more code with the image_store 16bit folding pass. Allow more than one set of sources because RADV wants two, one for G16 (ddx/ddy) and one for A16 (all other sources). Allow folding cube sampling destination conversions on radeonsi/radv because I think the limitation only applies to sources. Signed-off-by: Georg Lehmann <dadschoorse@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16978>
This commit is contained in:
parent
06b33770b6
commit
87e3277b82
|
@ -4888,15 +4888,14 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout
|
|||
}
|
||||
if (((stages[i].nir->info.bit_sizes_int | stages[i].nir->info.bit_sizes_float) & 16) &&
|
||||
device->physical_device->rad_info.gfx_level >= GFX9) {
|
||||
uint32_t sampler_dims = UINT32_MAX;
|
||||
/* Skip because AMD doesn't support 16-bit types with these. */
|
||||
sampler_dims &= ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE);
|
||||
// TODO: also optimize the tex srcs. see radeonSI for reference */
|
||||
/* Skip if there are potentially conflicting rounding modes */
|
||||
if (!nir_has_any_rounding_mode_enabled(stages[i].nir->info.float_controls_execution_mode))
|
||||
NIR_PASS(_, stages[i].nir, nir_fold_16bit_sampler_conversions, 0, sampler_dims);
|
||||
struct nir_fold_16bit_tex_image_options fold_16bit_options = {
|
||||
.rounding_mode = nir_rounding_mode_rtne,
|
||||
.fold_tex_dest = true,
|
||||
.fold_image_load_store_data = true,
|
||||
};
|
||||
NIR_PASS(_, stages[i].nir, nir_fold_16bit_tex_image, &fold_16bit_options);
|
||||
|
||||
NIR_PASS(_, stages[i].nir, nir_fold_16bit_image_load_store_conversions);
|
||||
NIR_PASS(_, stages[i].nir, nir_opt_vectorize, opt_vectorize_callback, device);
|
||||
}
|
||||
|
||||
|
|
|
@ -5339,9 +5339,22 @@ bool nir_lower_mediump_io(nir_shader *nir, nir_variable_mode modes,
|
|||
bool nir_force_mediump_io(nir_shader *nir, nir_variable_mode modes,
|
||||
nir_alu_type types);
|
||||
bool nir_unpack_16bit_varying_slots(nir_shader *nir, nir_variable_mode modes);
|
||||
bool nir_fold_16bit_sampler_conversions(nir_shader *nir,
|
||||
unsigned tex_src_types, uint32_t sampler_dims);
|
||||
bool nir_fold_16bit_image_load_store_conversions(nir_shader *nir);
|
||||
|
||||
struct nir_fold_tex_srcs_options {
|
||||
unsigned sampler_dims;
|
||||
unsigned src_types;
|
||||
};
|
||||
|
||||
struct nir_fold_16bit_tex_image_options {
|
||||
nir_rounding_mode rounding_mode;
|
||||
bool fold_tex_dest;
|
||||
bool fold_image_load_store_data;
|
||||
unsigned fold_srcs_options_count;
|
||||
struct nir_fold_tex_srcs_options *fold_srcs_options;
|
||||
};
|
||||
|
||||
bool nir_fold_16bit_tex_image(nir_shader *nir,
|
||||
struct nir_fold_16bit_tex_image_options *options);
|
||||
|
||||
typedef struct {
|
||||
bool legalize_type; /* whether this src should be legalized */
|
||||
|
|
|
@ -376,7 +376,6 @@ static bool
|
|||
is_f32_to_f16_conversion(nir_instr *instr)
|
||||
{
|
||||
return is_n_to_m_conversion(instr, 32, nir_op_f2f16) ||
|
||||
is_n_to_m_conversion(instr, 32, nir_op_f2f16_rtne) ||
|
||||
is_n_to_m_conversion(instr, 32, nir_op_f2fmp);
|
||||
}
|
||||
|
||||
|
@ -396,156 +395,8 @@ static bool
|
|||
is_i32_to_i16_conversion(nir_instr *instr)
|
||||
{
|
||||
return is_n_to_m_conversion(instr, 32, nir_op_i2i16) ||
|
||||
is_n_to_m_conversion(instr, 32, nir_op_u2u16);
|
||||
}
|
||||
|
||||
static void
|
||||
replace_with_mov(nir_builder *b, nir_instr *instr, nir_src *src,
|
||||
nir_alu_instr *alu)
|
||||
{
|
||||
nir_ssa_def *mov = nir_mov_alu(b, alu->src[0],
|
||||
nir_dest_num_components(alu->dest.dest));
|
||||
assert(!alu->dest.saturate);
|
||||
nir_instr_rewrite_src_ssa(instr, src, mov);
|
||||
}
|
||||
|
||||
/**
|
||||
* If texture source operands use f16->f32 conversions or return values are
|
||||
* followed by f16->f32 or f32->f16, remove those conversions. This benefits
|
||||
* drivers that have texture opcodes that can accept and return 16-bit types.
|
||||
*
|
||||
* "tex_src_types" is a mask of nir_tex_src_* operands that should be handled.
|
||||
* It's always done for the destination.
|
||||
*
|
||||
* This should be run after late algebraic optimizations.
|
||||
* Copy propagation and DCE should be run after this.
|
||||
*/
|
||||
bool
|
||||
nir_fold_16bit_sampler_conversions(nir_shader *nir,
|
||||
unsigned tex_src_types,
|
||||
uint32_t sampler_dims)
|
||||
{
|
||||
bool changed = false;
|
||||
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
|
||||
assert(impl);
|
||||
|
||||
nir_builder b;
|
||||
nir_builder_init(&b, impl);
|
||||
|
||||
nir_foreach_block_safe (block, impl) {
|
||||
nir_foreach_instr_safe (instr, block) {
|
||||
if (instr->type != nir_instr_type_tex)
|
||||
continue;
|
||||
|
||||
nir_tex_instr *tex = nir_instr_as_tex(instr);
|
||||
nir_instr *src;
|
||||
nir_alu_instr *src_alu;
|
||||
|
||||
/* Skip sparse residency */
|
||||
if (tex->is_sparse)
|
||||
continue;
|
||||
|
||||
if ((tex->op == nir_texop_txs ||
|
||||
tex->op == nir_texop_query_levels) ||
|
||||
!(sampler_dims & BITFIELD_BIT(tex->sampler_dim)))
|
||||
continue;
|
||||
|
||||
/* Optimize source operands. */
|
||||
for (unsigned i = 0; i < tex->num_srcs; i++) {
|
||||
/* Filter out sources that should be ignored. */
|
||||
if (!(BITFIELD_BIT(tex->src[i].src_type) & tex_src_types))
|
||||
continue;
|
||||
|
||||
src = tex->src[i].src.ssa->parent_instr;
|
||||
if (src->type != nir_instr_type_alu)
|
||||
continue;
|
||||
|
||||
src_alu = nir_instr_as_alu(src);
|
||||
b.cursor = nir_before_instr(src);
|
||||
|
||||
nir_alu_type src_type = nir_tex_instr_src_type(tex, i);
|
||||
|
||||
/* Handle vector sources that are made of scalar instructions. */
|
||||
if (nir_op_is_vec(src_alu->op)) {
|
||||
/* See if the vector is made of f16->f32 opcodes. */
|
||||
unsigned num = nir_dest_num_components(src_alu->dest.dest);
|
||||
bool is_f16_to_f32 = src_type == nir_type_float;
|
||||
bool is_u16_to_u32 = src_type & (nir_type_int | nir_type_uint);
|
||||
|
||||
for (unsigned comp = 0; comp < num; comp++) {
|
||||
nir_instr *instr = src_alu->src[comp].src.ssa->parent_instr;
|
||||
is_f16_to_f32 &= is_f16_to_f32_conversion(instr);
|
||||
/* Zero-extension (u16) and sign-extension (i16) have
|
||||
* the same behavior here - txf returns 0 if bit 15 is set
|
||||
* because it's out of bounds and the higher bits don't
|
||||
* matter.
|
||||
*/
|
||||
is_u16_to_u32 &= is_u16_to_u32_conversion(instr) ||
|
||||
is_i16_to_i32_conversion(instr);
|
||||
}
|
||||
|
||||
if (!is_f16_to_f32 && !is_u16_to_u32)
|
||||
continue;
|
||||
|
||||
nir_alu_instr *new_vec = nir_alu_instr_clone(nir, src_alu);
|
||||
nir_instr_insert_after(&src_alu->instr, &new_vec->instr);
|
||||
|
||||
/* Replace conversions with mov. */
|
||||
for (unsigned comp = 0; comp < num; comp++) {
|
||||
nir_instr *instr = new_vec->src[comp].src.ssa->parent_instr;
|
||||
replace_with_mov(&b, &new_vec->instr,
|
||||
&new_vec->src[comp].src,
|
||||
nir_instr_as_alu(instr));
|
||||
}
|
||||
|
||||
new_vec->dest.dest.ssa.bit_size =
|
||||
new_vec->src[0].src.ssa->bit_size;
|
||||
nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[i].src,
|
||||
&new_vec->dest.dest.ssa);
|
||||
changed = true;
|
||||
} else if ((is_f16_to_f32_conversion(&src_alu->instr) &&
|
||||
src_type == nir_type_float) ||
|
||||
((is_u16_to_u32_conversion(&src_alu->instr) ||
|
||||
is_i16_to_i32_conversion(&src_alu->instr)) &&
|
||||
src_type & (nir_type_int | nir_type_uint))) {
|
||||
/* Handle scalar sources. */
|
||||
replace_with_mov(&b, &tex->instr, &tex->src[i].src, src_alu);
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Optimize the destination. */
|
||||
bool is_f32_to_f16 = tex->dest_type & nir_type_float;
|
||||
/* same behavior for int and uint */
|
||||
bool is_i32_to_i16 = tex->dest_type & (nir_type_int | nir_type_uint);
|
||||
|
||||
nir_foreach_use(use, &tex->dest.ssa) {
|
||||
is_f32_to_f16 &= is_f32_to_f16_conversion(use->parent_instr);
|
||||
is_i32_to_i16 &= is_i32_to_i16_conversion(use->parent_instr);
|
||||
}
|
||||
|
||||
if (is_f32_to_f16 || is_i32_to_i16) {
|
||||
/* All uses are the same conversions. Replace them with mov. */
|
||||
nir_foreach_use(use, &tex->dest.ssa) {
|
||||
nir_alu_instr *conv = nir_instr_as_alu(use->parent_instr);
|
||||
conv->op = nir_op_mov;
|
||||
tex->dest.ssa.bit_size = conv->dest.dest.ssa.bit_size;
|
||||
tex->dest_type = (tex->dest_type & (~16 & ~32 & ~64)) |
|
||||
conv->dest.dest.ssa.bit_size;
|
||||
}
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (changed) {
|
||||
nir_metadata_preserve(impl, nir_metadata_dominance |
|
||||
nir_metadata_block_index);
|
||||
} else {
|
||||
nir_metadata_preserve(impl, nir_metadata_all);
|
||||
}
|
||||
|
||||
return changed;
|
||||
is_n_to_m_conversion(instr, 32, nir_op_u2u16) ||
|
||||
is_n_to_m_conversion(instr, 32, nir_op_i2imp);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -662,6 +513,73 @@ const_is_i16(nir_ssa_scalar scalar)
|
|||
return value == (int16_t) value;
|
||||
}
|
||||
|
||||
static bool
|
||||
can_fold_16bit_src(nir_ssa_def *ssa, nir_alu_type src_type, bool sext_matters)
|
||||
{
|
||||
bool fold_f16 = src_type == nir_type_float32;
|
||||
bool fold_u16 = src_type == nir_type_uint32 && sext_matters;
|
||||
bool fold_i16 = src_type == nir_type_int32 && sext_matters;
|
||||
bool fold_i16_u16 = (src_type == nir_type_uint32 || src_type == nir_type_int32) && !sext_matters;
|
||||
|
||||
bool can_fold = fold_f16 || fold_u16 || fold_i16 || fold_i16_u16;
|
||||
for (unsigned i = 0; can_fold && i < ssa->num_components; i++) {
|
||||
nir_ssa_scalar comp = nir_ssa_scalar_resolved(ssa, i);
|
||||
if (comp.def->parent_instr->type == nir_instr_type_ssa_undef)
|
||||
continue;
|
||||
else if (nir_ssa_scalar_is_const(comp)) {
|
||||
if (fold_f16)
|
||||
can_fold &= const_is_f16(comp);
|
||||
else if (fold_u16)
|
||||
can_fold &= const_is_u16(comp);
|
||||
else if (fold_i16)
|
||||
can_fold &= const_is_i16(comp);
|
||||
else if (fold_i16_u16)
|
||||
can_fold &= (const_is_u16(comp) || const_is_i16(comp));
|
||||
} else {
|
||||
if (fold_f16)
|
||||
can_fold &= is_f16_to_f32_conversion(comp.def->parent_instr);
|
||||
else if (fold_u16)
|
||||
can_fold &= is_u16_to_u32_conversion(comp.def->parent_instr);
|
||||
else if (fold_i16)
|
||||
can_fold &= is_i16_to_i32_conversion(comp.def->parent_instr);
|
||||
else if (fold_i16_u16)
|
||||
can_fold &= (is_i16_to_i32_conversion(comp.def->parent_instr) ||
|
||||
is_u16_to_u32_conversion(comp.def->parent_instr));
|
||||
}
|
||||
}
|
||||
|
||||
return can_fold;
|
||||
}
|
||||
|
||||
static void
|
||||
fold_16bit_src(nir_builder *b, nir_instr *instr, nir_src *src, nir_alu_type src_type)
|
||||
{
|
||||
b->cursor = nir_before_instr(instr);
|
||||
|
||||
nir_ssa_scalar new_comps[NIR_MAX_VEC_COMPONENTS];
|
||||
for (unsigned i = 0; i < src->ssa->num_components; i++) {
|
||||
nir_ssa_scalar comp = nir_ssa_scalar_resolved(src->ssa, i);
|
||||
|
||||
if (comp.def->parent_instr->type == nir_instr_type_ssa_undef)
|
||||
new_comps[i] = nir_get_ssa_scalar(nir_ssa_undef(b, 1, 16), 0);
|
||||
else if (nir_ssa_scalar_is_const(comp)) {
|
||||
nir_ssa_def *constant;
|
||||
if (src_type == nir_type_float32)
|
||||
constant = nir_imm_float16(b, nir_ssa_scalar_as_float(comp));
|
||||
else
|
||||
constant = nir_imm_intN_t(b, nir_ssa_scalar_as_uint(comp), 16);
|
||||
new_comps[i] = nir_get_ssa_scalar(constant, 0);
|
||||
} else {
|
||||
/* conversion instruction */
|
||||
new_comps[i] = nir_ssa_scalar_chase_alu_src(comp, 0);
|
||||
}
|
||||
}
|
||||
|
||||
nir_ssa_def *new_vec = nir_vec_scalars(b, new_comps, src->ssa->num_components);
|
||||
|
||||
nir_instr_rewrite_src_ssa(instr, src, new_vec);
|
||||
}
|
||||
|
||||
static bool
|
||||
fold_16bit_store_data(nir_builder *b, nir_intrinsic_instr *instr)
|
||||
{
|
||||
|
@ -670,49 +588,10 @@ fold_16bit_store_data(nir_builder *b, nir_intrinsic_instr *instr)
|
|||
|
||||
b->cursor = nir_before_instr(&instr->instr);
|
||||
|
||||
bool fold_f16 = src_type == nir_type_float32;
|
||||
bool fold_u16 = src_type == nir_type_uint32;
|
||||
bool fold_i16 = src_type == nir_type_int32;
|
||||
|
||||
nir_ssa_scalar comps[NIR_MAX_VEC_COMPONENTS];
|
||||
for (unsigned i = 0; i < instr->num_components; i++) {
|
||||
comps[i] = nir_ssa_scalar_resolved(data_src->ssa, i);
|
||||
if (comps[i].def->parent_instr->type == nir_instr_type_ssa_undef)
|
||||
continue;
|
||||
else if (nir_ssa_scalar_is_const(comps[i])) {
|
||||
fold_f16 &= const_is_f16(comps[i]);
|
||||
fold_u16 &= const_is_u16(comps[i]);
|
||||
fold_i16 &= const_is_i16(comps[i]);
|
||||
} else {
|
||||
fold_f16 &= is_f16_to_f32_conversion(comps[i].def->parent_instr);
|
||||
fold_u16 &= is_u16_to_u32_conversion(comps[i].def->parent_instr);
|
||||
fold_i16 &= is_i16_to_i32_conversion(comps[i].def->parent_instr);
|
||||
}
|
||||
}
|
||||
|
||||
if (!fold_f16 && !fold_u16 && !fold_i16)
|
||||
if (!can_fold_16bit_src(data_src->ssa, src_type, true))
|
||||
return false;
|
||||
|
||||
nir_ssa_scalar new_comps[NIR_MAX_VEC_COMPONENTS];
|
||||
for (unsigned i = 0; i < instr->num_components; i++) {
|
||||
if (comps[i].def->parent_instr->type == nir_instr_type_ssa_undef)
|
||||
new_comps[i] = nir_get_ssa_scalar(nir_ssa_undef(b, 1, 16), 0);
|
||||
else if (nir_ssa_scalar_is_const(comps[i])) {
|
||||
nir_ssa_def *constant;
|
||||
if (src_type == nir_type_float32)
|
||||
constant = nir_imm_float16(b, nir_ssa_scalar_as_float(comps[i]));
|
||||
else
|
||||
constant = nir_imm_intN_t(b, nir_ssa_scalar_as_uint(comps[i]), 16);
|
||||
new_comps[i] = nir_get_ssa_scalar(constant, 0);
|
||||
} else {
|
||||
/* conversion instruction */
|
||||
new_comps[i] = nir_ssa_scalar_chase_alu_src(comps[i], 0);
|
||||
}
|
||||
}
|
||||
|
||||
nir_ssa_def *new_vec = nir_vec_scalars(b, new_comps, instr->num_components);
|
||||
|
||||
nir_instr_rewrite_src_ssa(&instr->instr, data_src, new_vec);
|
||||
fold_16bit_src(b, &instr->instr, data_src, src_type);
|
||||
|
||||
nir_intrinsic_set_src_type(instr, (src_type & ~32) | 16);
|
||||
|
||||
|
@ -720,70 +599,169 @@ fold_16bit_store_data(nir_builder *b, nir_intrinsic_instr *instr)
|
|||
}
|
||||
|
||||
static bool
|
||||
fold_16bit_load_data(nir_builder *b, nir_intrinsic_instr *instr)
|
||||
fold_16bit_destination(nir_ssa_def *ssa, nir_alu_type dest_type,
|
||||
unsigned exec_mode, nir_rounding_mode rdm)
|
||||
{
|
||||
nir_alu_type dest_type = nir_intrinsic_dest_type(instr);
|
||||
|
||||
if (dest_type == nir_type_float32 &&
|
||||
nir_has_any_rounding_mode_enabled(b->shader->info.float_controls_execution_mode))
|
||||
return false;
|
||||
|
||||
bool is_f32_to_f16 = dest_type == nir_type_float32;
|
||||
bool is_i32_to_i16 = dest_type == nir_type_int32 || dest_type == nir_type_uint32;
|
||||
|
||||
nir_foreach_use(use, &instr->dest.ssa) {
|
||||
is_f32_to_f16 &= is_f32_to_f16_conversion(use->parent_instr);
|
||||
is_i32_to_i16 &= is_i32_to_i16_conversion(use->parent_instr);
|
||||
nir_rounding_mode src_rdm =
|
||||
nir_get_rounding_mode_from_float_controls(exec_mode, nir_type_float16);
|
||||
bool allow_standard = (src_rdm == rdm || src_rdm == nir_rounding_mode_undef);
|
||||
bool allow_rtz = rdm == nir_rounding_mode_rtz;
|
||||
bool allow_rtne = rdm == nir_rounding_mode_rtne;
|
||||
|
||||
nir_foreach_use(use, ssa) {
|
||||
nir_instr *instr = use->parent_instr;
|
||||
is_f32_to_f16 &= (allow_standard && is_f32_to_f16_conversion(instr)) ||
|
||||
(allow_rtz && is_n_to_m_conversion(instr, 32, nir_op_f2f16_rtz)) ||
|
||||
(allow_rtne && is_n_to_m_conversion(instr, 32, nir_op_f2f16_rtne));
|
||||
is_i32_to_i16 &= is_i32_to_i16_conversion(instr);
|
||||
}
|
||||
|
||||
if (!is_f32_to_f16 && !is_i32_to_i16)
|
||||
return false;
|
||||
|
||||
/* All uses are the same conversions. Replace them with mov. */
|
||||
nir_foreach_use(use, &instr->dest.ssa) {
|
||||
nir_foreach_use(use, ssa) {
|
||||
nir_alu_instr *conv = nir_instr_as_alu(use->parent_instr);
|
||||
conv->op = nir_op_mov;
|
||||
}
|
||||
|
||||
instr->dest.ssa.bit_size = 16;
|
||||
ssa->bit_size = 16;
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
fold_16bit_load_data(nir_builder *b, nir_intrinsic_instr *instr,
|
||||
unsigned exec_mode, nir_rounding_mode rdm)
|
||||
{
|
||||
nir_alu_type dest_type = nir_intrinsic_dest_type(instr);
|
||||
|
||||
if (!fold_16bit_destination(&instr->dest.ssa, dest_type, exec_mode, rdm))
|
||||
return false;
|
||||
|
||||
nir_intrinsic_set_dest_type(instr, (dest_type & ~32) | 16);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
fold_16bit_image_load_store(nir_builder *b, nir_instr *instr, UNUSED void *unused)
|
||||
fold_16bit_tex_dest(nir_tex_instr *tex, unsigned exec_mode,
|
||||
nir_rounding_mode rdm)
|
||||
{
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
/* Skip sparse residency */
|
||||
if (tex->is_sparse)
|
||||
return false;
|
||||
|
||||
nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
|
||||
if (tex->op != nir_texop_tex &&
|
||||
tex->op != nir_texop_txb &&
|
||||
tex->op != nir_texop_txd &&
|
||||
tex->op != nir_texop_txl &&
|
||||
tex->op != nir_texop_txf &&
|
||||
tex->op != nir_texop_txf_ms &&
|
||||
tex->op != nir_texop_tg4 &&
|
||||
tex->op != nir_texop_tex_prefetch &&
|
||||
tex->op != nir_texop_fragment_fetch_amd)
|
||||
return false;
|
||||
|
||||
if (!fold_16bit_destination(&tex->dest.ssa, tex->dest_type, exec_mode, rdm))
|
||||
return false;
|
||||
|
||||
tex->dest_type = (tex->dest_type & ~32) | 16;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
fold_16bit_tex_srcs(nir_builder *b, nir_tex_instr *tex,
|
||||
struct nir_fold_tex_srcs_options *options)
|
||||
{
|
||||
if (tex->op != nir_texop_tex &&
|
||||
tex->op != nir_texop_txb &&
|
||||
tex->op != nir_texop_txd &&
|
||||
tex->op != nir_texop_txl &&
|
||||
tex->op != nir_texop_txf &&
|
||||
tex->op != nir_texop_txf_ms &&
|
||||
tex->op != nir_texop_tg4 &&
|
||||
tex->op != nir_texop_tex_prefetch &&
|
||||
tex->op != nir_texop_fragment_fetch_amd &&
|
||||
tex->op != nir_texop_fragment_mask_fetch_amd)
|
||||
return false;
|
||||
|
||||
if (!(options->sampler_dims & BITFIELD_BIT(tex->sampler_dim)))
|
||||
return false;
|
||||
|
||||
bool changed = false;
|
||||
for (unsigned i = 0; i < tex->num_srcs; i++) {
|
||||
/* Filter out sources that should be ignored. */
|
||||
if (!(BITFIELD_BIT(tex->src[i].src_type) & options->src_types))
|
||||
continue;
|
||||
|
||||
nir_src *src = &tex->src[i].src;
|
||||
|
||||
nir_alu_type src_type = nir_tex_instr_src_type(tex, i) | src->ssa->bit_size;
|
||||
|
||||
/* Zero-extension (u16) and sign-extension (i16) have
|
||||
* the same behavior here - txf returns 0 if bit 15 is set
|
||||
* because it's out of bounds and the higher bits don't
|
||||
* matter.
|
||||
*/
|
||||
if (!can_fold_16bit_src(src->ssa, src_type, false))
|
||||
continue;
|
||||
|
||||
fold_16bit_src(b, &tex->instr, src, src_type);
|
||||
changed = true;
|
||||
}
|
||||
|
||||
return changed;
|
||||
}
|
||||
|
||||
static bool
|
||||
fold_16bit_tex_image(nir_builder *b, nir_instr *instr, void *params)
|
||||
{
|
||||
struct nir_fold_16bit_tex_image_options *options = params;
|
||||
unsigned exec_mode = b->shader->info.float_controls_execution_mode;
|
||||
bool progress = false;
|
||||
|
||||
switch (intrinsic->intrinsic) {
|
||||
case nir_intrinsic_bindless_image_store:
|
||||
case nir_intrinsic_image_deref_store:
|
||||
case nir_intrinsic_image_store:
|
||||
progress |= fold_16bit_store_data(b, intrinsic);
|
||||
break;
|
||||
case nir_intrinsic_bindless_image_load:
|
||||
case nir_intrinsic_image_deref_load:
|
||||
case nir_intrinsic_image_load:
|
||||
progress |= fold_16bit_load_data(b, intrinsic);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
if (instr->type == nir_instr_type_intrinsic) {
|
||||
nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
|
||||
|
||||
switch (intrinsic->intrinsic) {
|
||||
case nir_intrinsic_bindless_image_store:
|
||||
case nir_intrinsic_image_deref_store:
|
||||
case nir_intrinsic_image_store:
|
||||
if (options->fold_image_load_store_data)
|
||||
progress |= fold_16bit_store_data(b, intrinsic);
|
||||
break;
|
||||
case nir_intrinsic_bindless_image_load:
|
||||
case nir_intrinsic_image_deref_load:
|
||||
case nir_intrinsic_image_load:
|
||||
if (options->fold_image_load_store_data)
|
||||
progress |= fold_16bit_load_data(b, intrinsic, exec_mode, options->rounding_mode);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
} else if (instr->type == nir_instr_type_tex) {
|
||||
nir_tex_instr *tex = nir_instr_as_tex(instr);
|
||||
|
||||
if (options->fold_tex_dest)
|
||||
progress |= fold_16bit_tex_dest(tex, exec_mode, options->rounding_mode);
|
||||
|
||||
for (unsigned i = 0; i < options->fold_srcs_options_count; i++) {
|
||||
progress |= fold_16bit_tex_srcs(b, tex, &options->fold_srcs_options[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool
|
||||
nir_fold_16bit_image_load_store_conversions(nir_shader *nir)
|
||||
bool nir_fold_16bit_tex_image(nir_shader *nir,
|
||||
struct nir_fold_16bit_tex_image_options *options)
|
||||
{
|
||||
return nir_shader_instructions_pass(nir,
|
||||
fold_16bit_image_load_store,
|
||||
fold_16bit_tex_image,
|
||||
nir_metadata_block_index | nir_metadata_dominance,
|
||||
NULL);
|
||||
options);
|
||||
}
|
||||
|
|
|
@ -771,20 +771,27 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
|
|||
* coordinates that had been upconverted to 32-bits just for the
|
||||
* sampler to just be 16-bit texture sources.
|
||||
*/
|
||||
OPT(s, nir_fold_16bit_sampler_conversions,
|
||||
(1 << nir_tex_src_coord) |
|
||||
(1 << nir_tex_src_lod) |
|
||||
(1 << nir_tex_src_bias) |
|
||||
(1 << nir_tex_src_comparator) |
|
||||
(1 << nir_tex_src_min_lod) |
|
||||
(1 << nir_tex_src_ms_index) |
|
||||
(1 << nir_tex_src_ddx) |
|
||||
(1 << nir_tex_src_ddy),
|
||||
~0);
|
||||
struct nir_fold_tex_srcs_options fold_srcs_options = {
|
||||
.sampler_dims = ~0,
|
||||
.src_types = (1 << nir_tex_src_coord) |
|
||||
(1 << nir_tex_src_lod) |
|
||||
(1 << nir_tex_src_bias) |
|
||||
(1 << nir_tex_src_comparator) |
|
||||
(1 << nir_tex_src_min_lod) |
|
||||
(1 << nir_tex_src_ms_index) |
|
||||
(1 << nir_tex_src_ddx) |
|
||||
(1 << nir_tex_src_ddy),
|
||||
};
|
||||
struct nir_fold_16bit_tex_image_options fold_16bit_options = {
|
||||
.rounding_mode = nir_rounding_mode_rtz,
|
||||
.fold_tex_dest = true,
|
||||
/* blob dumps have no half regs on pixel 2's ldib or stib, so only enable for a6xx+. */
|
||||
.fold_image_load_store_data = so->compiler->gen >= 6,
|
||||
.fold_srcs_options_count = 1,
|
||||
.fold_srcs_options = &fold_srcs_options,
|
||||
};
|
||||
OPT(s, nir_fold_16bit_tex_image, &fold_16bit_options);
|
||||
|
||||
/* blob dumps have no half regs on pixel 2's ldib or stib, so only enable for a6xx+. */
|
||||
if (so->compiler->gen >= 6)
|
||||
OPT(s, nir_fold_16bit_image_load_store_conversions);
|
||||
|
||||
/* Now that we stripped off the 16-bit conversions, legalize so that we
|
||||
* don't have a mix of 16- and 32-bit args that will need to be
|
||||
|
|
|
@ -186,15 +186,21 @@ static void si_late_optimize_16bit_samplers(struct si_screen *sscreen, nir_shade
|
|||
};
|
||||
bool changed = false;
|
||||
|
||||
uint32_t sampler_dims = UINT32_MAX;
|
||||
/* Skip because AMD doesn't support 16-bit types with these. */
|
||||
sampler_dims &= ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE);
|
||||
NIR_PASS(changed, nir, nir_fold_16bit_sampler_conversions,
|
||||
(1 << nir_tex_src_coord) |
|
||||
(has_g16 ? 1 << nir_tex_src_ddx : 0),
|
||||
sampler_dims);
|
||||
struct nir_fold_tex_srcs_options fold_srcs_options = {
|
||||
.sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
|
||||
.src_types = (1 << nir_tex_src_coord) |
|
||||
(has_g16 ? 1 << nir_tex_src_ddx : 0),
|
||||
};
|
||||
struct nir_fold_16bit_tex_image_options fold_16bit_options = {
|
||||
.rounding_mode = nir_rounding_mode_rtne,
|
||||
.fold_tex_dest = true,
|
||||
.fold_image_load_store_data = true,
|
||||
.fold_srcs_options_count = 1,
|
||||
.fold_srcs_options = &fold_srcs_options,
|
||||
};
|
||||
NIR_PASS(changed, nir, nir_fold_16bit_tex_image, &fold_16bit_options);
|
||||
|
||||
NIR_PASS(changed, nir, nir_legalize_16bit_sampler_srcs, tex_constraints);
|
||||
NIR_PASS(changed, nir, nir_fold_16bit_image_load_store_conversions);
|
||||
|
||||
if (changed) {
|
||||
si_nir_opts(sscreen, nir, false);
|
||||
|
|
|
@ -1078,8 +1078,11 @@ lvp_shader_compile_to_ir(struct lvp_pipeline *pipeline,
|
|||
|
||||
// TODO: also optimize the tex srcs. see radeonSI for reference */
|
||||
/* Skip if there are potentially conflicting rounding modes */
|
||||
if (!nir_has_any_rounding_mode_enabled(nir->info.float_controls_execution_mode))
|
||||
NIR_PASS_V(nir, nir_fold_16bit_sampler_conversions, 0, UINT32_MAX);
|
||||
struct nir_fold_16bit_tex_image_options fold_16bit_options = {
|
||||
.rounding_mode = nir_rounding_mode_undef,
|
||||
.fold_tex_dest = true,
|
||||
};
|
||||
NIR_PASS_V(nir, nir_fold_16bit_tex_image, &fold_16bit_options);
|
||||
|
||||
lvp_shader_optimize(nir);
|
||||
|
||||
|
|
Loading…
Reference in New Issue