nir: Rewrite and merge 16bit tex folding pass with 16bit image folding pass.

Allow folding constants/undef sources by sharing more code with the image_store 16bit folding pass. Allow more than one set of sources because RADV wants two, one for G16 (ddx/ddy) and one for A16 (all other sources). Allow folding cube sampling destination conversions on radeonsi/radv because I think the limitation only applies to sources. Signed-off-by: Georg Lehmann <dadschoorse@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16978>
2022-06-04 15:56:31 +02:00 · 2022-06-04 15:56:31 +02:00 · 87e3277b82
parent 06b33770b6
commit 87e3277b82
6 changed files with 263 additions and 257 deletions
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@ -4888,15 +4888,14 @@ radv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout
         }
         if (((stages[i].nir->info.bit_sizes_int | stages[i].nir->info.bit_sizes_float) & 16) &&
             device->physical_device->rad_info.gfx_level >= GFX9) {
-            uint32_t sampler_dims = UINT32_MAX;
-            /* Skip because AMD doesn't support 16-bit types with these. */
-            sampler_dims &= ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE);
            // TODO: also optimize the tex srcs. see radeonSI for reference */
-            /* Skip if there are potentially conflicting rounding modes */
-            if (!nir_has_any_rounding_mode_enabled(stages[i].nir->info.float_controls_execution_mode))
-               NIR_PASS(_, stages[i].nir, nir_fold_16bit_sampler_conversions, 0, sampler_dims);
+            struct nir_fold_16bit_tex_image_options fold_16bit_options = {
+               .rounding_mode = nir_rounding_mode_rtne,
+               .fold_tex_dest = true,
+               .fold_image_load_store_data = true,
+            };
+            NIR_PASS(_, stages[i].nir, nir_fold_16bit_tex_image, &fold_16bit_options);

-            NIR_PASS(_, stages[i].nir, nir_fold_16bit_image_load_store_conversions);
            NIR_PASS(_, stages[i].nir, nir_opt_vectorize, opt_vectorize_callback, device);
         }

--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@ -5339,9 +5339,22 @@ bool nir_lower_mediump_io(nir_shader *nir, nir_variable_mode modes,
 bool nir_force_mediump_io(nir_shader *nir, nir_variable_mode modes,
                          nir_alu_type types);
 bool nir_unpack_16bit_varying_slots(nir_shader *nir, nir_variable_mode modes);
-bool nir_fold_16bit_sampler_conversions(nir_shader *nir,
-                                        unsigned tex_src_types, uint32_t sampler_dims);
-bool nir_fold_16bit_image_load_store_conversions(nir_shader *nir);
+
+struct nir_fold_tex_srcs_options {
+   unsigned sampler_dims;
+   unsigned src_types;
+};
+
+struct nir_fold_16bit_tex_image_options {
+   nir_rounding_mode rounding_mode;
+   bool fold_tex_dest;
+   bool fold_image_load_store_data;
+   unsigned fold_srcs_options_count;
+   struct nir_fold_tex_srcs_options *fold_srcs_options;
+};
+
+bool nir_fold_16bit_tex_image(nir_shader *nir,
+                              struct nir_fold_16bit_tex_image_options *options);

 typedef struct {
   bool legalize_type;         /* whether this src should be legalized */
--- a/src/compiler/nir/nir_lower_mediump.c
+++ b/src/compiler/nir/nir_lower_mediump.c
@ -376,7 +376,6 @@ static bool
 is_f32_to_f16_conversion(nir_instr *instr)
 {
   return is_n_to_m_conversion(instr, 32, nir_op_f2f16) ||
-          is_n_to_m_conversion(instr, 32, nir_op_f2f16_rtne) ||
          is_n_to_m_conversion(instr, 32, nir_op_f2fmp);
 }

@ -396,156 +395,8 @@ static bool
 is_i32_to_i16_conversion(nir_instr *instr)
 {
   return is_n_to_m_conversion(instr, 32, nir_op_i2i16) ||
-      is_n_to_m_conversion(instr, 32, nir_op_u2u16);
-}
-
-static void
-replace_with_mov(nir_builder *b, nir_instr *instr, nir_src *src,
-                 nir_alu_instr *alu)
-{
-   nir_ssa_def *mov = nir_mov_alu(b, alu->src[0],
-                                  nir_dest_num_components(alu->dest.dest));
-   assert(!alu->dest.saturate);
-   nir_instr_rewrite_src_ssa(instr, src, mov);
-}
-
-/**
- * If texture source operands use f16->f32 conversions or return values are
- * followed by f16->f32 or f32->f16, remove those conversions. This benefits
- * drivers that have texture opcodes that can accept and return 16-bit types.
- *
- * "tex_src_types" is a mask of nir_tex_src_* operands that should be handled.
- * It's always done for the destination.
- *
- * This should be run after late algebraic optimizations.
- * Copy propagation and DCE should be run after this.
- */
-bool
-nir_fold_16bit_sampler_conversions(nir_shader *nir,
-                                   unsigned tex_src_types,
-                                   uint32_t sampler_dims)
-{
-   bool changed = false;
-   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
-   assert(impl);
-
-   nir_builder b;
-   nir_builder_init(&b, impl);
-
-   nir_foreach_block_safe (block, impl) {
-      nir_foreach_instr_safe (instr, block) {
-         if (instr->type != nir_instr_type_tex)
-            continue;
-
-         nir_tex_instr *tex = nir_instr_as_tex(instr);
-         nir_instr *src;
-         nir_alu_instr *src_alu;
-
-         /* Skip sparse residency */
-         if (tex->is_sparse)
-            continue;
-
-         if ((tex->op == nir_texop_txs ||
-              tex->op == nir_texop_query_levels) ||
-             !(sampler_dims & BITFIELD_BIT(tex->sampler_dim)))
-            continue;
-
-         /* Optimize source operands. */
-         for (unsigned i = 0; i < tex->num_srcs; i++) {
-            /* Filter out sources that should be ignored. */
-            if (!(BITFIELD_BIT(tex->src[i].src_type) & tex_src_types))
-               continue;
-
-            src = tex->src[i].src.ssa->parent_instr;
-            if (src->type != nir_instr_type_alu)
-               continue;
-
-            src_alu = nir_instr_as_alu(src);
-            b.cursor = nir_before_instr(src);
-
-            nir_alu_type src_type = nir_tex_instr_src_type(tex, i);
-
-            /* Handle vector sources that are made of scalar instructions. */
-            if (nir_op_is_vec(src_alu->op)) {
-               /* See if the vector is made of f16->f32 opcodes. */
-               unsigned num = nir_dest_num_components(src_alu->dest.dest);
-               bool is_f16_to_f32 = src_type == nir_type_float;
-               bool is_u16_to_u32 = src_type & (nir_type_int | nir_type_uint);
-
-               for (unsigned comp = 0; comp < num; comp++) {
-                  nir_instr *instr = src_alu->src[comp].src.ssa->parent_instr;
-                  is_f16_to_f32 &= is_f16_to_f32_conversion(instr);
-                  /* Zero-extension (u16) and sign-extension (i16) have
-                   * the same behavior here - txf returns 0 if bit 15 is set
-                   * because it's out of bounds and the higher bits don't
-                   * matter.
-                   */
-                  is_u16_to_u32 &= is_u16_to_u32_conversion(instr) ||
-                                   is_i16_to_i32_conversion(instr);
-               }
-
-               if (!is_f16_to_f32 && !is_u16_to_u32)
-                  continue;
-
-               nir_alu_instr *new_vec = nir_alu_instr_clone(nir, src_alu);
-               nir_instr_insert_after(&src_alu->instr, &new_vec->instr);
-
-               /* Replace conversions with mov. */
-               for (unsigned comp = 0; comp < num; comp++) {
-                  nir_instr *instr = new_vec->src[comp].src.ssa->parent_instr;
-                  replace_with_mov(&b, &new_vec->instr,
-                                   &new_vec->src[comp].src,
-                                   nir_instr_as_alu(instr));
-               }
-
-               new_vec->dest.dest.ssa.bit_size =
-                  new_vec->src[0].src.ssa->bit_size;
-               nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[i].src,
-                                         &new_vec->dest.dest.ssa);
-               changed = true;
-            } else if ((is_f16_to_f32_conversion(&src_alu->instr) &&
-                        src_type == nir_type_float) ||
-                       ((is_u16_to_u32_conversion(&src_alu->instr) ||
-                         is_i16_to_i32_conversion(&src_alu->instr)) &&
-                        src_type & (nir_type_int | nir_type_uint))) {
-               /* Handle scalar sources. */
-               replace_with_mov(&b, &tex->instr, &tex->src[i].src, src_alu);
-               changed = true;
-            }
-         }
-
-         /* Optimize the destination. */
-         bool is_f32_to_f16 = tex->dest_type & nir_type_float;
-         /* same behavior for int and uint */
-         bool is_i32_to_i16 = tex->dest_type & (nir_type_int | nir_type_uint);
-
-         nir_foreach_use(use, &tex->dest.ssa) {
-            is_f32_to_f16 &= is_f32_to_f16_conversion(use->parent_instr);
-            is_i32_to_i16 &= is_i32_to_i16_conversion(use->parent_instr);
-         }
-
-         if (is_f32_to_f16 || is_i32_to_i16) {
-            /* All uses are the same conversions. Replace them with mov. */
-            nir_foreach_use(use, &tex->dest.ssa) {
-               nir_alu_instr *conv = nir_instr_as_alu(use->parent_instr);
-               conv->op = nir_op_mov;
-               tex->dest.ssa.bit_size = conv->dest.dest.ssa.bit_size;
-               tex->dest_type = (tex->dest_type & (~16 & ~32 & ~64)) |
-                                conv->dest.dest.ssa.bit_size;
-            }
-            changed = true;
-         }
-      }
-   }
-
-   if (changed) {
-      nir_metadata_preserve(impl, nir_metadata_dominance |
-                                  nir_metadata_block_index);
-   } else {
-      nir_metadata_preserve(impl, nir_metadata_all);
-   }
-
-   return changed;
+          is_n_to_m_conversion(instr, 32, nir_op_u2u16) ||
+          is_n_to_m_conversion(instr, 32, nir_op_i2imp);
 }

 /**
@ -662,6 +513,73 @@ const_is_i16(nir_ssa_scalar scalar)
   return value == (int16_t) value;
 }

+static bool
+can_fold_16bit_src(nir_ssa_def *ssa, nir_alu_type src_type, bool sext_matters)
+{
+   bool fold_f16 = src_type == nir_type_float32;
+   bool fold_u16 = src_type == nir_type_uint32 && sext_matters;
+   bool fold_i16 = src_type == nir_type_int32 && sext_matters;
+   bool fold_i16_u16 = (src_type == nir_type_uint32 || src_type == nir_type_int32) && !sext_matters;
+
+   bool can_fold = fold_f16 || fold_u16 || fold_i16 || fold_i16_u16;
+   for (unsigned i = 0; can_fold && i < ssa->num_components; i++) {
+      nir_ssa_scalar comp = nir_ssa_scalar_resolved(ssa, i);
+      if (comp.def->parent_instr->type == nir_instr_type_ssa_undef)
+         continue;
+      else if (nir_ssa_scalar_is_const(comp)) {
+         if (fold_f16)
+            can_fold &= const_is_f16(comp);
+         else if (fold_u16)
+            can_fold &= const_is_u16(comp);
+         else if (fold_i16)
+            can_fold &= const_is_i16(comp);
+         else if (fold_i16_u16)
+            can_fold &= (const_is_u16(comp) || const_is_i16(comp));
+      } else {
+         if (fold_f16)
+            can_fold &= is_f16_to_f32_conversion(comp.def->parent_instr);
+         else if (fold_u16)
+            can_fold &= is_u16_to_u32_conversion(comp.def->parent_instr);
+         else if (fold_i16)
+            can_fold &= is_i16_to_i32_conversion(comp.def->parent_instr);
+         else if (fold_i16_u16)
+            can_fold &= (is_i16_to_i32_conversion(comp.def->parent_instr) ||
+                         is_u16_to_u32_conversion(comp.def->parent_instr));
+      }
+   }
+
+   return can_fold;
+}
+
+static void
+fold_16bit_src(nir_builder *b, nir_instr *instr, nir_src *src, nir_alu_type src_type)
+{
+   b->cursor = nir_before_instr(instr);
+
+   nir_ssa_scalar new_comps[NIR_MAX_VEC_COMPONENTS];
+   for (unsigned i = 0; i < src->ssa->num_components; i++) {
+      nir_ssa_scalar comp = nir_ssa_scalar_resolved(src->ssa, i);
+
+      if (comp.def->parent_instr->type == nir_instr_type_ssa_undef)
+         new_comps[i] = nir_get_ssa_scalar(nir_ssa_undef(b, 1, 16), 0);
+      else if (nir_ssa_scalar_is_const(comp)) {
+         nir_ssa_def *constant;
+         if (src_type == nir_type_float32)
+            constant = nir_imm_float16(b, nir_ssa_scalar_as_float(comp));
+         else
+            constant = nir_imm_intN_t(b, nir_ssa_scalar_as_uint(comp), 16);
+         new_comps[i] = nir_get_ssa_scalar(constant, 0);
+      } else {
+         /* conversion instruction */
+         new_comps[i] = nir_ssa_scalar_chase_alu_src(comp, 0);
+      }
+   }
+
+   nir_ssa_def *new_vec = nir_vec_scalars(b, new_comps, src->ssa->num_components);
+
+   nir_instr_rewrite_src_ssa(instr, src, new_vec);
+}
+
 static bool
 fold_16bit_store_data(nir_builder *b, nir_intrinsic_instr *instr)
 {
@ -670,49 +588,10 @@ fold_16bit_store_data(nir_builder *b, nir_intrinsic_instr *instr)

   b->cursor = nir_before_instr(&instr->instr);

-   bool fold_f16 = src_type == nir_type_float32;
-   bool fold_u16 = src_type == nir_type_uint32;
-   bool fold_i16 = src_type == nir_type_int32;
-
-   nir_ssa_scalar comps[NIR_MAX_VEC_COMPONENTS];
-   for (unsigned i = 0; i < instr->num_components; i++) {
-      comps[i] = nir_ssa_scalar_resolved(data_src->ssa, i);
-      if (comps[i].def->parent_instr->type == nir_instr_type_ssa_undef)
-         continue;
-      else if (nir_ssa_scalar_is_const(comps[i])) {
-         fold_f16 &= const_is_f16(comps[i]);
-         fold_u16 &= const_is_u16(comps[i]);
-         fold_i16 &= const_is_i16(comps[i]);
-      } else {
-         fold_f16 &= is_f16_to_f32_conversion(comps[i].def->parent_instr);
-         fold_u16 &= is_u16_to_u32_conversion(comps[i].def->parent_instr);
-         fold_i16 &= is_i16_to_i32_conversion(comps[i].def->parent_instr);
-      }
-   }
-
-   if (!fold_f16 && !fold_u16 && !fold_i16)
+   if (!can_fold_16bit_src(data_src->ssa, src_type, true))
      return false;

-   nir_ssa_scalar new_comps[NIR_MAX_VEC_COMPONENTS];
-   for (unsigned i = 0; i < instr->num_components; i++) {
-      if (comps[i].def->parent_instr->type == nir_instr_type_ssa_undef)
-         new_comps[i] = nir_get_ssa_scalar(nir_ssa_undef(b, 1, 16), 0);
-      else if (nir_ssa_scalar_is_const(comps[i])) {
-         nir_ssa_def *constant;
-         if (src_type == nir_type_float32)
-            constant = nir_imm_float16(b, nir_ssa_scalar_as_float(comps[i]));
-         else
-            constant = nir_imm_intN_t(b, nir_ssa_scalar_as_uint(comps[i]), 16);
-         new_comps[i] = nir_get_ssa_scalar(constant, 0);
-      } else {
-         /* conversion instruction */
-         new_comps[i] = nir_ssa_scalar_chase_alu_src(comps[i], 0);
-      }
-   }
-
-   nir_ssa_def *new_vec = nir_vec_scalars(b, new_comps, instr->num_components);
-
-   nir_instr_rewrite_src_ssa(&instr->instr, data_src, new_vec);
+   fold_16bit_src(b, &instr->instr, data_src, src_type);

   nir_intrinsic_set_src_type(instr, (src_type & ~32) | 16);

@ -720,70 +599,169 @@ fold_16bit_store_data(nir_builder *b, nir_intrinsic_instr *instr)
 }

 static bool
-fold_16bit_load_data(nir_builder *b, nir_intrinsic_instr *instr)
+fold_16bit_destination(nir_ssa_def *ssa, nir_alu_type dest_type,
+                       unsigned exec_mode, nir_rounding_mode rdm)
 {
-   nir_alu_type dest_type = nir_intrinsic_dest_type(instr);
-
-   if (dest_type == nir_type_float32 &&
-       nir_has_any_rounding_mode_enabled(b->shader->info.float_controls_execution_mode))
-      return false;
-
   bool is_f32_to_f16 = dest_type == nir_type_float32;
   bool is_i32_to_i16 = dest_type == nir_type_int32 || dest_type == nir_type_uint32;

-   nir_foreach_use(use, &instr->dest.ssa) {
-      is_f32_to_f16 &= is_f32_to_f16_conversion(use->parent_instr);
-      is_i32_to_i16 &= is_i32_to_i16_conversion(use->parent_instr);
+   nir_rounding_mode src_rdm =
+      nir_get_rounding_mode_from_float_controls(exec_mode, nir_type_float16);
+   bool allow_standard = (src_rdm == rdm || src_rdm == nir_rounding_mode_undef);
+   bool allow_rtz = rdm == nir_rounding_mode_rtz;
+   bool allow_rtne = rdm == nir_rounding_mode_rtne;
+
+   nir_foreach_use(use, ssa) {
+      nir_instr *instr = use->parent_instr;
+      is_f32_to_f16 &= (allow_standard && is_f32_to_f16_conversion(instr)) ||
+                       (allow_rtz && is_n_to_m_conversion(instr, 32, nir_op_f2f16_rtz)) ||
+                       (allow_rtne && is_n_to_m_conversion(instr, 32, nir_op_f2f16_rtne));
+      is_i32_to_i16 &= is_i32_to_i16_conversion(instr);
   }

   if (!is_f32_to_f16 && !is_i32_to_i16)
      return false;

   /* All uses are the same conversions. Replace them with mov. */
-   nir_foreach_use(use, &instr->dest.ssa) {
+   nir_foreach_use(use, ssa) {
      nir_alu_instr *conv = nir_instr_as_alu(use->parent_instr);
      conv->op = nir_op_mov;
   }

-   instr->dest.ssa.bit_size = 16;
+   ssa->bit_size = 16;
+   return true;
+}
+
+static bool
+fold_16bit_load_data(nir_builder *b, nir_intrinsic_instr *instr,
+                     unsigned exec_mode, nir_rounding_mode rdm)
+{
+   nir_alu_type dest_type = nir_intrinsic_dest_type(instr);
+
+   if (!fold_16bit_destination(&instr->dest.ssa, dest_type, exec_mode, rdm))
+      return false;
+
   nir_intrinsic_set_dest_type(instr, (dest_type & ~32) | 16);

   return true;
 }

 static bool
-fold_16bit_image_load_store(nir_builder *b, nir_instr *instr, UNUSED void *unused)
+fold_16bit_tex_dest(nir_tex_instr *tex, unsigned exec_mode,
+                    nir_rounding_mode rdm)
 {
-   if (instr->type != nir_instr_type_intrinsic)
+   /* Skip sparse residency */
+   if (tex->is_sparse)
      return false;

-   nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
+   if (tex->op != nir_texop_tex &&
+       tex->op != nir_texop_txb &&
+       tex->op != nir_texop_txd &&
+       tex->op != nir_texop_txl &&
+       tex->op != nir_texop_txf &&
+       tex->op != nir_texop_txf_ms &&
+       tex->op != nir_texop_tg4 &&
+       tex->op != nir_texop_tex_prefetch &&
+       tex->op != nir_texop_fragment_fetch_amd)
+      return false;

+   if (!fold_16bit_destination(&tex->dest.ssa, tex->dest_type, exec_mode, rdm))
+      return false;
+
+   tex->dest_type = (tex->dest_type & ~32) | 16;
+   return true;
+}
+
+
+static bool
+fold_16bit_tex_srcs(nir_builder *b, nir_tex_instr *tex,
+                    struct nir_fold_tex_srcs_options *options)
+{
+   if (tex->op != nir_texop_tex &&
+       tex->op != nir_texop_txb &&
+       tex->op != nir_texop_txd &&
+       tex->op != nir_texop_txl &&
+       tex->op != nir_texop_txf &&
+       tex->op != nir_texop_txf_ms &&
+       tex->op != nir_texop_tg4 &&
+       tex->op != nir_texop_tex_prefetch &&
+       tex->op != nir_texop_fragment_fetch_amd &&
+       tex->op != nir_texop_fragment_mask_fetch_amd)
+      return false;
+
+   if (!(options->sampler_dims & BITFIELD_BIT(tex->sampler_dim)))
+      return false;
+
+   bool changed = false;
+   for (unsigned i = 0; i < tex->num_srcs; i++) {
+      /* Filter out sources that should be ignored. */
+      if (!(BITFIELD_BIT(tex->src[i].src_type) & options->src_types))
+         continue;
+
+      nir_src *src = &tex->src[i].src;
+
+      nir_alu_type src_type = nir_tex_instr_src_type(tex, i) | src->ssa->bit_size;
+
+      /* Zero-extension (u16) and sign-extension (i16) have
+       * the same behavior here - txf returns 0 if bit 15 is set
+       * because it's out of bounds and the higher bits don't
+       * matter.
+       */
+      if (!can_fold_16bit_src(src->ssa, src_type, false))
+         continue;
+
+      fold_16bit_src(b, &tex->instr, src, src_type);
+      changed = true;
+   }
+
+   return changed;
+}
+
+static bool
+fold_16bit_tex_image(nir_builder *b, nir_instr *instr, void *params)
+{
+   struct nir_fold_16bit_tex_image_options *options = params;
+   unsigned exec_mode = b->shader->info.float_controls_execution_mode;
   bool progress = false;

-   switch (intrinsic->intrinsic) {
-   case nir_intrinsic_bindless_image_store:
-   case nir_intrinsic_image_deref_store:
-   case nir_intrinsic_image_store:
-      progress |= fold_16bit_store_data(b, intrinsic);
-      break;
-   case nir_intrinsic_bindless_image_load:
-   case nir_intrinsic_image_deref_load:
-   case nir_intrinsic_image_load:
-      progress |= fold_16bit_load_data(b, intrinsic);
-      break;
-   default:
-      break;
+   if (instr->type == nir_instr_type_intrinsic) {
+      nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
+
+      switch (intrinsic->intrinsic) {
+      case nir_intrinsic_bindless_image_store:
+      case nir_intrinsic_image_deref_store:
+      case nir_intrinsic_image_store:
+         if (options->fold_image_load_store_data)
+            progress |= fold_16bit_store_data(b, intrinsic);
+         break;
+      case nir_intrinsic_bindless_image_load:
+      case nir_intrinsic_image_deref_load:
+      case nir_intrinsic_image_load:
+         if (options->fold_image_load_store_data)
+            progress |= fold_16bit_load_data(b, intrinsic, exec_mode, options->rounding_mode);
+         break;
+      default:
+         break;
+      }
+   } else if (instr->type == nir_instr_type_tex) {
+      nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+      if (options->fold_tex_dest)
+         progress |= fold_16bit_tex_dest(tex, exec_mode, options->rounding_mode);
+
+      for (unsigned i = 0; i < options->fold_srcs_options_count; i++) {
+         progress |= fold_16bit_tex_srcs(b, tex, &options->fold_srcs_options[i]);
+      }
   }

   return progress;
 }

-bool
-nir_fold_16bit_image_load_store_conversions(nir_shader *nir)
+bool nir_fold_16bit_tex_image(nir_shader *nir,
+                              struct nir_fold_16bit_tex_image_options *options)
 {
   return nir_shader_instructions_pass(nir,
-                                       fold_16bit_image_load_store,
+                                       fold_16bit_tex_image,
                                       nir_metadata_block_index | nir_metadata_dominance,
-                                       NULL);
+                                       options);
 }
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@ -771,20 +771,27 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
          * coordinates that had been upconverted to 32-bits just for the
          * sampler to just be 16-bit texture sources.
          */
-         OPT(s, nir_fold_16bit_sampler_conversions,
-            (1 << nir_tex_src_coord) |
-            (1 << nir_tex_src_lod) |
-            (1 << nir_tex_src_bias) |
-            (1 << nir_tex_src_comparator) |
-            (1 << nir_tex_src_min_lod) |
-            (1 << nir_tex_src_ms_index) |
-            (1 << nir_tex_src_ddx) |
-            (1 << nir_tex_src_ddy),
-            ~0);
+         struct nir_fold_tex_srcs_options fold_srcs_options = {
+            .sampler_dims = ~0,
+            .src_types = (1 << nir_tex_src_coord) |
+                         (1 << nir_tex_src_lod) |
+                         (1 << nir_tex_src_bias) |
+                         (1 << nir_tex_src_comparator) |
+                         (1 << nir_tex_src_min_lod) |
+                         (1 << nir_tex_src_ms_index) |
+                         (1 << nir_tex_src_ddx) |
+                         (1 << nir_tex_src_ddy),
+         };
+         struct nir_fold_16bit_tex_image_options fold_16bit_options = {
+            .rounding_mode = nir_rounding_mode_rtz,
+            .fold_tex_dest = true,
+            /* blob dumps have no half regs on pixel 2's ldib or stib, so only enable for a6xx+. */
+            .fold_image_load_store_data = so->compiler->gen >= 6,
+            .fold_srcs_options_count = 1,
+            .fold_srcs_options = &fold_srcs_options,
+         };
+         OPT(s, nir_fold_16bit_tex_image, &fold_16bit_options);

-         /* blob dumps have no half regs on pixel 2's ldib or stib, so only enable for a6xx+. */
-         if (so->compiler->gen >= 6)
-            OPT(s, nir_fold_16bit_image_load_store_conversions);

         /* Now that we stripped off the 16-bit conversions, legalize so that we
          * don't have a mix of 16- and 32-bit args that will need to be
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@ -186,15 +186,21 @@ static void si_late_optimize_16bit_samplers(struct si_screen *sscreen, nir_shade
   };
   bool changed = false;

-   uint32_t sampler_dims = UINT32_MAX;
-   /* Skip because AMD doesn't support 16-bit types with these. */
-   sampler_dims &= ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE);
-   NIR_PASS(changed, nir, nir_fold_16bit_sampler_conversions,
-            (1 << nir_tex_src_coord) |
-            (has_g16 ? 1 << nir_tex_src_ddx : 0),
-            sampler_dims);
+   struct nir_fold_tex_srcs_options fold_srcs_options = {
+      .sampler_dims = ~BITFIELD_BIT(GLSL_SAMPLER_DIM_CUBE),
+      .src_types = (1 << nir_tex_src_coord) |
+                   (has_g16 ? 1 << nir_tex_src_ddx : 0),
+   };
+   struct nir_fold_16bit_tex_image_options fold_16bit_options = {
+      .rounding_mode = nir_rounding_mode_rtne,
+      .fold_tex_dest = true,
+      .fold_image_load_store_data = true,
+      .fold_srcs_options_count = 1,
+      .fold_srcs_options = &fold_srcs_options,
+   };
+   NIR_PASS(changed, nir, nir_fold_16bit_tex_image, &fold_16bit_options);
+
   NIR_PASS(changed, nir, nir_legalize_16bit_sampler_srcs, tex_constraints);
-   NIR_PASS(changed, nir, nir_fold_16bit_image_load_store_conversions);

   if (changed) {
      si_nir_opts(sscreen, nir, false);
--- a/src/gallium/frontends/lavapipe/lvp_pipeline.c
+++ b/src/gallium/frontends/lavapipe/lvp_pipeline.c
@ -1078,8 +1078,11 @@ lvp_shader_compile_to_ir(struct lvp_pipeline *pipeline,

   // TODO: also optimize the tex srcs. see radeonSI for reference */
   /* Skip if there are potentially conflicting rounding modes */
-   if (!nir_has_any_rounding_mode_enabled(nir->info.float_controls_execution_mode))
-      NIR_PASS_V(nir, nir_fold_16bit_sampler_conversions, 0, UINT32_MAX);
+   struct nir_fold_16bit_tex_image_options fold_16bit_options = {
+      .rounding_mode = nir_rounding_mode_undef,
+      .fold_tex_dest = true,
+   };
+   NIR_PASS_V(nir, nir_fold_16bit_tex_image, &fold_16bit_options);

   lvp_shader_optimize(nir);