radeonsi: minor simplifications of clear/copy_buffer shaders

- always use L2_LRU (never ACCESS_NON_TEMPORAL) for better performance
- never use ACCESS_COHERENT because the address might not be aligned to
  a cache line
- assume the wave size is always 64
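
In terms of the SSBO access qualifiers the shaders emit, the change collapses
a computed value into a constant (schematic in C, mirroring the hunks below;
the "before" names are the ones this patch removes):

   /* Before: qualifiers were derived from the destination cache policy. */
   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
   if (dst_stream_cache_policy)
      store_qualifier |= ACCESS_NON_TEMPORAL;
   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;

   /* After: all accesses go through L2 as LRU; only the no-alias hint remains. */
   unsigned qualifier = ACCESS_RESTRICT;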

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29053>
Authored by Marek Olšák on 2024-04-26 19:44:11 -04:00; committed by Marge Bot
parent 81c90cded0
commit 92497d1c8f
4 changed files with 21 additions and 45 deletions


@@ -192,7 +192,7 @@ void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info,
                                    unsigned writeable_bitmask)
 {
    if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
-      sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+      sctx->flags |= si_get_flush_flags(sctx, coher, L2_LRU);
       si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);
    }
@@ -351,16 +351,14 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst,
    sb[0].buffer_offset = dst_offset;
    sb[0].buffer_size = size;
 
-   bool shader_dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;
-
    if (src) {
       sb[1].buffer = src;
       sb[1].buffer_offset = src_offset;
       sb[1].buffer_size = size;
 
       if (!sctx->cs_copy_buffer) {
-         sctx->cs_copy_buffer = si_create_dma_compute_shader(
-            sctx, SI_COMPUTE_COPY_DW_PER_THREAD, shader_dst_stream_policy, true);
+         sctx->cs_copy_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_COPY_DW_PER_THREAD,
+                                                             true);
       }
 
       si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher,
@@ -373,8 +371,8 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_resource *dst,
          sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
 
       if (!sctx->cs_clear_buffer) {
-         sctx->cs_clear_buffer = si_create_dma_compute_shader(
-            sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, shader_dst_stream_policy, false);
+         sctx->cs_clear_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD,
+                                                              false);
       }
 
       si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher,
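
Since SI_COMPUTE_DST_CACHE_POLICY was #defined as L2_LRU (see the header hunk
below), the first hunk above is pure constant folding. The two lazily created
shader variants now differ only in dwords-per-thread and clear vs. copy; a
sketch of the resulting call shape:

   /* No cache-policy parameter left in the constructor. */
   if (!sctx->cs_copy_buffer)
      sctx->cs_copy_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_COPY_DW_PER_THREAD, true);
   if (!sctx->cs_clear_buffer)
      sctx->cs_clear_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD, false);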


@@ -49,8 +49,6 @@ struct ac_llvm_compiler;
 /* Tunables for compute-based clear_buffer and copy_buffer: */
 #define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
 #define SI_COMPUTE_COPY_DW_PER_THREAD  4
-/* L2 LRU is recommended because the compute shader can finish sooner due to fewer L2 evictions. */
-#define SI_COMPUTE_DST_CACHE_POLICY    L2_LRU
 
 /* Pipeline & streamout query controls. */
 #define SI_CONTEXT_START_PIPELINE_STATS (1 << 0)
@@ -1729,7 +1727,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
                         unsigned num_layers);
 void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
-                                   bool dst_stream_cache_policy, bool is_copy);
+                                   bool is_copy);
 void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx);
 void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
 void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
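
With the wave size fixed at 64 and the tunables above unchanged, the sizing
math of the clear/copy shaders is easy to read off (a worked example, not
code from the patch):

   /* 4 dwords per thread, 4 bytes per dword, 64 threads per workgroup. */
   unsigned bytes_per_thread    = SI_COMPUTE_CLEAR_DW_PER_THREAD * 4; /* 16 bytes */
   unsigned bytes_per_workgroup = 64 * bytes_per_thread;              /* 1024 bytes */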


@@ -231,9 +231,7 @@ void *si_create_clear_buffer_rmw_cs(struct si_context *sctx)
    /* data |= clear_value_masked; */
    data = nir_ior(&b, data, nir_channel(&b, user_sgprs, 0));
 
-   nir_store_ssbo(&b, data, zero, address,
-                  .access = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ACCESS_NON_TEMPORAL : 0,
-                  .align_mul = 4);
+   nir_store_ssbo(&b, data, zero, address, .align_mul = 4);
 
    return create_shader_state(sctx, b.shader);
 }
@@ -645,25 +643,16 @@ void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
    nir_def *offset = nir_imul_imm(&b, get_global_ids(&b, 1), 12);
    nir_def *value = nir_trim_vector(&b, nir_load_user_data_amd(&b), 3);
 
-   nir_store_ssbo(&b, value, nir_imm_int(&b, 0), offset,
-                  .access = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU ? ACCESS_NON_TEMPORAL : 0);
+   nir_store_ssbo(&b, value, nir_imm_int(&b, 0), offset);
 
    return create_shader_state(sctx, b.shader);
 }
 
 void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
 {
-   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
-
-   /* Don't cache loads, because there is no reuse. */
-   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;
-
    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
                                                   "ubyte_to_ushort");
 
-   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
-
-   b.shader->info.workgroup_size[0] = default_wave_size;
+   b.shader->info.workgroup_size[0] = 64;
    b.shader->info.workgroup_size[1] = 1;
    b.shader->info.workgroup_size[2] = 1;
    b.shader->info.num_ssbos = 2;
@@ -672,32 +661,24 @@ void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
    nir_def *store_address = nir_imul_imm(&b, load_address, 2);
    nir_def *ubyte_value = nir_load_ssbo(&b, 1, 8, nir_imm_int(&b, 1),
-                                        load_address, .access = load_qualifier);
-   nir_store_ssbo(&b, nir_u2uN(&b, ubyte_value, 16), nir_imm_int(&b, 0),
-                  store_address, .access = store_qualifier);
+                                        load_address, .access = ACCESS_RESTRICT);
+   nir_store_ssbo(&b, nir_u2u16(&b, ubyte_value), nir_imm_int(&b, 0),
+                  store_address, .access = ACCESS_RESTRICT);
 
    return create_shader_state(sctx, b.shader);
 }
 
 /* Create a compute shader implementing clear_buffer or copy_buffer. */
 void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
-                                   bool dst_stream_cache_policy, bool is_copy)
+                                   bool is_copy)
 {
    assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
 
-   unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT;
-   if (dst_stream_cache_policy)
-      store_qualifier |= ACCESS_NON_TEMPORAL;
-
-   /* Don't cache loads, because there is no reuse. */
-   unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL;
-
    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
                                                   "create_dma_compute");
+   unsigned wg_size = 64;
 
-   unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
-
-   b.shader->info.workgroup_size[0] = default_wave_size;
+   b.shader->info.workgroup_size[0] = wg_size;
    b.shader->info.workgroup_size[1] = 1;
    b.shader->info.workgroup_size[2] = 1;
    b.shader->info.num_ssbos = 1;
@@ -717,7 +698,7 @@ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
     */
    nir_def *store_address =
       nir_iadd(&b, nir_imul_imm(&b, nir_channel(&b, nir_load_workgroup_id(&b), 0),
-                                default_wave_size * num_mem_ops),
+                                wg_size * num_mem_ops),
               nir_channel(&b, nir_load_local_invocation_id(&b), 0));
 
    /* Convert from a "store size unit" into bytes. */
@@ -741,19 +722,19 @@ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
       if (is_copy && i < num_mem_ops) {
          if (i) {
             load_address = nir_iadd(&b, load_address,
-                                    nir_imm_int(&b, 4 * inst_dwords[i] * default_wave_size));
+                                    nir_imm_int(&b, 4 * inst_dwords[i] * wg_size));
          }
          values[i] = nir_load_ssbo(&b, inst_dwords[i], 32, nir_imm_int(&b, 1), load_address,
-                                   .access = load_qualifier);
+                                   .access = ACCESS_RESTRICT);
       }
 
      if (d >= 0) {
         if (d) {
            store_address = nir_iadd(&b, store_address,
-                                    nir_imm_int(&b, 4 * inst_dwords[d] * default_wave_size));
+                                    nir_imm_int(&b, 4 * inst_dwords[d] * wg_size));
        }
        nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address,
-                      .access = store_qualifier);
+                      .access = ACCESS_RESTRICT);
    }
 }
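
The address math in the generated shader is unchanged apart from wg_size now
being the constant 64; a hypothetical scalar helper (not part of the patch,
and assuming a uniform inst_dwords across memory ops) that mirrors it:

   /* Byte offset of the first store issued by one lane. Each further memory
    * op then advances by 4 * inst_dwords * wg_size bytes, so the 64 lanes of
    * a wave always write one contiguous, coalesced block. */
   static unsigned first_store_offset_bytes(unsigned wg_id, unsigned lane,
                                            unsigned num_mem_ops, unsigned inst_dwords)
   {
      const unsigned wg_size = 64; /* fixed wave size after this change */
      unsigned store_unit = wg_id * wg_size * num_mem_ops + lane;
      return store_unit * 4 * inst_dwords; /* "store size unit" -> bytes */
   }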


@@ -120,8 +120,7 @@ void si_test_dma_perf(struct si_screen *sscreen)
             void *compute_shader = NULL;
             if (test_cs) {
-               compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread,
-                                                             cache_policy == L2_STREAM, is_copy);
+               compute_shader = si_create_dma_compute_shader(sctx, cs_dwords_per_thread, is_copy);
             }
 
             double score = 0;
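
For the DMA performance test this means the cache_policy axis no longer
selects a distinct compute shader; only (dwords per thread, clear vs. copy)
does. A hypothetical reduced creation loop (loop bounds are illustrative;
the real test iterates its own parameter table):

   for (unsigned is_copy = 0; is_copy <= 1; is_copy++) {
      /* Power-of-two dwords per thread, as required by the assert in
       * si_create_dma_compute_shader(). */
      for (unsigned dw = 1; dw <= 16; dw *= 2) {
         void *cs = si_create_dma_compute_shader(sctx, dw, is_copy);
         /* ... run timed dispatches with cs, then destroy it ... */
      }
   }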