radeonsi: merge the copy_image shader generators

Use a unified si_create_copy_image_cs() to generate both the 1D and 2D
copy shaders; the shaders themselves remain separate.

Also:
- add a helper deref_ssa() for NIR variable derefs
- add a helper set_work_size() for setting workgroup and grid sizes
- pass si_context (instead of pipe_context) to the copy_image generator

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15268>
Mihai Preda 2022-03-21 07:37:08 +02:00 committed by Marge Bot
parent c0ef40bbce
commit 08f74e7185
4 changed files with 85 additions and 117 deletions

src/gallium/drivers/radeonsi/si_compute_blit.c

@@ -475,6 +475,21 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
}
}
static void
set_work_size(struct pipe_grid_info *info, unsigned block_x, unsigned block_y, unsigned block_z,
unsigned work_x, unsigned work_y, unsigned work_z)
{
info->block[0] = block_x;
info->block[1] = block_y;
info->block[2] = block_z;
unsigned work[3] = {work_x, work_y, work_z};
for (int i = 0; i < 3; ++i) {
info->last_block[i] = work[i] % info->block[i];
info->grid[i] = DIV_ROUND_UP(work[i], info->block[i]);
}
}
void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
struct pipe_resource *src, unsigned src_level, unsigned dstx,
unsigned dsty, unsigned dstz, const struct pipe_box *src_box,
@@ -489,6 +504,7 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
enum pipe_format src_format = util_format_linear(src->format);
enum pipe_format dst_format = util_format_linear(dst->format);
bool is_linear = ssrc->surface.is_linear || sdst->surface.is_linear;
bool is_1D = dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY;
assert(util_format_is_subsampled_422(src_format) == util_format_is_subsampled_422(dst_format));
@@ -579,12 +595,6 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, 0, image);
if (!is_dcc_decompress) {
sctx->cs_user_data[0] = src_box->x | (dstx << 16);
sctx->cs_user_data[1] = src_box->y | (dsty << 16);
sctx->cs_user_data[2] = src_box->z | (dstz << 16);
}
struct pipe_grid_info info = {0};
if (is_dcc_decompress) {
@@ -593,64 +603,56 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
* the DCC block size or a multiple thereof. The shader uses a barrier
* between loads and stores to safely overwrite each DCC block of pixels.
*/
unsigned dim[3] = {src_box->width, src_box->height, src_box->depth};
assert(src == dst);
assert(dst->target != PIPE_TEXTURE_1D && dst->target != PIPE_TEXTURE_1D_ARRAY);
if (!sctx->cs_dcc_decompress)
sctx->cs_dcc_decompress = si_create_dcc_decompress_cs(ctx);
info.block[0] = ssrc->surface.u.gfx9.color.dcc_block_width;
info.block[1] = ssrc->surface.u.gfx9.color.dcc_block_height;
info.block[2] = ssrc->surface.u.gfx9.color.dcc_block_depth;
unsigned block_x = ssrc->surface.u.gfx9.color.dcc_block_width;
unsigned block_y = ssrc->surface.u.gfx9.color.dcc_block_height;
unsigned block_z = ssrc->surface.u.gfx9.color.dcc_block_depth;
unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL);
/* Make sure the block size is at least the same as wave size. */
while (info.block[0] * info.block[1] * info.block[2] < default_wave_size) {
info.block[0] *= 2;
while (block_x * block_y * block_z < default_wave_size) {
block_x *= 2;
}
for (unsigned i = 0; i < 3; i++) {
info.last_block[i] = dim[i] % info.block[i];
info.grid[i] = DIV_ROUND_UP(dim[i], info.block[i]);
}
set_work_size(&info, block_x, block_y, block_z, src_box->width, src_box->height, src_box->depth);
si_launch_grid_internal(sctx, &info, sctx->cs_dcc_decompress, flags | SI_OP_CS_IMAGE);
} else if (dst->target == PIPE_TEXTURE_1D_ARRAY && src->target == PIPE_TEXTURE_1D_ARRAY) {
if (!sctx->cs_copy_image_1d_array)
sctx->cs_copy_image_1d_array = si_create_copy_image_1d_array_cs(ctx);
info.block[0] = 64;
info.last_block[0] = width % 64;
info.block[1] = 1;
info.block[2] = 1;
info.grid[0] = DIV_ROUND_UP(width, 64);
info.grid[1] = depth;
info.grid[2] = 1;
si_launch_grid_internal(sctx, &info, sctx->cs_copy_image_1d_array, flags | SI_OP_CS_IMAGE);
} else {
if (!sctx->cs_copy_image)
sctx->cs_copy_image = si_create_copy_image_cs(ctx);
sctx->cs_user_data[0] = src_box->x | (dstx << 16);
/* This is better for access over PCIe. */
if (is_linear) {
info.block[0] = 64;
info.block[1] = 1;
int block_x = is_1D || is_linear ? 64 : 8;
int block_y = is_1D || is_linear ? 1 : 8;
int block_z = 1;
if (is_1D) {
assert(height == 1); /* height is not used for 1D images */
assert(src_box->y == 0 && dsty == 0);
sctx->cs_user_data[1] = src_box->z | (dstz << 16);
/* We pass array index in 'y' for 1D images. */
height = depth;
depth = 1;
} else {
info.block[0] = 8;
info.block[1] = 8;
sctx->cs_user_data[1] = src_box->y | (dsty << 16);
sctx->cs_user_data[2] = src_box->z | (dstz << 16);
}
info.last_block[0] = width % info.block[0];
info.last_block[1] = height % info.block[1];
info.block[2] = 1;
info.grid[0] = DIV_ROUND_UP(width, info.block[0]);
info.grid[1] = DIV_ROUND_UP(height, info.block[1]);
info.grid[2] = depth;
si_launch_grid_internal(sctx, &info, sctx->cs_copy_image, flags | SI_OP_CS_IMAGE);
set_work_size(&info, block_x, block_y, block_z, width, height, depth);
void **copy_image_cs_ptr = is_1D ? &sctx->cs_copy_image_1D : &sctx->cs_copy_image_2D;
if (!*copy_image_cs_ptr)
*copy_image_cs_ptr = si_create_copy_image_cs(sctx, is_1D);
assert(*copy_image_cs_ptr);
si_launch_grid_internal(sctx, &info, *copy_image_cs_ptr, flags | SI_OP_CS_IMAGE);
}
ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, 0, saved_image);
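
For reference, a standalone illustration (not part of the commit) of the grid math that set_work_size() performs above; the 8x8x1 block and the 100x30x1 region are made-up example values, and DIV_ROUND_UP is redefined locally so the snippet compiles on its own.

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
   /* Example values only: the 8x8x1 block used for tiled 2D copies,
    * applied to a 100x30x1 copy region. */
   unsigned block[3] = {8, 8, 1};
   unsigned work[3] = {100, 30, 1};

   for (int i = 0; i < 3; i++) {
      unsigned grid = DIV_ROUND_UP(work[i], block[i]); /* number of workgroups    */
      unsigned last = work[i] % block[i];              /* size of the last group  */
      printf("dim %d: grid=%u last_block=%u\n", i, grid, last);
   }
   /* Prints grid=13/4/1 and last_block=4/6/0: the trailing workgroup in x and y
    * is trimmed to the partial 4x6 tile so the shader stays inside the region. */
   return 0;
}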

src/gallium/drivers/radeonsi/si_pipe.c

@@ -258,10 +258,10 @@ static void si_destroy_context(struct pipe_context *context)
sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer_rmw);
if (sctx->cs_copy_buffer)
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
if (sctx->cs_copy_image)
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image);
if (sctx->cs_copy_image_1d_array)
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1d_array);
if (sctx->cs_copy_image_1D)
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_1D);
if (sctx->cs_copy_image_2D)
sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image_2D);
if (sctx->cs_clear_render_target)
sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_render_target);
if (sctx->cs_clear_render_target_1d_array)

src/gallium/drivers/radeonsi/si_pipe.h

@@ -970,8 +970,8 @@ struct si_context {
void *cs_clear_buffer;
void *cs_clear_buffer_rmw;
void *cs_copy_buffer;
void *cs_copy_image;
void *cs_copy_image_1d_array;
void *cs_copy_image_1D;
void *cs_copy_image_2D;
void *cs_clear_render_target;
void *cs_clear_render_target_1d_array;
void *cs_clear_12bytes_buffer;
@@ -1517,8 +1517,7 @@ void si_suspend_queries(struct si_context *sctx);
void si_resume_queries(struct si_context *sctx);
/* si_shaderlib_nir.c */
void *si_create_copy_image_cs(struct pipe_context *ctx);
void *si_create_copy_image_1d_array_cs(struct pipe_context *ctx);
void *si_create_copy_image_cs(struct si_context *sctx, bool is_1D);
void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf);
void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex);

src/gallium/drivers/radeonsi/si_shaderlib_nir.c

@@ -53,32 +53,46 @@ static void unpack_2x16(nir_builder *b, nir_ssa_def *src, nir_ssa_def **x, nir_s
*y = nir_ushr(b, src, nir_imm_int(b, 16));
}
/* Create a NIR compute shader implementing copy_image for 1D_ARRAY images.
*/
void *si_create_copy_image_1d_array_cs(struct pipe_context *ctx)
static nir_ssa_def *
deref_ssa(nir_builder *b, nir_variable *var)
{
struct si_context *sctx = (struct si_context *) ctx;
return &nir_build_deref_var(b, var)->dest.ssa;
}
/* Create a NIR compute shader implementing copy_image.
*
* This shader can handle 1D and 2D, linear and non-linear images.
* It expects the source and destination (x,y,z) coords as user_data_amd,
* packed into 3 SGPRs as 2x16bits per component.
*/
void *si_create_copy_image_cs(struct si_context *sctx, bool is_1D)
{
const nir_shader_compiler_options *options =
sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "copy_image_1d_array_cs");
b.shader->info.workgroup_size[0] = 64;
b.shader->info.workgroup_size[1] = 1;
b.shader->info.workgroup_size[2] = 1;
b.shader->info.cs.user_data_components_amd = 3;
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "copy_image_cs");
b.shader->info.num_images = 2;
/* The workgroup size is either 8x8 for normal (non-linear) 2D images,
* or 64x1 for 1D and linear-2D images.
*/
b.shader->info.workgroup_size_variable = true;
/* 1D uses 'x' as image coord, and 'y' as array index.
* 2D uses 'x'&'y' as image coords, and 'z' as array index.
*/
int n_components = is_1D ? 2 : 3;
b.shader->info.cs.user_data_components_amd = n_components;
nir_ssa_def *ids = get_global_ids(&b, n_components);
nir_ssa_def *coord_src = NULL, *coord_dst = NULL;
unpack_2x16(&b, nir_load_user_data_amd(&b), &coord_src, &coord_dst);
nir_ssa_def *ids = get_global_ids(&b, 3);
coord_src = nir_channels(&b, nir_iadd(&b, coord_src, ids), /*xz*/ 0x5);
coord_dst = nir_channels(&b, nir_iadd(&b, coord_dst, ids), /*xz*/ 0x5);
coord_src = nir_iadd(&b, coord_src, ids);
coord_dst = nir_iadd(&b, coord_dst, ids);
const struct glsl_type *img_type = glsl_image_type(GLSL_SAMPLER_DIM_1D, /*is_array*/ true, GLSL_TYPE_FLOAT);
const struct glsl_type *img_type = glsl_image_type(is_1D ? GLSL_SAMPLER_DIM_1D : GLSL_SAMPLER_DIM_2D,
/*is_array*/ true, GLSL_TYPE_FLOAT);
nir_variable *img_src = nir_variable_create(b.shader, nir_var_image, img_type, "img_src");
img_src->data.binding = 0;
@@ -90,56 +104,9 @@ void *si_create_copy_image_1d_array_cs(struct pipe_context *ctx)
nir_ssa_def *zero = nir_imm_int(&b, 0);
nir_ssa_def *data = nir_image_deref_load(&b, /*num_components*/ 4, /*bit_size*/ 32,
&nir_build_deref_var(&b, img_src)->dest.ssa, coord_src, undef32, zero);
deref_ssa(&b, img_src), coord_src, undef32, zero);
nir_image_deref_store(&b,
&nir_build_deref_var(&b, img_dst)->dest.ssa, coord_dst, undef32, data, zero);
return create_nir_cs(sctx, &b);
}
/* Create a NIR compute shader implementing copy_image.
*
* This is the NIR version of the removed si_create_copy_image_compute_shader() [TGSI].
* It inherits the following note from the TGSI version:
* "Luckily, this works with all texture targets except 1D_ARRAY."
*/
void *si_create_copy_image_cs(struct pipe_context *ctx)
{
struct si_context *sctx = (struct si_context *) ctx;
const nir_shader_compiler_options *options =
sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "copy_image_cs");
b.shader->info.workgroup_size_variable = true;
b.shader->info.cs.user_data_components_amd = 3;
b.shader->info.num_images = 2;
nir_ssa_def *coord_src = NULL, *coord_dst = NULL;
unpack_2x16(&b, nir_load_user_data_amd(&b), &coord_src, &coord_dst);
nir_ssa_def *ids = get_global_ids(&b, 3);
coord_src = nir_iadd(&b, coord_src, ids);
coord_dst = nir_iadd(&b, coord_dst, ids);
const struct glsl_type *img_type =
glsl_image_type(GLSL_SAMPLER_DIM_2D, /*is_array*/ true, GLSL_TYPE_FLOAT);
nir_variable *img_src = nir_variable_create(b.shader, nir_var_image, img_type, "img_src");
img_src->data.binding = 0;
nir_variable *img_dst = nir_variable_create(b.shader, nir_var_image, img_type, "img_dst");
img_dst->data.binding = 1;
nir_ssa_def *data = nir_image_deref_load(&b, /*num_components*/ 4, /*bit_size*/ 32,
&nir_build_deref_var(&b, img_src)->dest.ssa, coord_src, nir_ssa_undef(&b, 1, 32),
nir_imm_int(&b, 0), .image_dim = GLSL_SAMPLER_DIM_2D);
nir_image_deref_store(&b,
&nir_build_deref_var(&b, img_dst)->dest.ssa, coord_dst, nir_ssa_undef(&b, 1, 32), data,
nir_imm_int(&b, 0), .image_dim = GLSL_SAMPLER_DIM_2D);
nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, undef32, data, zero);
return create_nir_cs(sctx, &b);
}
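
As an aside (not part of the commit), the user_data_amd packing described in the comment above can be sketched in plain C; pack_src_dst()/unpack_src_dst() are hypothetical names used only for this illustration.

#include <assert.h>
#include <stdint.h>

/* Each 32-bit cs_user_data word carries the source coordinate in the low
 * 16 bits and the destination coordinate in the high 16 bits; the shader's
 * unpack_2x16() splits them apart again. */
static uint32_t pack_src_dst(uint16_t src, uint16_t dst)
{
   return (uint32_t)src | ((uint32_t)dst << 16);
}

static void unpack_src_dst(uint32_t packed, uint16_t *src, uint16_t *dst)
{
   *src = packed & 0xffff; /* low half: source coordinate       */
   *dst = packed >> 16;    /* high half: destination coordinate */
}

int main(void)
{
   uint16_t s, d;
   unpack_src_dst(pack_src_dst(35, 7), &s, &d);
   assert(s == 35 && d == 7);
   return 0;
}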