radeonsi: implement DCC MSAA 4x/8x fast clear using DCC equations on gfx9

MSAA 4x and 8x should only clear the first 2 samples because the other samples are uncompressed. The compute shader only clears that subset of DCC.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10003>
2021-04-04 16:58:29 -04:00 · 2021-04-04 16:58:29 -04:00 · 3120113ee7
parent 8b95f51ef1
commit 3120113ee7
6 changed files with 158 additions and 4 deletions
--- a/src/amd/common/ac_surface_dcc_address_test.c
+++ b/src/amd/common/ac_surface_dcc_address_test.c
@ -255,6 +255,13 @@ static bool one_dcc_address_test(const char *name, const char *test, ADDR_HANDLE
                  addr = gfx9_dcc_addr_from_coord(info, &dout, dout.metaBlkWidth, dout.metaBlkHeight,
                                                  dout.metaBlkDepth, dout.pitch, dout.height,
                                                  in.x, in.y, in.slice, in.sample, in.pipeXor);
+                  if (in.sample == 1) {
+                     /* Sample 0 should be one byte before sample 1. The DCC MSAA clear relies on it. */
+                     assert(addr - 1 ==
+                            gfx9_dcc_addr_from_coord(info, &dout, dout.metaBlkWidth, dout.metaBlkHeight,
+                                                     dout.metaBlkDepth, dout.pitch, dout.height,
+                                                     in.x, in.y, in.slice, 0, in.pipeXor));
+                  }
               } else {
                  addr = gfx10_dcc_addr_from_coord(info, dout.equation.gfx10_bits,
                                                   in.bpp, dout.metaBlkWidth, dout.metaBlkHeight,
--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c
@ -43,6 +43,7 @@ void si_init_buffer_clear(struct si_clear_info *info,
   info->size = size;
   info->clear_value = clear_value;
   info->writemask = 0xffffffff;
+   info->is_dcc_msaa = false;
 }

 static void si_init_buffer_clear_rmw(struct si_clear_info *info,
@ -75,6 +76,12 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,

   /* Execute clears. */
   for (unsigned i = 0; i < num_clears; i++) {
+      if (info[i].is_dcc_msaa) {
+         gfx9_clear_dcc_msaa(sctx, info[i].resource, info[i].clear_value,
+                             SI_OP_SKIP_CACHE_INV_BEFORE, SI_COHERENCY_CP);
+         continue;
+      }
+
      assert(info[i].size > 0);

      if (info[i].writemask != 0xffffffff) {
@ -328,10 +335,13 @@ bool vi_dcc_get_clear_info(struct si_context *sctx, struct si_texture *tex, unsi
      if (tex->buffer.b.b.last_level > 0)
         return false;

-      /* 4x and 8x MSAA needs a sophisticated compute shader for
-       * the clear. See AMDVLK. */
-      if (tex->buffer.b.b.nr_storage_samples >= 4)
-         return false;
+      /* 4x and 8x MSAA need to clear only sample 0 and 1 in a compute shader and leave other
+       * samples untouched. (only the first 2 samples are compressed) */
+      if (tex->buffer.b.b.nr_storage_samples >= 4) {
+         si_init_buffer_clear(out, dcc_buffer, 0, 0, clear_value);
+         out->is_dcc_msaa = true;
+         return true;
+      }

      clear_size = tex->surface.meta_size;
   } else {
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@ -725,6 +725,78 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
   pipe_resource_reference(&saved_sb.buffer, NULL);
 }

+void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uint32_t clear_value,
+                         unsigned flags, enum si_coherency coher)
+{ /* Clear the DCC of the texture "res" by dispatching the compute shader built by
+   * gfx9_create_clear_dcc_msaa_cs. The grid covers one invocation per DCC block.
+   *
+   * sctx:        context used for the dispatch; compute shader buffer slot 0 is borrowed
+   *              and rebound to its previous buffer before returning.
+   * res:         texture to clear; cast to struct si_texture.
+   * clear_value: value written to DCC; only the low 16 bits are passed to the shader.
+   * flags:       SI_OP_* flags. SI_OP_SKIP_CACHE_INV_BEFORE and SI_OP_SYNC_AFTER are tested
+   *              here; the whole value is also forwarded to si_launch_grid_internal.
+   * coher:       coherency mode used to pick the flush flags and the cache policy.
+   */
+   struct pipe_context *ctx = &sctx->b;
+   struct si_texture *tex = (struct si_texture*)res;
+
+   if (!(flags & SI_OP_SKIP_CACHE_INV_BEFORE)) /* the caller did not opt out of the flush */
+      sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+
+   /* Save states: the current compute shader and shader buffer slot 0. Slot 0 is rebound
+    * below for the clear and restored at the end of this function. */
+   void *saved_cs = sctx->cs_shader_state.program;
+   struct pipe_shader_buffer saved_sb = {};
+   si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);
+
+   unsigned saved_writable_mask = 0; /* bit 0 is set if slot 0 was bound as writable */
+   if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
+       (1u << si_get_shaderbuf_slot(0)))
+      saved_writable_mask |= 1 << 0;
+
+   /* Set the DCC buffer: bind the texture's buffer, starting at meta_offset and running to the
+    * end of the BO, as shader buffer 0. The asserts check that meta_offset is non-zero and that
+    * both the offset and the BO size fit in 32 bits. */
+   assert(tex->surface.meta_offset && tex->surface.meta_offset <= UINT_MAX);
+   assert(tex->buffer.bo_size <= UINT_MAX);
+
+   struct pipe_shader_buffer sb = {};
+   sb.buffer = &tex->buffer.b.b;
+   sb.buffer_offset = tex->surface.meta_offset;
+   sb.buffer_size = tex->buffer.bo_size - sb.buffer_offset;
+   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &sb, 0x1); /* 0x1: slot 0 writable */
+
+   sctx->cs_user_data[0] = (tex->surface.u.gfx9.color.dcc_pitch_max + 1) | /* low bits: pitch */
+                           (tex->surface.u.gfx9.color.dcc_height << 16);   /* bits 16+: height */
+   sctx->cs_user_data[1] = (clear_value & 0xffff) |                  /* low 16 bits: clear value */
+                           ((uint32_t)tex->surface.tile_swizzle << 16); /* bits 16+: swizzle */
+
+   /* These variables identify the shader variant. They index the per-context cache
+    * sctx->cs_clear_dcc_msaa; a variant is created on first use and reused afterwards. */
+   unsigned swizzle_mode = tex->surface.u.gfx9.swizzle_mode;
+   unsigned bpe_log2 = util_logbase2(tex->surface.bpe);
+   bool samples8 = tex->buffer.b.b.nr_storage_samples == 8;
+   bool is_array = tex->buffer.b.b.array_size > 1;
+   void **shader = &sctx->cs_clear_dcc_msaa[swizzle_mode][bpe_log2][samples8][is_array];
+
+   if (!*shader)
+      *shader = gfx9_create_clear_dcc_msaa_cs(sctx, tex);
+   ctx->bind_compute_state(ctx, *shader);
+
+   /* Dispatch compute. The dispatch is sized in DCC blocks: the texture width, height and
+    * array size divided by the DCC block dimensions, rounded up. */
+   unsigned width = DIV_ROUND_UP(tex->buffer.b.b.width0, tex->surface.u.gfx9.color.dcc_block_width);
+   unsigned height = DIV_ROUND_UP(tex->buffer.b.b.height0, tex->surface.u.gfx9.color.dcc_block_height);
+   unsigned depth = DIV_ROUND_UP(tex->buffer.b.b.array_size, tex->surface.u.gfx9.color.dcc_block_depth);
+
+   struct pipe_grid_info info = {};
+   info.block[0] = 8; /* 8x8x1 threads per workgroup */
+   info.block[1] = 8;
+   info.block[2] = 1;
+   info.last_block[0] = width % info.block[0]; /* remainder; 0 if width is a multiple of 8 */
+   info.last_block[1] = height % info.block[1]; /* remainder; 0 if height is a multiple of 8 */
+   info.grid[0] = DIV_ROUND_UP(width, info.block[0]);
+   info.grid[1] = DIV_ROUND_UP(height, info.block[1]);
+   info.grid[2] = depth;
+
+   si_launch_grid_internal(sctx, &info, saved_cs, flags); /* NOTE(review): presumably rebinds saved_cs */
+
+   enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, tex->surface.meta_size);
+
+   if (flags & SI_OP_SYNC_AFTER)
+      sctx->flags |= cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0; /* only for L2_BYPASS */
+
+   /* Restore states: rebind the saved buffer to slot 0 with its previous writable bit, then
+    * drop the reference held in saved_sb. */
+   ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
+   pipe_resource_reference(&saved_sb.buffer, NULL);
+}
+
 /* Expand FMASK to make it identity, so that image stores can ignore it. */
 void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex)
 {
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@ -268,6 +268,17 @@ static void si_destroy_context(struct pipe_context *context)
      }
   }

+   for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_clear_dcc_msaa); i++) {
+      for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i]); j++) {
+         for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i][j]); k++) {
+            for (unsigned l = 0; l < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i][j][k]); l++) {
+               if (sctx->cs_clear_dcc_msaa[i][j][k][l])
+                  sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_dcc_msaa[i][j][k][l]);
+            }
+         }
+      }
+   }
+
   if (sctx->blitter)
      util_blitter_destroy(sctx->blitter);

--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@ -1316,6 +1316,10 @@ struct si_context {
   bool thread_trace_enabled;

   unsigned context_flags;
+
+   /* Shaders. */
+   /* TODO: move other shaders here too */
+   void *cs_clear_dcc_msaa[32][5][2][2]; /* [swizzle_mode][log2(bpe)][samples == 8][is_array] */
 };

 /* si_blit.c */
@ -1368,6 +1372,7 @@ struct si_clear_info {
   uint32_t size;
   uint32_t clear_value;
   uint32_t writemask;
+   bool is_dcc_msaa; /* Clear it as a DCC MSAA image. */
 };

 enum pipe_format si_simplify_cb_format(enum pipe_format format);
@ -1423,6 +1428,8 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
                                    unsigned dsty, unsigned width, unsigned height,
                                    bool render_condition_enabled);
 void si_retile_dcc(struct si_context *sctx, struct si_texture *tex);
+void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uint32_t clear_value,
+                         unsigned flags, enum si_coherency coher);
 void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex);
 void si_init_compute_blit_functions(struct si_context *sctx);

@ -1539,6 +1546,7 @@ void si_resume_queries(struct si_context *sctx);

 /* si_shaderlib_nir.c */
 void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf);
+void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex);

 /* si_shaderlib_tgsi.c */
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
--- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
@ -100,3 +100,49 @@ void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf)

   return create_nir_cs(sctx, &b);
 }
+
+void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex)
+{ /* Build the NIR compute shader used for the DCC MSAA clear. Each invocation computes the DCC
+   * address of sample 0 for one DCC block and stores the 16-bit clear value at that address.
+   *
+   * sctx: context providing the compiler options and the chip info.
+   * tex:  texture whose bpe, DCC block dimensions, DCC equation and array_size are read while
+   *       building the shader, so the result is specific to those properties.
+   *
+   * Returns whatever create_nir_cs returns for the built shader.
+   */
+   const nir_shader_compiler_options *options =
+      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
+
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "clear_dcc_msaa");
+   b.shader->info.cs.local_size[0] = 8; /* 8x8x1 workgroup */
+   b.shader->info.cs.local_size[1] = 8;
+   b.shader->info.cs.local_size[2] = 1;
+   b.shader->info.cs.user_data_components_amd = 2; /* two user-data dwords, unpacked below */
+   b.shader->info.num_ssbos = 1;
+
+   /* Get user data SGPRs. Each of the two dwords is split by unpack_2x16 into two values:
+    * (dcc_pitch, dcc_height) and (clear_value, pipe_xor). The clear value is then converted to
+    * 16 bits because the store at the end writes a 16-bit value. */
+   nir_ssa_def *user_sgprs = nir_load_user_data_amd(&b);
+   nir_ssa_def *dcc_pitch, *dcc_height, *clear_value, *pipe_xor;
+   unpack_2x16(&b, nir_channel(&b, user_sgprs, 0), &dcc_pitch, &dcc_height);
+   unpack_2x16(&b, nir_channel(&b, user_sgprs, 1), &clear_value, &pipe_xor);
+   clear_value = nir_u2u16(&b, clear_value);
+
+   /* Get the 3D coordinates (x, y, slice); three components are requested, and channel 2 is
+    * only used as the slice for array textures. */
+   nir_ssa_def *coord = get_global_ids(&b, 3);
+   nir_ssa_def *zero = nir_imm_int(&b, 0);
+
+   /* Multiply the coordinates by the DCC block size (they are DCC block coordinates). */
+   coord = nir_imul(&b, coord,
+                    nir_channels(&b, nir_imm_ivec4(&b, tex->surface.u.gfx9.color.dcc_block_width,
+                                                   tex->surface.u.gfx9.color.dcc_block_height,
+                                                   tex->surface.u.gfx9.color.dcc_block_depth, 0), 0x7));
+
+   nir_ssa_def *offset =
+      ac_nir_dcc_addr_from_coord(&b, &sctx->screen->info, tex->surface.bpe,
+                                 &tex->surface.u.gfx9.color.dcc_equation,
+                                 dcc_pitch, dcc_height, zero, /* DCC slice size */
+                                 nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), /* x, y */
+                                 tex->buffer.b.b.array_size > 1 ? nir_channel(&b, coord, 2) : zero, /* z */
+                                 zero, pipe_xor); /* sample, pipe_xor */
+
+   /* The trick here is that DCC elements for an even and the next odd sample are next to each other
+    * in memory, so we only need to compute the address for sample 0 and the next DCC byte is always
+    * sample 1. That's why the clear value has 2 bytes - we're clearing 2 samples at the same time.
+    * The write mask 0x1 selects the single 16-bit component and align_mul declares 2-byte
+    * alignment for the store.
+    */
+   nir_store_ssbo(&b, clear_value, zero, offset, .write_mask=0x1, .align_mul=2);
+
+   return create_nir_cs(sctx, &b);
+}