radeonsi: move smoothing to the main shader part to remove 1 live VGPR

The samplemask VGPR that we had to pass to the epilog increased VGPR usage by 1 for all shaders. Do the smoothing in the main shader part instead, by using the mono key structure. This causes on-demand compilation and a stall, but we'll save the VGPR.

57794 shaders in 35145 tests
Totals:
SGPRS: 2715856 -> 2716272 (0.02 %)
VGPRS: 1776168 -> 1718432 (-3.25 %)
Spilled SGPRs: 3704 -> 3630 (-2.00 %)
Spilled VGPRs: 1727 -> 1733 (0.35 %)
Private memory VGPRs: 256 -> 256 (0.00 %)
Scratch size: 2008 -> 2016 (0.40 %) dwords per thread
Code Size: 61429584 -> 61393288 (-0.06 %) bytes
Max Waves: 838645 -> 840484 (0.22 %)

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14266>
2021-12-13 14:36:37 -05:00 · 2021-12-13 14:36:37 -05:00 · 198ad7e4dc
parent 12b942bd16
commit 198ad7e4dc
6 changed files with 26 additions and 48 deletions
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -701,8 +701,6 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
                    shader->selector->info.writes_z + shader->selector->info.writes_stencil +
                    shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */;

-      num_returns = MAX2(num_returns, num_return_sgprs + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
-
      for (i = 0; i < num_return_sgprs; i++)
         ac_add_return(&ctx->args, AC_ARG_SGPR);
      for (; i < num_returns; i++)
@@ -1249,9 +1247,8 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f)
      fprintf(f, "  epilog.last_cbuf = %u\n", key->ps.part.epilog.last_cbuf);
      fprintf(f, "  epilog.alpha_func = %u\n", key->ps.part.epilog.alpha_func);
      fprintf(f, "  epilog.alpha_to_one = %u\n", key->ps.part.epilog.alpha_to_one);
-      fprintf(f, "  epilog.poly_line_smoothing = %u\n",
-              key->ps.part.epilog.poly_line_smoothing);
      fprintf(f, "  epilog.clamp_color = %u\n", key->ps.part.epilog.clamp_color);
+      fprintf(f, "  mono.poly_line_smoothing = %u\n", key->ps.mono.poly_line_smoothing);
      fprintf(f, "  mono.interpolate_at_sample_force_center = %u\n",
              key->ps.mono.interpolate_at_sample_force_center);
      fprintf(f, "  mono.fbfetch_msaa = %u\n", key->ps.mono.fbfetch_msaa);
@@ -1986,12 +1983,6 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_
      assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
   }

-   /* The sample mask input is always enabled, because the API shader always
-    * passes it through to the epilog. Disable it here if it's unused.
-    */
-   if (!shader->key.ps.part.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask)
-      shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
-
   return true;
 }

--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -578,7 +578,6 @@ struct si_ps_epilog_bits {
   unsigned last_cbuf : 3;
   unsigned alpha_func : 3;
   unsigned alpha_to_one : 1;
-   unsigned poly_line_smoothing : 1;
   unsigned clamp_color : 1;
 };

@@ -708,6 +707,7 @@ struct si_shader_key_ps {

   /* Flags for monolithic compilation only. */
   struct {
+      unsigned poly_line_smoothing : 1;
      unsigned interpolate_at_sample_force_center : 1;
      unsigned fbfetch_msaa : 1;
      unsigned fbfetch_is_1D : 1;
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -30,11 +30,6 @@

 struct pipe_debug_callback;

-/* Ideally pass the sample mask input to the PS epilog as v14, which
- * is its usual location, so that the shader doesn't have to add v_mov.
- */
-#define PS_EPILOG_SAMPLEMASK_MIN_LOC 14
-
 struct si_shader_output_values {
   LLVMValueRef values[4];
   ubyte vertex_stream[4];
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c
@@ -221,23 +221,17 @@ static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha)
   }
 }

-static LLVMValueRef si_scale_alpha_by_sample_mask(struct si_shader_context *ctx, LLVMValueRef alpha,
-                                                  unsigned samplemask_param)
+static LLVMValueRef si_get_coverage_from_sample_mask(struct si_shader_context *ctx)
 {
   LLVMValueRef coverage;

   /* alpha = alpha * popcount(coverage) / SI_NUM_SMOOTH_AA_SAMPLES */
-   coverage = LLVMGetParam(ctx->main_fn, samplemask_param);
+   coverage = LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE);
   coverage = ac_build_bit_count(&ctx->ac, ac_to_integer(&ctx->ac, coverage));
   coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage, ctx->ac.f32, "");

-   coverage = LLVMBuildFMul(ctx->ac.builder, coverage,
-                            LLVMConstReal(ctx->ac.f32, 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
-
-   if (LLVMTypeOf(alpha) == ctx->ac.f16)
-      coverage = LLVMBuildFPTrunc(ctx->ac.builder, coverage, ctx->ac.f16, "");
-
-   return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, "");
+   return LLVMBuildFMul(ctx->ac.builder, coverage,
+                        LLVMConstReal(ctx->ac.f32, 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
 }

 struct si_ps_exports {
@@ -407,8 +401,8 @@ static bool si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValue
 }

 static void si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *color, unsigned index,
-                                unsigned first_color_export, unsigned samplemask_param,
-                                unsigned color_type, struct si_ps_exports *exp)
+                                unsigned first_color_export, unsigned color_type,
+                                struct si_ps_exports *exp)
 {
   int i;

@@ -425,10 +419,6 @@ static void si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *col
   if (index == 0 && ctx->shader->key.ps.part.epilog.alpha_func != PIPE_FUNC_ALWAYS)
      si_alpha_test(ctx, color[3]);

-   /* Line & polygon smoothing */
-   if (ctx->shader->key.ps.part.epilog.poly_line_smoothing)
-      color[3] = si_scale_alpha_by_sample_mask(ctx, color[3], samplemask_param);
-
   /* If last_cbuf > 0, FS_COLOR0_WRITES_ALL_CBUFS is true. */
   if (ctx->shader->key.ps.part.epilog.last_cbuf > 0) {
      assert(exp->num == first_color_export);
@@ -470,7 +460,7 @@ static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi)
   struct si_shader *shader = ctx->shader;
   struct si_shader_info *info = &shader->selector->info;
   LLVMBuilderRef builder = ctx->ac.builder;
-   unsigned i, j, first_vgpr, vgpr;
+   unsigned i, j, vgpr;
   LLVMValueRef *addrs = abi->outputs;

   LLVMValueRef color[8][4] = {};
@@ -507,6 +497,10 @@ static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi)
      }
   }

+   LLVMValueRef smoothing_coverage = NULL;
+   if (ctx->shader->key.ps.mono.poly_line_smoothing)
+      smoothing_coverage = si_get_coverage_from_sample_mask(ctx);
+
   /* Fill the return structure. */
   ret = ctx->return_value;

@@ -516,12 +510,17 @@ static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi)
      SI_SGPR_ALPHA_REF, "");

   /* Set VGPRs */
-   first_vgpr = vgpr = SI_SGPR_ALPHA_REF + 1;
+   vgpr = SI_SGPR_ALPHA_REF + 1;
   for (i = 0; i < ARRAY_SIZE(color); i++) {
      if (!color[i][0])
         continue;

      if (LLVMTypeOf(color[i][0]) == ctx->ac.f16) {
+         if (smoothing_coverage) {
+            color[i][3] = LLVMBuildFMul(builder, color[i][3],
+                  LLVMBuildFPTrunc(builder, smoothing_coverage, ctx->ac.f16, ""), "");
+         }
+
         for (j = 0; j < 2; j++) {
            LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, &color[i][j * 2], 2);
            tmp = LLVMBuildBitCast(builder, tmp, ctx->ac.f32, "");
@@ -529,6 +528,9 @@ static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi)
         }
         vgpr += 2;
      } else {
+         if (smoothing_coverage)
+            color[i][3] = LLVMBuildFMul(builder, color[i][3], smoothing_coverage, "");
+
         for (j = 0; j < 4; j++)
            ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
      }
@@ -540,12 +542,6 @@ static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi)
   if (samplemask)
      ret = LLVMBuildInsertValue(builder, ret, samplemask, vgpr++, "");

-   /* Add the input sample mask for smoothing at the end. */
-   if (vgpr < first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC)
-      vgpr = first_vgpr + PS_EPILOG_SAMPLEMASK_MIN_LOC;
-   ret = LLVMBuildInsertValue(builder, ret, LLVMGetParam(ctx->main_fn, SI_PARAM_SAMPLE_COVERAGE),
-                              vgpr++, "");
-
   ctx->return_value = ret;
 }

@@ -860,9 +856,6 @@ void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part
      ctx->args.num_sgprs_used + util_bitcount(key->ps_epilog.colors_written) * 4 +
      key->ps_epilog.writes_z + key->ps_epilog.writes_stencil + key->ps_epilog.writes_samplemask;

-   required_num_params =
-      MAX2(required_num_params, ctx->args.num_sgprs_used + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
-
   while (ctx->args.arg_count < required_num_params)
      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);

@@ -911,8 +904,7 @@ void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part
            color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
      }

-      si_export_mrt_color(ctx, color, output_index, first_color_export,
-                          ctx->args.arg_count - 1, color_type, &exp);
+      si_export_mrt_color(ctx, color, output_index, first_color_export, color_type, &exp);
   }

   if (exp.num) {
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -275,8 +275,8 @@ static bool si_update_shaders(struct si_context *sctx)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);

   if (sctx->smoothing_enabled !=
-       sctx->shader.ps.current->key.ps.part.epilog.poly_line_smoothing) {
-      sctx->smoothing_enabled = sctx->shader.ps.current->key.ps.part.epilog.poly_line_smoothing;
+       sctx->shader.ps.current->key.ps.mono.poly_line_smoothing) {
+      sctx->smoothing_enabled = sctx->shader.ps.current->key.ps.mono.poly_line_smoothing;
      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);

      /* NGG cull state uses smoothing_enabled. */
--- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp
@@ -2196,7 +2196,7 @@ static void si_ps_key_update_primtype_shader_rasterizer_framebuffer(struct si_co
   bool is_line = util_prim_is_lines(sctx->current_rast_prim);

   key->ps.part.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
-   key->ps.part.epilog.poly_line_smoothing =
+   key->ps.mono.poly_line_smoothing =
      ((is_poly && rs->poly_smooth) || (is_line && rs->line_smooth)) &&
      sctx->framebuffer.nr_samples <= 1;
 }