radeonsi: remove redundant GS variables in si_shader_selector

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6624>
2020-09-06 11:23:13 -04:00 · 2020-09-06 11:23:13 -04:00 · 2b4fa68808
parent 7960668dc9
commit 2b4fa68808
4 changed files with 38 additions and 45 deletions
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@ -1439,7 +1439,7 @@ static LLVMValueRef ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
 * is in emit order; that is:
 * - during the epilogue, N is the threadidx (relative to the entire threadgroup)
 * - during vertex emit, i.e. while the API GS shader invocation is running,
- *   N = threadidx * gs_max_out_vertices + emitidx
+ *   N = threadidx * gs.vertices_out + emitidx
 *
 * Goals of the LDS memory layout:
 * 1. Eliminate bank conflicts on write for geometry shaders that have all emits
@ -1458,7 +1458,7 @@ static LLVMValueRef ngg_gs_get_vertex_storage(struct si_shader_context *ctx)
 *
 * Swizzling is required to satisfy points 1 and 2 simultaneously.
 *
- * Vertices are stored in export order (gsthread * gs_max_out_vertices + emitidx).
+ * Vertices are stored in export order (gsthread * gs.vertices_out + emitidx).
 * Indices are swizzled in groups of 32, which ensures point 1 without
 * disturbing point 2.
 *
@ -1470,8 +1470,8 @@ static LLVMValueRef ngg_gs_vertex_ptr(struct si_shader_context *ctx, LLVMValueRe
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef storage = ngg_gs_get_vertex_storage(ctx);

-   /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number */
-   unsigned write_stride_2exp = ffs(sel->gs_max_out_vertices) - 1;
+   /* gs.vertices_out = 2^(write_stride_2exp) * some odd number */
+   unsigned write_stride_2exp = ffs(sel->info.base.gs.vertices_out) - 1;
   if (write_stride_2exp) {
      LLVMValueRef row = LLVMBuildLShr(builder, vertexidx, LLVMConstInt(ctx->ac.i32, 5, false), "");
      LLVMValueRef swizzle = LLVMBuildAnd(
@ -1489,7 +1489,7 @@ static LLVMValueRef ngg_gs_emit_vertex_ptr(struct si_shader_context *ctx, LLVMVa
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef tmp;

-   tmp = LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false);
+   tmp = LLVMConstInt(ctx->ac.i32, sel->info.base.gs.vertices_out, false);
   tmp = LLVMBuildMul(builder, tmp, gsthread, "");
   const LLVMValueRef vertexidx = LLVMBuildAdd(builder, tmp, emitidx, "");
   return ngg_gs_vertex_ptr(ctx, vertexidx);
@ -1531,7 +1531,7 @@ void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LL
    */
   const LLVMValueRef can_emit =
      LLVMBuildICmp(builder, LLVMIntULT, vertexidx,
-                    LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
+                    LLVMConstInt(ctx->ac.i32, sel->info.base.gs.vertices_out, false), "");

   tmp = LLVMBuildAdd(builder, vertexidx, ctx->ac.i32_1, "");
   tmp = LLVMBuildSelect(builder, can_emit, tmp, vertexidx, "");
@ -1557,7 +1557,7 @@ void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LL
   /* Determine and store whether this vertex completed a primitive. */
   const LLVMValueRef curverts = LLVMBuildLoad(builder, ctx->gs_curprim_verts[stream], "");

-   tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->gs_output_prim) - 1, false);
+   tmp = LLVMConstInt(ctx->ac.i32, u_vertices_per_prim(sel->info.base.gs.output_primitive) - 1, false);
   const LLVMValueRef iscompleteprim = LLVMBuildICmp(builder, LLVMIntUGE, curverts, tmp, "");

   /* Since the geometry shader emits triangle strips, we need to
@ -1565,7 +1565,7 @@ void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LL
    * the correct vertex order.
    */
   LLVMValueRef is_odd = ctx->ac.i1false;
-   if (stream == 0 && u_vertices_per_prim(sel->gs_output_prim) == 3) {
+   if (stream == 0 && u_vertices_per_prim(sel->info.base.gs.output_primitive) == 3) {
      tmp = LLVMBuildAnd(builder, curverts, ctx->ac.i32_1, "");
      is_odd = LLVMBuildICmp(builder, LLVMIntEQ, tmp, ctx->ac.i32_1, "");
   }
@ -1615,7 +1615,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
 {
   const struct si_shader_selector *sel = ctx->shader->selector;
   const struct si_shader_info *info = &sel->info;
-   const unsigned verts_per_prim = u_vertices_per_prim(sel->gs_output_prim);
+   const unsigned verts_per_prim = u_vertices_per_prim(sel->info.base.gs.output_primitive);
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef i8_0 = LLVMConstInt(ctx->ac.i8, 0, false);
   LLVMValueRef tmp, tmp2;
@ -1637,7 +1637,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)

      const LLVMValueRef vertexidx = LLVMBuildLoad(builder, ctx->gs_next_vertex[stream], "");
      tmp = LLVMBuildICmp(builder, LLVMIntUGE, vertexidx,
-                          LLVMConstInt(ctx->ac.i32, sel->gs_max_out_vertices, false), "");
+                          LLVMConstInt(ctx->ac.i32, sel->info.base.gs.vertices_out, false), "");
      ac_build_ifcc(&ctx->ac, tmp, 5101);
      ac_build_break(&ctx->ac);
      ac_build_endif(&ctx->ac, 5101);
@ -1905,7 +1905,7 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
   const struct si_shader_selector *es_sel =
      shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
   const gl_shader_stage gs_stage = gs_sel->info.stage;
-   const unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1);
+   const unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1);
   const unsigned input_prim = si_get_input_prim(gs_sel);
   const bool use_adjacency =
      input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
@ -1946,7 +1946,7 @@ bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)

   if (gs_stage == MESA_SHADER_GEOMETRY) {
      bool force_multi_cycling = false;
-      unsigned max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices * gs_num_invocations;
+      unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations;

 retry_select_mode:
      if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) {
@ -1959,7 +1959,7 @@ retry_select_mode:
          * tessellation. */
         max_vert_out_per_gs_instance = true;
         max_gsprims_base = 1;
-         max_out_verts_per_gsprim = gs_sel->gs_max_out_vertices;
+         max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out;
      }

      esvert_lds_size = es_sel->esgs_itemsize / 4;
@ -2050,9 +2050,9 @@ retry_select_mode:

   unsigned max_out_vertices =
      max_vert_out_per_gs_instance
-         ? gs_sel->gs_max_out_vertices
+         ? gs_sel->info.base.gs.vertices_out
         : gs_stage == MESA_SHADER_GEOMETRY
-              ? max_gsprims * gs_num_invocations * gs_sel->gs_max_out_vertices
+              ? max_gsprims * gs_num_invocations * gs_sel->info.base.gs.vertices_out
              : max_esverts;
   assert(max_out_vertices <= 256);

@ -2060,7 +2060,7 @@ retry_select_mode:
   if (gs_stage == MESA_SHADER_GEOMETRY) {
      /* Number of output primitives per GS input primitive after
       * GS instancing. */
-      prim_amp_factor = gs_sel->gs_max_out_vertices;
+      prim_amp_factor = gs_sel->info.base.gs.vertices_out;
   }

   /* The GE only checks against the maximum number of ES verts after
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@ -433,11 +433,8 @@ struct si_shader_selector {
   uint16_t lshs_vertex_stride;

   /* GS parameters. */
-   uint16_t gs_max_out_vertices;
   uint16_t gsvs_vertex_size;
   ubyte gs_input_verts_per_prim;
-   ubyte gs_output_prim;
-   ubyte gs_num_invocations;
   ubyte max_gs_stream; /* count - 1 */
   unsigned max_gsvs_emit_size;
   uint16_t enabled_streamout_buffer_mask;
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
@ -295,7 +295,7 @@ static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVM
    */
   can_emit =
      LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, gs_next_vertex,
-                    LLVMConstInt(ctx->ac.i32, shader->selector->gs_max_out_vertices, 0), "");
+                    LLVMConstInt(ctx->ac.i32, shader->selector->info.base.gs.vertices_out, 0), "");

   bool use_kill = !info->base.writes_memory;
   if (use_kill) {
@ -313,7 +313,7 @@ static void si_llvm_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVM

         LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
         LLVMValueRef voffset =
-            LLVMConstInt(ctx->ac.i32, offset * shader->selector->gs_max_out_vertices, 0);
+            LLVMConstInt(ctx->ac.i32, offset * shader->selector->info.base.gs.vertices_out, 0);
         offset++;

         voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
@ -402,7 +402,7 @@ void si_preload_gs_rings(struct si_shader_context *ctx)
      if (!num_components)
         continue;

-      stride = 4 * num_components * sel->gs_max_out_vertices;
+      stride = 4 * num_components * sel->info.base.gs.vertices_out;

      /* Limit on the stride field for <= GFX7. */
      assert(stride < (1 << 14));
@ -535,7 +535,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen,
            }

            LLVMValueRef soffset =
-               LLVMConstInt(ctx.ac.i32, offset * gs_selector->gs_max_out_vertices * 16 * 4, 0);
+               LLVMConstInt(ctx.ac.i32, offset * gs_selector->info.base.gs.vertices_out * 16 * 4, 0);
            offset++;

            outputs[i].values[chan] =
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@ -617,7 +617,7 @@ static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
 void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *gs,
                      struct gfx9_gs_info *out)
 {
-   unsigned gs_num_invocations = MAX2(gs->gs_num_invocations, 1);
+   unsigned gs_num_invocations = MAX2(gs->info.base.gs.invocations, 1);
   unsigned input_prim = gs->info.base.gs.input_primitive;
   bool uses_adjacency =
      input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY;
@ -644,9 +644,9 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *
   /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
    * Make sure we don't go over the maximum value.
    */
-   if (gs->gs_max_out_vertices > 0) {
+   if (gs->info.base.gs.vertices_out > 0) {
      max_gs_prims =
-         MIN2(max_gs_prims, max_out_prims / (gs->gs_max_out_vertices * gs_num_invocations));
+         MIN2(max_gs_prims, max_out_prims / (gs->info.base.gs.vertices_out * gs_num_invocations));
   }
   assert(max_gs_prims > 0);

@ -701,7 +701,7 @@ void gfx9_get_gs_info(struct si_shader_selector *es, struct si_shader_selector *
   out->es_verts_per_subgroup = es_verts;
   out->gs_prims_per_subgroup = gs_prims;
   out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
-   out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup * gs->gs_max_out_vertices;
+   out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup * gs->info.base.gs.vertices_out;
   out->esgs_ring_size = esgs_lds_size;

   assert(out->max_prims_per_subgroup <= max_out_prims);
@ -772,7 +772,7 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 {
   struct si_shader_selector *sel = shader->selector;
   const ubyte *num_components = sel->info.num_stream_output_components;
-   unsigned gs_num_invocations = sel->gs_num_invocations;
+   unsigned gs_num_invocations = sel->info.base.gs.invocations;
   struct si_pm4_state *pm4;
   uint64_t va;
   unsigned max_stream = sel->max_gs_stream;
@ -784,25 +784,25 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)

   pm4->atom.emit = si_emit_shader_gs;

-   offset = num_components[0] * sel->gs_max_out_vertices;
+   offset = num_components[0] * sel->info.base.gs.vertices_out;
   shader->ctx_reg.gs.vgt_gsvs_ring_offset_1 = offset;

   if (max_stream >= 1)
-      offset += num_components[1] * sel->gs_max_out_vertices;
+      offset += num_components[1] * sel->info.base.gs.vertices_out;
   shader->ctx_reg.gs.vgt_gsvs_ring_offset_2 = offset;

   if (max_stream >= 2)
-      offset += num_components[2] * sel->gs_max_out_vertices;
+      offset += num_components[2] * sel->info.base.gs.vertices_out;
   shader->ctx_reg.gs.vgt_gsvs_ring_offset_3 = offset;

   if (max_stream >= 3)
-      offset += num_components[3] * sel->gs_max_out_vertices;
+      offset += num_components[3] * sel->info.base.gs.vertices_out;
   shader->ctx_reg.gs.vgt_gsvs_ring_itemsize = offset;

   /* The GSVS_RING_ITEMSIZE register takes 15 bits */
   assert(offset < (1 << 15));

-   shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->gs_max_out_vertices;
+   shader->ctx_reg.gs.vgt_gs_max_vert_out = sel->info.base.gs.vertices_out;

   shader->ctx_reg.gs.vgt_gs_vert_itemsize = num_components[0];
   shader->ctx_reg.gs.vgt_gs_vert_itemsize_1 = (max_stream >= 1) ? num_components[1] : 0;
@ -1067,7 +1067,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader
   bool window_space = gs_info->stage == MESA_SHADER_VERTEX ?
                          gs_info->base.vs.window_space_position : 0;
   bool es_enable_prim_id = shader->key.mono.u.vs_export_prim_id || es_info->uses_primid;
-   unsigned gs_num_invocations = MAX2(gs_sel->gs_num_invocations, 1);
+   unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1);
   unsigned input_prim = si_get_input_prim(gs_sel);
   bool break_wave_at_eoi = false;
   struct si_pm4_state *pm4 = si_get_shader_pm4_state(shader);
@ -1187,7 +1187,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader

   if (gs_stage == MESA_SHADER_GEOMETRY) {
      shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = es_sel->esgs_itemsize / 4;
-      shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->gs_max_out_vertices;
+      shader->ctx_reg.ngg.vgt_gs_max_vert_out = gs_sel->info.base.gs.vertices_out;
   } else {
      shader->ctx_reg.ngg.vgt_esgs_ring_itemsize = 1;
   }
@ -1375,7 +1375,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
      shader->ctx_reg.vs.vgt_primitiveid_en = enable_prim_id;
   } else {
      shader->ctx_reg.vs.vgt_gs_mode =
-         ac_vgt_gs_mode(gs->gs_max_out_vertices, sscreen->info.chip_class);
+         ac_vgt_gs_mode(gs->info.base.gs.vertices_out, sscreen->info.chip_class);
      shader->ctx_reg.vs.vgt_primitiveid_en = 0;
   }

@ -2629,17 +2629,13 @@ static void *si_create_shader_selector(struct pipe_context *ctx,

   switch (sel->info.stage) {
   case MESA_SHADER_GEOMETRY:
-      sel->gs_output_prim = sel->info.base.gs.output_primitive;
-
      /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
-      sel->rast_prim = sel->gs_output_prim;
+      sel->rast_prim = sel->info.base.gs.output_primitive;
      if (util_rast_prim_is_triangles(sel->rast_prim))
         sel->rast_prim = PIPE_PRIM_TRIANGLES;

-      sel->gs_max_out_vertices = sel->info.base.gs.vertices_out;
-      sel->gs_num_invocations = sel->info.base.gs.invocations;
      sel->gsvs_vertex_size = sel->info.num_outputs * 16;
-      sel->max_gsvs_emit_size = sel->gsvs_vertex_size * sel->gs_max_out_vertices;
+      sel->max_gsvs_emit_size = sel->gsvs_vertex_size * sel->info.base.gs.vertices_out;

      sel->max_gs_stream = 0;
      for (i = 0; i < sel->so.num_outputs; i++)
@ -2650,12 +2646,12 @@ static void *si_create_shader_selector(struct pipe_context *ctx,

      /* EN_MAX_VERT_OUT_PER_GS_INSTANCE does not work with tesselation so
       * we can't split workgroups. Disable ngg if any of the following conditions is true:
-       * - num_invocations * gs_max_out_vertices > 256
+       * - num_invocations * gs.vertices_out > 256
       * - LDS usage is too high
       */
      sel->tess_turns_off_ngg = sscreen->info.chip_class >= GFX10 &&
-                                (sel->gs_num_invocations * sel->gs_max_out_vertices > 256 ||
-                                 sel->gs_num_invocations * sel->gs_max_out_vertices *
+                                (sel->info.base.gs.invocations * sel->info.base.gs.vertices_out > 256 ||
+                                 sel->info.base.gs.invocations * sel->info.base.gs.vertices_out *
                                 (sel->info.num_outputs * 4 + 1) > 6500 /* max dw per GS primitive */);
      break;