ac/llvm: add gl_shader_stage parameter into ac_build_s_barrier

This will be used later.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16304>
This commit is contained in:
Marek Olšák 2022-05-02 21:38:07 -04:00 committed by Marge Bot
parent fb51a3c4b0
commit e4882d6b7e
7 changed files with 33 additions and 29 deletions

View File

@ -388,7 +388,7 @@ LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigne
return phi;
}
void ac_build_s_barrier(struct ac_llvm_context *ctx)
void ac_build_s_barrier(struct ac_llvm_context *ctx, gl_shader_stage stage)
{
ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
}
@ -4019,7 +4019,7 @@ void ac_build_wg_wavescan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan
void ac_build_wg_wavescan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
ac_build_wg_wavescan_top(ctx, ws);
ac_build_s_barrier(ctx);
ac_build_s_barrier(ctx, ws->stage);
ac_build_wg_wavescan_bottom(ctx, ws);
}
@ -4081,7 +4081,7 @@ void ac_build_wg_scan_bottom(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
void ac_build_wg_scan(struct ac_llvm_context *ctx, struct ac_wg_scan *ws)
{
ac_build_wg_scan_top(ctx, ws);
ac_build_s_barrier(ctx);
ac_build_s_barrier(ctx, ws->stage);
ac_build_wg_scan_bottom(ctx, ws);
}

View File

@ -175,7 +175,7 @@ void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize);
LLVMValueRef ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type, unsigned count_incoming,
LLVMValueRef *values, LLVMBasicBlockRef *blocks);
void ac_build_s_barrier(struct ac_llvm_context *ctx);
void ac_build_s_barrier(struct ac_llvm_context *ctx, gl_shader_stage stage);
void ac_build_optimization_barrier(struct ac_llvm_context *ctx, LLVMValueRef *pgpr, bool sgpr);
LLVMValueRef ac_build_shader_clock(struct ac_llvm_context *ctx, nir_scope scope);
@ -502,6 +502,7 @@ LLVMValueRef ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_
* values across an entire workgroup, while respecting the order of waves.
*/
struct ac_wg_scan {
gl_shader_stage stage;
bool enable_reduce;
bool enable_exclusive;
bool enable_inclusive;

View File

@ -2959,7 +2959,7 @@ void ac_emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage)
if (ac->chip_class == GFX6 && stage == MESA_SHADER_TESS_CTRL)
return;
ac_build_s_barrier(ac);
ac_build_s_barrier(ac, stage);
}
static void emit_discard(struct ac_nir_context *ctx, const nir_intrinsic_instr *instr)

View File

@ -1338,7 +1338,7 @@ handle_ngg_outputs_post_2(struct radv_shader_context *ctx)
if (ctx->stage == MESA_SHADER_VERTEX) {
/* Wait for GS stores to finish. */
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
tmp = ac_build_gep0(&ctx->ac, ctx->esgs_ring, get_thread_id_in_tg(ctx));
values[0] = LLVMBuildLoad(builder, tmp, "");
@ -1384,7 +1384,7 @@ gfx10_ngg_gs_emit_prologue(struct radv_shader_context *ctx)
LLVMBuildBr(ctx->ac.builder, merge_block);
LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block);
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
static void
@ -1459,7 +1459,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
LLVMBuilderRef builder = ctx->ac.builder;
LLVMValueRef tmp, tmp2;
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
const LLVMValueRef tid = get_thread_id_in_tg(ctx);
LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
@ -1522,6 +1522,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
/* Inclusive scan addition across the current wave. */
LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
struct ac_wg_scan vertlive_scan = {0};
vertlive_scan.stage = ctx->stage;
vertlive_scan.op = nir_op_iadd;
vertlive_scan.enable_reduce = true;
vertlive_scan.enable_exclusive = true;
@ -1564,7 +1565,7 @@ gfx10_ngg_gs_emit_epilogue_2(struct radv_shader_context *ctx)
}
ac_build_endif(&ctx->ac, 5130);
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
/* Export primitive data */
tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");
@ -2076,7 +2077,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
/* GFX10 hang workaround - there needs to be an s_barrier before gs_alloc_req always */
if (ctx.ac.chip_class == GFX10 && shader_count == 1)
ac_build_s_barrier(&ctx.ac);
ac_build_s_barrier(&ctx.ac, shaders[0]->info.stage);
}
for (int shader_idx = 0; shader_idx < shader_count; ++shader_idx) {
@ -2149,7 +2150,7 @@ ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
* and contains a barrier, it will wait there and then
* reach s_endpgm.
*/
ac_emit_barrier(&ctx.ac, ctx.stage);
ac_build_s_barrier(&ctx.ac, shaders[shader_idx]->info.stage);
}
nir_foreach_shader_out_variable(variable, shaders[shader_idx]) scan_shader_output_decl(

View File

@ -481,6 +481,7 @@ static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout
if (!info->num_stream_output_components[stream])
continue;
primemit_scan[stream].stage = ctx->stage;
primemit_scan[stream].enable_exclusive = true;
primemit_scan[stream].op = nir_op_iadd;
primemit_scan[stream].src = nggso->prim_enable[stream];
@ -499,7 +500,7 @@ static void build_streamout(struct si_shader_context *ctx, struct ngg_streamout
}
}
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
/* Fetch the per-buffer offsets and per-stream emit counts in all waves. */
LLVMValueRef wgoffset_dw[4] = {};
@ -1022,7 +1023,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
builder, packed_data,
ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
LLVMValueRef tid = ac_get_thread_id(&ctx->ac);
@ -1141,7 +1142,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
cull_primitive(ctx, pos, clipdist_accepted, gs_accepted, gs_vtxptr);
}
ac_build_endif(&ctx->ac, 16002);
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
gs_accepted = LLVMBuildLoad(builder, gs_accepted, "");
@ -1171,7 +1172,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
}
ac_build_endif(&ctx->ac, 16008);
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
/* Load the vertex masks and compute the new ES thread count. */
LLVMValueRef new_num_es_threads, prefix_sum, kill_wave;
@ -1262,7 +1263,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
ac_build_s_endpgm(&ctx->ac);
}
ac_build_endif(&ctx->ac, 19202);
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
/* Send the final vertex and primitive counts. */
ac_build_sendmsg_gs_alloc_req(&ctx->ac, get_wave_id_in_tg(ctx), new_num_es_threads,
@ -1408,7 +1409,7 @@ void gfx10_emit_ngg_culling_epilogue(struct ac_shader_abi *abi)
/* These two also use LDS. */
if (gfx10_ngg_writes_user_edgeflags(shader) ||
(ctx->stage == MESA_SHADER_VERTEX && shader->key.ge.mono.u.vs_export_prim_id))
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
ctx->return_value = ret;
}
@ -1512,7 +1513,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi)
/* Streamout already inserted the barrier, so don't insert it again. */
if (!ctx->so.num_outputs)
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
/* Load edge flags from ES threads and store them into VGPRs in GS threads. */
@ -1536,7 +1537,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi)
/* Streamout and edge flags use LDS. Make it idle, so that we can reuse it. */
if (ctx->so.num_outputs || gfx10_ngg_writes_user_edgeflags(ctx->shader))
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
ac_build_ifcc(&ctx->ac, is_gs_thread, 5400);
/* Extract the PROVOKING_VTX_INDEX field. */
@ -1630,7 +1631,7 @@ void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi)
if (ctx->stage == MESA_SHADER_VERTEX) {
/* Wait for GS stores to finish. */
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
tmp = ngg_nogs_vertex_ptr(ctx, gfx10_get_thread_id_in_tg(ctx));
tmp = ac_build_gep0(&ctx->ac, tmp, ctx->ac.i32_0);
@ -1861,7 +1862,7 @@ void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx)
}
ac_build_endif(&ctx->ac, 15090);
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
@ -1925,7 +1926,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
const LLVMValueRef tid = gfx10_get_thread_id_in_tg(ctx);
LLVMValueRef num_emit_threads = ngg_get_prim_cnt(ctx);
@ -2003,7 +2004,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
/* Wait for streamout to finish before we kill primitives. */
if (ctx->so.num_outputs)
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
ac_build_ifcc(&ctx->ac, prim_enable, 0);
{
@ -2061,7 +2062,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
ac_build_endif(&ctx->ac, 0);
}
ac_build_endif(&ctx->ac, 0);
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
/* Determine vertex liveness. */
@ -2096,6 +2097,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
/* Inclusive scan addition across the current wave. */
LLVMValueRef vertlive = LLVMBuildLoad(builder, vertliveptr, "");
struct ac_wg_scan vertlive_scan = {};
vertlive_scan.stage = ctx->stage;
vertlive_scan.op = nir_op_iadd;
vertlive_scan.enable_reduce = true;
vertlive_scan.enable_exclusive = true;
@ -2129,7 +2131,7 @@ void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx)
}
ac_build_endif(&ctx->ac, 5130);
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
/* Export primitive data */
tmp = LLVMBuildICmp(builder, LLVMIntULT, tid, num_emit_threads, "");

View File

@ -957,7 +957,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
shader->key.ge.as_ngg && !shader->key.ge.as_es && !shader->key.ge.opt.ngg_culling) {
/* GFX10 requires a barrier before gs_alloc_req due to a hw bug. */
if (ctx->screen->info.chip_class == GFX10)
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
@ -1016,10 +1016,10 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
if (!shader->key.ge.opt.same_patch_vertices ||
shader->selector->info.base.inputs_read &
~shader->selector->info.tcs_vgpr_only_inputs)
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
} else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.ge.as_ngg) {
/* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. */
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
}

View File

@ -690,7 +690,7 @@ static void si_write_tess_factors(struct si_shader_context *ctx, LLVMValueRef re
* a single wave due to a bug workaround disallowing multi-wave HS workgroups.
*/
if (ctx->screen->info.chip_class != GFX6)
ac_build_s_barrier(&ctx->ac);
ac_build_s_barrier(&ctx->ac, ctx->stage);
}
/* Do this only for invocation 0, because the tess levels are per-patch,