aco: implement GS copy shaders

v5: rebase on float_controls changes
v7: rebase after shader args MR and load/store vectorizer MR

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/2421>
Rhys Perry 2019-11-15 11:31:03 +00:00 committed by Marge Bot
parent de4ce66f5c
commit f8f7712666
4 changed files with 327 additions and 148 deletions

src/amd/compiler/aco_instruction_selection.cpp

@@ -25,6 +25,7 @@
#include <algorithm>
#include <array>
#include <stack>
#include <map>
#include "ac_shader_util.h"
@@ -8534,7 +8535,7 @@ static void create_vs_exports(isel_context *ctx)
if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
if (ctx->options->key.vs_common_out.export_clip_dists) {
if (ctx->export_clip_dists) {
if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
@@ -8568,7 +8569,7 @@ static void emit_stream_output(isel_context *ctx,
Temp out[4];
bool all_undef = true;
assert(ctx->stage == vertex_vs);
assert(ctx->stage == vertex_vs || ctx->stage == gs_copy_vs);
for (unsigned i = 0; i < num_comps; i++) {
out[i] = ctx->vsgs_output.outputs[loc][start + i];
all_undef = all_undef && !out[i].id();
@@ -8804,13 +8805,24 @@ void setup_fp_mode(isel_context *ctx, nir_shader *shader)
ctx->block->fp_mode = program->next_fp_mode;
}
void cleanup_cfg(Program *program)
{
/* create linear_succs/logical_succs */
for (Block& BB : program->blocks) {
for (unsigned idx : BB.linear_preds)
program->blocks[idx].linear_succs.emplace_back(BB.index);
for (unsigned idx : BB.logical_preds)
program->blocks[idx].logical_succs.emplace_back(BB.index);
}
}
void select_program(Program *program,
unsigned shader_count,
struct nir_shader *const *shaders,
ac_shader_config* config,
struct radv_shader_args *args)
{
isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args);
isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
for (unsigned i = 0; i < shader_count; i++) {
nir_shader *nir = shaders[i];
@@ -8879,12 +8891,162 @@ void select_program(Program *program,
bld.smem(aco_opcode::s_dcache_wb, false);
bld.sopp(aco_opcode::s_endpgm);
/* cleanup CFG */
for (Block& BB : program->blocks) {
for (unsigned idx : BB.linear_preds)
program->blocks[idx].linear_succs.emplace_back(BB.index);
for (unsigned idx : BB.logical_preds)
program->blocks[idx].logical_succs.emplace_back(BB.index);
cleanup_cfg(program);
}
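
Hoisting the CFG cleanup into cleanup_cfg() lets the new select_gs_copy_shader() below reuse it: isel only records predecessor edges while emitting blocks, and the successor lists are derived afterwards by inverting them. A minimal standalone sketch of that inversion (simplified Block type and hypothetical helper name, not part of the patch):

    #include <vector>

    /* simplified stand-in for aco's Block; the real type carries far
     * more state */
    struct Block {
        unsigned index;
        std::vector<unsigned> linear_preds, linear_succs;
        std::vector<unsigned> logical_preds, logical_succs;
    };

    /* mirrors what cleanup_cfg() does: for every recorded predecessor
     * edge, append the corresponding successor edge */
    void create_succs(std::vector<Block>& blocks)
    {
        for (Block& bb : blocks) {
            for (unsigned pred : bb.linear_preds)
                blocks[pred].linear_succs.push_back(bb.index);
            for (unsigned pred : bb.logical_preds)
                blocks[pred].logical_succs.push_back(bb.index);
        }
    }
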
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
ac_shader_config* config,
struct radv_shader_args *args)
{
isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
program->next_fp_mode.must_flush_denorms32 = false;
program->next_fp_mode.must_flush_denorms16_64 = false;
program->next_fp_mode.care_about_round32 = false;
program->next_fp_mode.care_about_round16_64 = false;
program->next_fp_mode.denorm16_64 = fp_denorm_keep;
program->next_fp_mode.denorm32 = 0;
program->next_fp_mode.round32 = fp_round_ne;
program->next_fp_mode.round16_64 = fp_round_ne;
ctx.block->fp_mode = program->next_fp_mode;
add_startpgm(&ctx);
append_logical_start(ctx.block);
Builder bld(ctx.program, ctx.block);
Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), program->private_segment_buffer, Operand(RING_GSVS_VS * 16u));
Operand stream_id(0u);
if (args->shader_info->so.num_outputs)
stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
get_arg(&ctx, ctx.args->streamout_config), Operand(0x20018u));
Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), get_arg(&ctx, ctx.args->ac.vertex_id));
std::stack<Block> endif_blocks;
for (unsigned stream = 0; stream < 4; stream++) {
if (stream_id.isConstant() && stream != stream_id.constantValue())
continue;
unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
continue;
memset(ctx.vsgs_output.mask, 0, sizeof(ctx.vsgs_output.mask));
unsigned BB_if_idx = ctx.block->index;
Block BB_endif = Block();
if (!stream_id.isConstant()) {
/* begin IF */
Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream));
append_logical_end(ctx.block);
ctx.block->kind |= block_kind_uniform;
bld.branch(aco_opcode::p_cbranch_z, cond);
BB_endif.kind |= ctx.block->kind & block_kind_top_level;
ctx.block = ctx.program->create_and_insert_block();
add_edge(BB_if_idx, ctx.block);
bld.reset(ctx.block);
append_logical_start(ctx.block);
}
unsigned offset = 0;
for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
if (args->shader_info->gs.output_streams[i] != stream)
continue;
unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
unsigned length = util_last_bit(output_usage_mask);
for (unsigned j = 0; j < length; ++j) {
if (!(output_usage_mask & (1 << j)))
continue;
unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
Temp voffset = vtx_offset;
if (const_offset >= 4096u) {
voffset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), voffset);
const_offset %= 4096u;
}
aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)};
mubuf->definitions[0] = bld.def(v1);
mubuf->operands[0] = Operand(voffset);
mubuf->operands[1] = Operand(gsvs_ring);
mubuf->operands[2] = Operand(0u);
mubuf->offen = true;
mubuf->offset = const_offset;
mubuf->glc = true;
mubuf->slc = true;
mubuf->dlc = args->options->chip_class >= GFX10;
mubuf->barrier = barrier_none;
mubuf->can_reorder = true;
ctx.vsgs_output.mask[i] |= 1 << j;
ctx.vsgs_output.outputs[i][j] = mubuf->definitions[0].getTemp();
bld.insert(std::move(mubuf));
offset++;
}
}
if (args->shader_info->so.num_outputs) {
emit_streamout(&ctx, stream);
bld.reset(ctx.block);
}
if (stream == 0) {
create_vs_exports(&ctx);
ctx.block->kind |= block_kind_export_end;
}
if (!stream_id.isConstant()) {
append_logical_end(ctx.block);
/* branch from then block to endif block */
bld.branch(aco_opcode::p_branch);
add_edge(ctx.block->index, &BB_endif);
ctx.block->kind |= block_kind_uniform;
/* emit else block */
ctx.block = ctx.program->create_and_insert_block();
add_edge(BB_if_idx, ctx.block);
bld.reset(ctx.block);
append_logical_start(ctx.block);
endif_blocks.push(std::move(BB_endif));
}
}
while (!endif_blocks.empty()) {
Block BB_endif = std::move(endif_blocks.top());
endif_blocks.pop();
Block *BB_else = ctx.block;
append_logical_end(BB_else);
/* branch from else block to endif block */
bld.branch(aco_opcode::p_branch);
add_edge(BB_else->index, &BB_endif);
BB_else->kind |= block_kind_uniform;
/* emit endif merge block */
ctx.block = program->insert_block(std::move(BB_endif));
bld.reset(ctx.block);
append_logical_start(ctx.block);
}
program->config->float_mode = program->blocks[0].fp_mode.val;
append_logical_end(ctx.block);
ctx.block->kind |= block_kind_uniform;
bld.sopp(aco_opcode::s_endpgm);
cleanup_cfg(program);
}
}
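
For reference, the buffer_load_dword addressing in select_gs_copy_shader() splits each component's GSVS ring address into MUBUF's 12-bit immediate offset and a per-vertex voffset, and the s_bfe_u32 with 0x20018 extracts a 2-bit stream id starting at bit 24 of the streamout config. A standalone sketch of the arithmetic with hypothetical inputs, not part of the patch:

    #include <cassert>
    #include <cstdio>

    int main()
    {
        /* hypothetical inputs: 8 vertices emitted per GS invocation,
         * 9th output component in the stream, thread handles vertex 3 */
        unsigned vertices_out = 8;
        unsigned component = 9;   /* the running 'offset' in the loop */
        unsigned vertex_id = 3;
        unsigned streamout_config = 1u << 24;

        /* s_bfe_u32 with 0x20018 = width 2, offset 24 */
        unsigned stream_id = (streamout_config >> 24) & 0x3;
        assert(stream_id == 1);

        unsigned vtx_offset = vertex_id << 2;                       /* dwords -> bytes */
        unsigned const_offset = component * vertices_out * 16 * 4;  /* per-component base */

        /* MUBUF's immediate offset field is 12 bits (0..4095), so the
         * overflow is folded into the per-thread voffset */
        unsigned voffset = vtx_offset;
        if (const_offset >= 4096u) {
            voffset += const_offset / 4096u * 4096u;
            const_offset %= 4096u;
        }
        printf("voffset=%u imm_offset=%u\n", voffset, const_offset); /* 4108, 512 */
        return 0;
    }
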

src/amd/compiler/aco_instruction_selection_setup.cpp

@@ -85,6 +85,7 @@ struct isel_context {
uint64_t output_masks[MESA_SHADER_COMPUTE];
/* VS output information */
bool export_clip_dists;
unsigned num_clip_distances;
unsigned num_cull_distances;
@@ -661,6 +662,54 @@ mem_vectorize_callback(unsigned align, unsigned bit_size,
return false;
}
void
setup_vs_output_info(isel_context *ctx, nir_shader *nir,
bool export_prim_id, bool export_clip_dists,
radv_vs_output_info *outinfo)
{
memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
sizeof(outinfo->vs_output_param_offset));
outinfo->param_exports = 0;
int pos_written = 0x1;
if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer)
pos_written |= 1 << 1;
uint64_t mask = ctx->output_masks[nir->info.stage];
while (mask) {
int idx = u_bit_scan64(&mask);
if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID ||
((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) {
if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED)
outinfo->vs_output_param_offset[idx] = outinfo->param_exports++;
}
}
if (outinfo->writes_layer &&
outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
/* when ctx->options->key.has_multiview_view_index = true, the layer
* variable isn't declared in NIR and it's isel's job to get the layer */
outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
}
if (export_prim_id) {
assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
}
ctx->export_clip_dists = export_clip_dists;
ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);
assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);
if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
pos_written |= 1 << 2;
if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
pos_written |= 1 << 3;
outinfo->pos_exports = util_bitcount(pos_written);
}
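
The pos_written mask built above assigns one bit per position-export slot: bit 0 is the position itself, bit 1 the pointsize/layer/viewport-index vec4, and bits 2-3 the two clip/cull-distance vec4s; pos_exports is simply its popcount. A worked example under assumed shader outputs, not part of the patch:

    #include <cstdio>

    int main()
    {
        /* assumed shader: writes pointsize plus 4 clip and 2 cull distances */
        bool writes_pointsize = true;
        unsigned num_clip_distances = 4, num_cull_distances = 2;

        int pos_written = 0x1;        /* position is always exported */
        if (writes_pointsize)
            pos_written |= 1 << 1;    /* psize/layer/viewport-index vec4 */
        if (num_clip_distances + num_cull_distances > 0)
            pos_written |= 1 << 2;    /* VARYING_SLOT_CLIP_DIST0 */
        if (num_clip_distances + num_cull_distances > 4)
            pos_written |= 1 << 3;    /* VARYING_SLOT_CLIP_DIST1 */

        /* __builtin_popcount stands in for util_bitcount; prints
         * pos_written=0xf pos_exports=4 */
        printf("pos_written=0x%x pos_exports=%d\n",
               pos_written, __builtin_popcount(pos_written));
        return 0;
    }
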
void
setup_vs_variables(isel_context *ctx, nir_shader *nir)
{
@@ -681,49 +730,8 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir)
if (ctx->stage == vertex_vs) {
radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
sizeof(outinfo->vs_output_param_offset));
bool export_clip_dists = ctx->options->key.vs_common_out.export_clip_dists;
outinfo->param_exports = 0;
int pos_written = 0x1;
if (outinfo->writes_pointsize || outinfo->writes_viewport_index || outinfo->writes_layer)
pos_written |= 1 << 1;
uint64_t mask = ctx->output_masks[nir->info.stage];
while (mask) {
int idx = u_bit_scan64(&mask);
if (idx >= VARYING_SLOT_VAR0 || idx == VARYING_SLOT_LAYER || idx == VARYING_SLOT_PRIMITIVE_ID ||
((idx == VARYING_SLOT_CLIP_DIST0 || idx == VARYING_SLOT_CLIP_DIST1) && export_clip_dists)) {
if (outinfo->vs_output_param_offset[idx] == AC_EXP_PARAM_UNDEFINED)
outinfo->vs_output_param_offset[idx] = outinfo->param_exports++;
}
}
if (outinfo->writes_layer &&
outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] == AC_EXP_PARAM_UNDEFINED) {
/* when ctx->options->key.has_multiview_view_index = true, the layer
* variable isn't declared in NIR and it's isel's job to get the layer */
outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = outinfo->param_exports++;
}
if (outinfo->export_prim_id) {
assert(outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] == AC_EXP_PARAM_UNDEFINED);
outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = outinfo->param_exports++;
}
ctx->num_clip_distances = util_bitcount(outinfo->clip_dist_mask);
ctx->num_cull_distances = util_bitcount(outinfo->cull_dist_mask);
assert(ctx->num_clip_distances + ctx->num_cull_distances <= 8);
if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
pos_written |= 1 << 2;
if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
pos_written |= 1 << 3;
outinfo->pos_exports = util_bitcount(pos_written);
setup_vs_output_info(ctx, nir, outinfo->export_prim_id,
ctx->options->key.vs_common_out.export_clip_dists, outinfo);
} else if (ctx->stage == vertex_geometry_gs || ctx->stage == vertex_es) {
/* TODO: radv_nir_shader_info_pass() already sets this but it's larger
than it needs to be. In order to set it better, we have to improve
@@ -824,12 +832,80 @@ get_io_masks(isel_context *ctx, unsigned shader_count, struct nir_shader *const
}
}
void
setup_nir(isel_context *ctx, nir_shader *nir)
{
Program *program = ctx->program;
/* align and copy constant data */
while (program->constant_data.size() % 4u)
program->constant_data.push_back(0);
ctx->constant_data_offset = program->constant_data.size();
program->constant_data.insert(program->constant_data.end(),
(uint8_t*)nir->constant_data,
(uint8_t*)nir->constant_data + nir->constant_data_size);
/* the variable setup has to be done before lower_io / CSE */
setup_variables(ctx, nir);
/* optimize and lower memory operations */
bool lower_to_scalar = false;
bool lower_pack = false;
if (nir_opt_load_store_vectorize(nir,
(nir_variable_mode)(nir_var_mem_ssbo | nir_var_mem_ubo |
nir_var_mem_push_const | nir_var_mem_shared),
mem_vectorize_callback)) {
lower_to_scalar = true;
lower_pack = true;
}
if (nir->info.stage != MESA_SHADER_COMPUTE)
nir_lower_io(nir, (nir_variable_mode)(nir_var_shader_in | nir_var_shader_out), type_size, (nir_lower_io_options)0);
nir_lower_explicit_io(nir, nir_var_mem_global, nir_address_format_64bit_global);
if (lower_to_scalar)
nir_lower_alu_to_scalar(nir, NULL, NULL);
if (lower_pack)
nir_lower_pack(nir);
/* lower ALU operations */
// TODO: implement logic64 in aco, it's more effective for sgprs
nir_lower_int64(nir, nir->options->lower_int64_options);
nir_opt_idiv_const(nir, 32);
nir_lower_idiv(nir, nir_lower_idiv_precise);
/* optimize the lowered ALU operations */
bool more_algebraic = true;
while (more_algebraic) {
more_algebraic = false;
NIR_PASS_V(nir, nir_copy_prop);
NIR_PASS_V(nir, nir_opt_dce);
NIR_PASS_V(nir, nir_opt_constant_folding);
NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
}
/* cleanup passes */
nir_lower_load_const_to_scalar(nir);
nir_opt_shrink_load(nir);
nir_move_options move_opts = (nir_move_options)(
nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | nir_move_comparisons);
nir_opt_sink(nir, move_opts);
nir_opt_move(nir, move_opts);
nir_convert_to_lcssa(nir, true, false);
nir_lower_phis_to_scalar(nir);
nir_function_impl *func = nir_shader_get_entrypoint(nir);
nir_index_ssa_defs(func);
nir_metadata_require(func, nir_metadata_block_index);
}
isel_context
setup_isel_context(Program* program,
unsigned shader_count,
struct nir_shader *const *shaders,
ac_shader_config* config,
struct radv_shader_args *args)
struct radv_shader_args *args,
bool is_gs_copy_shader)
{
program->stage = 0;
for (unsigned i = 0; i < shader_count; i++) {
@@ -844,7 +920,7 @@ setup_isel_context(Program* program,
program->stage |= sw_tes;
break;
case MESA_SHADER_GEOMETRY:
program->stage |= sw_gs;
program->stage |= is_gs_copy_shader ? sw_gs_copy : sw_gs;
break;
case MESA_SHADER_FRAGMENT:
program->stage |= sw_fs;
@@ -868,6 +944,8 @@ setup_isel_context(Program* program,
program->stage |= hw_fs;
else if (program->stage == sw_cs)
program->stage |= hw_cs;
else if (program->stage == sw_gs_copy)
program->stage |= hw_vs;
else if (program->stage == (sw_vs | sw_gs) && gfx9_plus && !ngg)
program->stage |= hw_gs;
else
@@ -918,94 +996,25 @@
get_io_masks(&ctx, shader_count, shaders);
for (unsigned i = 0; i < shader_count; i++) {
nir_shader *nir = shaders[i];
unsigned scratch_size = 0;
if (program->stage == gs_copy_vs) {
assert(shader_count == 1);
setup_vs_output_info(&ctx, shaders[0], false, true, &args->shader_info->vs.outinfo);
} else {
for (unsigned i = 0; i < shader_count; i++) {
nir_shader *nir = shaders[i];
setup_nir(&ctx, nir);
/* align and copy constant data */
while (program->constant_data.size() % 4u)
program->constant_data.push_back(0);
ctx.constant_data_offset = program->constant_data.size();
program->constant_data.insert(program->constant_data.end(),
(uint8_t*)nir->constant_data,
(uint8_t*)nir->constant_data + nir->constant_data_size);
/* the variable setup has to be done before lower_io / CSE */
setup_variables(&ctx, nir);
/* optimize and lower memory operations */
bool lower_to_scalar = false;
bool lower_pack = false;
if (nir_opt_load_store_vectorize(nir,
(nir_variable_mode)(nir_var_mem_ssbo | nir_var_mem_ubo |
nir_var_mem_push_const | nir_var_mem_shared),
mem_vectorize_callback)) {
lower_to_scalar = true;
lower_pack = true;
}
if (nir->info.stage != MESA_SHADER_COMPUTE)
nir_lower_io(nir, (nir_variable_mode)(nir_var_shader_in | nir_var_shader_out), type_size, (nir_lower_io_options)0);
nir_lower_explicit_io(nir, nir_var_mem_global, nir_address_format_64bit_global);
if (lower_to_scalar)
nir_lower_alu_to_scalar(nir, NULL, NULL);
if (lower_pack)
nir_lower_pack(nir);
/* lower ALU operations */
// TODO: implement logic64 in aco, it's more effective for sgprs
nir_lower_int64(nir, nir->options->lower_int64_options);
nir_opt_idiv_const(nir, 32);
nir_lower_idiv(nir, nir_lower_idiv_precise);
/* optimize the lowered ALU operations */
bool more_algebraic = true;
while (more_algebraic) {
more_algebraic = false;
NIR_PASS_V(nir, nir_copy_prop);
NIR_PASS_V(nir, nir_opt_dce);
NIR_PASS_V(nir, nir_opt_constant_folding);
NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
if (args->options->dump_preoptir) {
fprintf(stderr, "NIR shader before instruction selection:\n");
nir_print_shader(nir, stderr);
}
}
/* Do late algebraic optimization to turn add(a, neg(b)) back into
* subs, then the mandatory cleanup after algebraic. Note that it may
* produce fnegs, and if so then we need to keep running to squash
* fneg(fneg(a)).
*/
bool more_late_algebraic = true;
while (more_late_algebraic) {
more_late_algebraic = false;
NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
NIR_PASS_V(nir, nir_opt_constant_folding);
NIR_PASS_V(nir, nir_copy_prop);
NIR_PASS_V(nir, nir_opt_dce);
NIR_PASS_V(nir, nir_opt_cse);
}
/* cleanup passes */
nir_lower_load_const_to_scalar(nir);
nir_opt_shrink_load(nir);
nir_move_options move_opts = (nir_move_options)(
nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | nir_move_comparisons);
nir_opt_sink(nir, move_opts);
nir_opt_move(nir, move_opts);
nir_convert_to_lcssa(nir, true, false);
nir_lower_phis_to_scalar(nir);
nir_function_impl *func = nir_shader_get_entrypoint(nir);
nir_index_ssa_defs(func);
nir_metadata_require(func, nir_metadata_block_index);
if (args->options->dump_preoptir) {
fprintf(stderr, "NIR shader before instruction selection:\n");
nir_print_shader(nir, stderr);
}
for (unsigned i = 0; i < shader_count; i++)
scratch_size = std::max(scratch_size, shaders[i]->scratch_size);
}
unsigned scratch_size = 0;
for (unsigned i = 0; i < shader_count; i++)
scratch_size = std::max(scratch_size, shaders[i]->scratch_size);
ctx.program->config->scratch_bytes_per_wave = align(scratch_size * ctx.program->wave_size, 1024);
ctx.block = ctx.program->create_and_insert_block();

src/amd/compiler/aco_interface.cpp

@@ -65,7 +65,10 @@ void aco_compile_shader(unsigned shader_count,
std::unique_ptr<aco::Program> program{new aco::Program};
/* Instruction Selection */
aco::select_program(program.get(), shader_count, shaders, &config, args);
if (args->is_gs_copy_shader)
aco::select_gs_copy_shader(program.get(), shaders[0], &config, args);
else
aco::select_program(program.get(), shader_count, shaders, &config, args);
if (args->options->dump_preoptir) {
std::cerr << "After Instruction Selection:\n";
aco_print_program(program.get(), stderr);
@@ -162,7 +165,7 @@ void aco_compile_shader(unsigned shader_count,
legacy_binary->base.type = RADV_BINARY_TYPE_LEGACY;
legacy_binary->base.stage = shaders[shader_count-1]->info.stage;
legacy_binary->base.is_gs_copy_shader = false;
legacy_binary->base.is_gs_copy_shader = args->is_gs_copy_shader;
legacy_binary->base.total_size = size;
memcpy(legacy_binary->data, code.data(), code.size() * sizeof(uint32_t));
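
On the caller's side, the same entry point now serves both paths; a hedged sketch of how RADV might request the copy shader (field names as used above, call-site details assumed):

    /* hypothetical call site: build the copy shader for a legacy
     * (non-NGG) GS, passing the GS's NIR as the single input shader */
    struct radv_shader_args args = {};
    args.is_gs_copy_shader = true;
    args.options = options;        /* radv_nir_compiler_options */
    args.shader_info = &gs_info;   /* the GS's radv_shader_info */

    struct radv_shader_binary *binary = NULL;
    aco_compile_shader(1, &gs_nir, &binary, &args);
    /* the resulting binary records is_gs_copy_shader from args */
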

src/amd/compiler/aco_ir.h

@@ -1106,23 +1106,25 @@ static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_mask = 0x3f;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;
/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 6;
static constexpr Stage hw_es = 1 << 7; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 8;
static constexpr Stage hw_ls = 1 << 9; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 10;
static constexpr Stage hw_fs = 1 << 11;
static constexpr Stage hw_cs = 1 << 12;
static constexpr Stage hw_mask = 0x7f << 6;
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* not on GFX9. combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9;
static constexpr Stage hw_ls = 1 << 10; /* not on GFX9. combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 11;
static constexpr Stage hw_fs = 1 << 12;
static constexpr Stage hw_cs = 1 << 13;
static constexpr Stage hw_mask = 0x7f << 7;
/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
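
Since sw_gs_copy takes bit 6, every hardware-stage bit above shifts up by one and both masks widen accordingly; a quick standalone check of the new layout (simplified Stage typedef, not part of the patch):

    #include <cstdint>

    typedef uint16_t Stage;   /* wide enough for bits 0..13 */

    constexpr Stage sw_gs_copy = 1 << 6;
    constexpr Stage sw_mask    = 0x7f;        /* software stages: bits 0..6 */
    constexpr Stage hw_vs      = 1 << 7;
    constexpr Stage hw_mask    = 0x7f << 7;   /* hardware stages: bits 7..13 */

    /* the GS copy shader runs on the hardware VS stage */
    constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;   /* 0xc0 */

    static_assert((sw_mask & hw_mask) == 0, "sw/hw bit ranges must not overlap");
    static_assert((gs_copy_vs & sw_mask) == sw_gs_copy, "sw bits round-trip");
    static_assert((gs_copy_vs & hw_mask) == hw_vs, "hw bits round-trip");
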
@@ -1219,6 +1221,9 @@ void select_program(Program *program,
struct nir_shader *const *shaders,
ac_shader_config* config,
struct radv_shader_args *args);
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
ac_shader_config* config,
struct radv_shader_args *args);
void lower_wqm(Program* program, live& live_vars,
const struct radv_nir_compiler_options *options);