aco: Use strong typing to model SW<->HW stage mappings

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Acked-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7094>
Tony Wasserka 2020-10-05 17:50:37 +02:00 committed by Marge Bot
parent fdbc45d1d4
commit 86c227c10c
8 changed files with 174 additions and 115 deletions
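
The pattern applied throughout the diff: the old Stage type was a single uint16_t bitmask mixing software- and hardware-stage bits, so every query was a bitwise AND; the new Stage struct (see the aco_ir.h hunks below) keeps an SWStage mask plus a single HWStage enum value. A minimal before/after sketch, using only identifiers from this commit:

    /* before: SW and HW stage bits share one bitmask */
    if (program->stage & (hw_vs | hw_ngg_gs)) { /* ... */ }

    /* after: the HW stage is one enum value, SW stages remain a mask */
    if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG_GS) { /* ... */ }
    if (program->stage.has(SWStage::VS)) { /* ... */ }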

View File

@@ -732,7 +732,7 @@ void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
 {
 if ((*it)->format == Format::EXP) {
 Export_instruction* exp = static_cast<Export_instruction*>((*it).get());
-if (program->stage & (hw_vs | hw_ngg_gs)) {
+if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG_GS) {
 if (exp->dest >= V_008DFC_SQ_EXP_POS && exp->dest <= (V_008DFC_SQ_EXP_POS + 3)) {
 exp->done = true;
 exported = true;
@@ -752,7 +752,8 @@ void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
 if (!exported) {
 /* Abort in order to avoid a GPU hang. */
-aco_err(program, "Missing export in %s shader:", (program->stage & (hw_vs | hw_ngg_gs)) ? "vertex or NGG" : "fragment");
+bool is_vertex_or_ngg = (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG_GS);
+aco_err(program, "Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment");
 aco_print_program(program, stderr);
 abort();
 }
@@ -917,7 +918,9 @@ unsigned emit_program(Program* program,
 {
 asm_context ctx(program);
-if (program->stage & (hw_vs | hw_fs | hw_ngg_gs))
+if (program->stage.hw == HWStage::VS ||
+    program->stage.hw == HWStage::FS ||
+    program->stage.hw == HWStage::NGG_GS)
 fix_exports(ctx, code, program);
 for (Block& block : program->blocks) {

View File

@@ -379,7 +379,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
 bld.insert(std::move(startpgm));
 /* exec seems to need to be manually initialized with combined shaders */
-if (util_bitcount(ctx.program->stage & sw_mask) > 1 || (ctx.program->stage & hw_ngg_gs)) {
+if (ctx.program->stage.num_sw_stages() > 1 || ctx.program->stage.hw == HWStage::NGG_GS) {
 bld.sop1(Builder::s_mov, bld.exec(Definition(exec_mask)), bld.lm == s2 ? Operand(UINT64_MAX) : Operand(UINT32_MAX));
 instructions[0]->definitions.pop_back();
 }
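
Here util_bitcount(stage & sw_mask) becomes Stage::num_sw_stages(), which counts the software stages merged into this shader. An illustrative check, assuming the Stage constants from the aco_ir.h hunk below:

    /* vertex_geometry_gs merges SWStage::VS and SWStage::GS into one HW stage */
    assert(vertex_geometry_gs.num_sw_stages() == 2); /* combined shader: init exec manually */
    assert(fragment_fs.num_sw_stages() == 1);        /* single-stage shader */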

View File

@@ -4252,7 +4252,7 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
 if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs ||
     ctx->stage == ngg_vertex_geometry_gs || ctx->stage == ngg_tess_eval_geometry_gs) {
 /* GFX9+: ES stage is merged into GS, data is passed between them using LDS. */
-unsigned itemsize = (ctx->stage & sw_vs)
+unsigned itemsize = ctx->stage.has(SWStage::VS)
     ? ctx->program->info->vs.es_info.esgs_itemsize
     : ctx->program->info->tes.es_info.esgs_itemsize;
 Temp vertex_idx = thread_id_in_threadgroup(ctx);
@@ -4363,9 +4363,9 @@ void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
 isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
 abort();
 }
-} else if ((ctx->stage & (hw_ls | hw_es)) ||
+} else if ((ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::ES) ||
     (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
-    ((ctx->stage & sw_gs) && ctx->shader->info.stage != MESA_SHADER_GEOMETRY)) {
+    (ctx->stage.has(SWStage::GS) && ctx->shader->info.stage != MESA_SHADER_GEOMETRY)) {
 visit_store_ls_or_es_output(ctx, instr);
 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
 visit_store_tcs_output(ctx, instr, false);
@@ -7588,7 +7588,10 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
 break;
 }
 case nir_intrinsic_load_view_index: {
-if (ctx->stage & (sw_vs | sw_gs | sw_tcs | sw_tes)) {
+if (ctx->stage.has(SWStage::VS) ||
+    ctx->stage.has(SWStage::GS) ||
+    ctx->stage.has(SWStage::TCS) ||
+    ctx->stage.has(SWStage::TES)) {
 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
 break;
@@ -8348,21 +8351,21 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
 break;
 }
 case nir_intrinsic_emit_vertex_with_counter: {
-if (ctx->stage & hw_ngg_gs)
+if (ctx->stage.hw == HWStage::NGG_GS)
 ngg_visit_emit_vertex_with_counter(ctx, instr);
 else
 visit_emit_vertex_with_counter(ctx, instr);
 break;
 }
 case nir_intrinsic_end_primitive_with_counter: {
-if ((ctx->stage & hw_ngg_gs) == 0) {
+if (ctx->stage.hw != HWStage::NGG_GS) {
 unsigned stream = nir_intrinsic_stream_id(instr);
 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream));
 }
 break;
 }
 case nir_intrinsic_set_vertex_and_primitive_count: {
-if (ctx->stage & hw_ngg_gs)
+if (ctx->stage.hw == HWStage::NGG_GS)
 ngg_visit_set_vertex_and_primitive_count(ctx, instr);
 /* unused in the legacy pipeline, the HW keeps track of this for us */
 break;
@@ -10079,9 +10082,9 @@ static bool visit_cf_list(isel_context *ctx,
 static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
 {
-assert(ctx->stage & (hw_vs | hw_ngg_gs));
+assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG_GS);
-int offset = ((ctx->stage & sw_tes) && !(ctx->stage & sw_gs))
+int offset = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
     ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
     : ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
 uint64_t mask = ctx->outputs.mask[slot];
@@ -10176,15 +10179,15 @@ static void create_export_phis(isel_context *ctx)
 static void create_vs_exports(isel_context *ctx)
 {
-assert(ctx->stage & (hw_vs | hw_ngg_gs));
+assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG_GS);
-radv_vs_output_info *outinfo = ((ctx->stage & sw_tes) && !(ctx->stage & sw_gs))
+radv_vs_output_info *outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
     ? &ctx->program->info->tes.outinfo
     : &ctx->program->info->vs.outinfo;
-if (outinfo->export_prim_id && !(ctx->stage & hw_ngg_gs)) {
+if (outinfo->export_prim_id && ctx->stage.hw != HWStage::NGG_GS) {
 ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
-if (ctx->stage & sw_tes)
+if (ctx->stage.has(SWStage::TES))
 ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->ac.tes_patch_id);
 else
 ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->vs_prim_id);
@@ -10646,7 +10649,7 @@ static void emit_stream_output(isel_context *ctx,
 Temp out[4];
 bool all_undef = true;
-assert(ctx->stage & hw_vs);
+assert(ctx->stage.hw == HWStage::VS);
 for (unsigned i = 0; i < num_comps; i++) {
 out[i] = ctx->outputs.temps[loc * 4 + start + i];
 all_undef = all_undef && !out[i].id();
@@ -11055,7 +11058,7 @@ void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx, Temp vtx_cnt = Temp(), Tem
 /* VS/TES: we infer the vertex and primitive count from arguments
  * GS: the caller needs to supply them
  */
-assert((ctx->stage & sw_gs)
+assert(ctx->stage.has(SWStage::GS)
     ? (vtx_cnt.id() && prm_cnt.id())
     : (!vtx_cnt.id() && !prm_cnt.id()));
@@ -11117,7 +11120,7 @@ void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive
 Builder bld(ctx->program, ctx->block);
 Temp prim_exp_arg;
-if (!(ctx->stage & sw_gs) && ctx->args->options->key.vs_common_out.as_ngg_passthrough)
+if (!ctx->stage.has(SWStage::GS) && ctx->args->options->key.vs_common_out.as_ngg_passthrough)
 prim_exp_arg = get_arg(ctx, ctx->args->gs_vtx_offset[0]);
 else
 prim_exp_arg = ngg_pack_prim_exp_arg(ctx, num_vertices_per_primitive, vtxindex, is_null);
@@ -11144,7 +11147,7 @@ void ngg_nogs_export_primitives(isel_context *ctx)
 constexpr unsigned max_vertices_per_primitive = 3;
 unsigned num_vertices_per_primitive = max_vertices_per_primitive;
-assert(!(ctx->stage & sw_gs));
+assert(!ctx->stage.has(SWStage::GS));
 if (ctx->stage == ngg_vertex_gs) {
 /* TODO: optimize for points & lines */
@@ -11711,10 +11714,10 @@ void select_program(Program *program,
 visit_cf_list(&ctx, &func->body);
-if (ctx.program->info->so.num_outputs && (ctx.stage & hw_vs))
+if (ctx.program->info->so.num_outputs && ctx.stage.hw == HWStage::VS)
 emit_streamout(&ctx, 0);
-if (ctx.stage & hw_vs) {
+if (ctx.stage.hw == HWStage::VS) {
 create_vs_exports(&ctx);
 ctx.block->kind |= block_kind_export_end;
 } else if (ngg_no_gs && ctx.ngg_nogs_early_prim_export) {
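
Note the two distinct queries this file now uses: ctx->stage.hw == HWStage::NGG_GS asks whether the shader runs on the NGG hardware path, while ctx->stage.has(SWStage::GS) asks whether an API geometry shader is part of the merged shader. The two are independent; a sketch of the combinations, assuming the constants from the aco_ir.h hunk below:

    /* NGG VS without an API GS: hw is NGG_GS, but no SWStage::GS */
    assert(ngg_vertex_gs.hw == HWStage::NGG_GS && !ngg_vertex_gs.has(SWStage::GS));
    /* GFX9 legacy VS+GS: API GS present, but hw is the legacy GS stage */
    assert(vertex_geometry_gs.has(SWStage::GS) && vertex_geometry_gs.hw == HWStage::GS);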

View File

@@ -61,7 +61,7 @@ struct isel_context {
 Block *block;
 std::unique_ptr<Temp[]> allocated;
 std::unordered_map<unsigned, std::array<Temp,NIR_MAX_VEC_COMPONENTS>> allocated_vec;
-Stage stage; /* Stage */
+Stage stage;
 bool has_gfx10_wave64_bpermute = false;
 struct {
 bool has_branch;

View File

@@ -435,7 +435,7 @@ setup_vs_variables(isel_context *ctx, nir_shader *nir)
 ctx->options->key.vs_common_out.export_clip_dists, outinfo);
 /* TODO: NGG streamout */
-if (ctx->stage & hw_ngg_gs)
+if (ctx->stage.hw == HWStage::NGG_GS)
 assert(!ctx->args->shader_info->so.num_outputs);
 /* TODO: check if the shader writes edge flags (not in Vulkan) */
@@ -481,9 +481,9 @@ void setup_gs_variables(isel_context *ctx, nir_shader *nir)
 ctx->ngg_gs_early_alloc = ctx->ngg_gs_const_vtxcnt[0] == nir->info.gs.vertices_out && ctx->ngg_gs_const_prmcnt[0] != -1;
 }
-if (ctx->stage & sw_vs)
+if (ctx->stage.has(SWStage::VS))
 ctx->program->info->gs.es_type = MESA_SHADER_VERTEX;
-else if (ctx->stage & sw_tes)
+else if (ctx->stage.has(SWStage::TES))
 ctx->program->info->gs.es_type = MESA_SHADER_TESS_EVAL;
 }
@@ -550,7 +550,7 @@ setup_tes_variables(isel_context *ctx, nir_shader *nir)
 ctx->options->key.vs_common_out.export_clip_dists, outinfo);
 /* TODO: NGG streamout */
-if (ctx->stage & hw_ngg_gs)
+if (ctx->stage.hw == HWStage::NGG_GS)
 assert(!ctx->args->shader_info->so.num_outputs);
 /* Tess eval shaders can't write edge flags, so this can be always true. */
@@ -644,7 +644,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
 /* we'll need this for isel */
 nir_metadata_require(impl, nir_metadata_block_index);
-if (!(ctx->stage & sw_gs_copy) && ctx->options->dump_preoptir) {
+if (!ctx->stage.has(SWStage::GSCopy) && ctx->options->dump_preoptir) {
 fprintf(stderr, "NIR shader before instruction selection:\n");
 nir_print_shader(shader, stderr);
 }
@@ -1022,26 +1022,26 @@ setup_isel_context(Program* program,
 struct radv_shader_args *args,
 bool is_gs_copy_shader)
 {
-Stage stage = 0;
+SWStage sw_stage = SWStage::None;
 for (unsigned i = 0; i < shader_count; i++) {
 switch (shaders[i]->info.stage) {
 case MESA_SHADER_VERTEX:
-stage |= sw_vs;
+sw_stage = sw_stage | SWStage::VS;
 break;
 case MESA_SHADER_TESS_CTRL:
-stage |= sw_tcs;
+sw_stage = sw_stage | SWStage::TCS;
 break;
 case MESA_SHADER_TESS_EVAL:
-stage |= sw_tes;
+sw_stage = sw_stage | SWStage::TES;
 break;
 case MESA_SHADER_GEOMETRY:
-stage |= is_gs_copy_shader ? sw_gs_copy : sw_gs;
+sw_stage = sw_stage | (is_gs_copy_shader ? SWStage::GSCopy : SWStage::GS);
 break;
 case MESA_SHADER_FRAGMENT:
-stage |= sw_fs;
+sw_stage = sw_stage | SWStage::FS;
 break;
 case MESA_SHADER_COMPUTE:
-stage |= sw_cs;
+sw_stage = sw_stage | SWStage::CS;
 break;
 default:
 unreachable("Shader stage not implemented");
@@ -1049,44 +1049,45 @@ setup_isel_context(Program* program,
 }
 bool gfx9_plus = args->options->chip_class >= GFX9;
 bool ngg = args->shader_info->is_ngg && args->options->chip_class >= GFX10;
-if (stage == sw_vs && args->shader_info->vs.as_es && !ngg)
-   stage |= hw_es;
-else if (stage == sw_vs && !args->shader_info->vs.as_ls && !ngg)
-   stage |= hw_vs;
-else if (stage == sw_vs && ngg)
-   stage |= hw_ngg_gs; /* GFX10/NGG: VS without GS uses the HW GS stage */
-else if (stage == sw_gs)
-   stage |= hw_gs;
-else if (stage == sw_fs)
-   stage |= hw_fs;
-else if (stage == sw_cs)
-   stage |= hw_cs;
-else if (stage == sw_gs_copy)
-   stage |= hw_vs;
-else if (stage == (sw_vs | sw_gs) && gfx9_plus && !ngg)
-   stage |= hw_gs; /* GFX6-9: VS+GS merged into a GS (and GFX10/legacy) */
-else if (stage == (sw_vs | sw_gs) && ngg)
-   stage |= hw_ngg_gs; /* GFX10+: VS+GS merged into an NGG GS */
-else if (stage == sw_vs && args->shader_info->vs.as_ls)
-   stage |= hw_ls; /* GFX6-8: VS is a Local Shader, when tessellation is used */
-else if (stage == sw_tcs)
-   stage |= hw_hs; /* GFX6-8: TCS is a Hull Shader */
-else if (stage == (sw_vs | sw_tcs))
-   stage |= hw_hs; /* GFX9-10: VS+TCS merged into a Hull Shader */
-else if (stage == sw_tes && !args->shader_info->tes.as_es && !ngg)
-   stage |= hw_vs; /* GFX6-9: TES without GS uses the HW VS stage (and GFX10/legacy) */
-else if (stage == sw_tes && !args->shader_info->tes.as_es && ngg)
-   stage |= hw_ngg_gs; /* GFX10/NGG: TES without GS uses the HW GS stage */
-else if (stage == sw_tes && args->shader_info->tes.as_es && !ngg)
-   stage |= hw_es; /* GFX6-8: TES is an Export Shader */
-else if (stage == (sw_tes | sw_gs) && gfx9_plus && !ngg)
-   stage |= hw_gs; /* GFX9: TES+GS merged into a GS (and GFX10/legacy) */
-else if (stage == (sw_tes | sw_gs) && ngg)
-   stage |= hw_ngg_gs; /* GFX10+: TES+GS merged into an NGG GS */
+HWStage hw_stage { };
+if (sw_stage == SWStage::VS && args->shader_info->vs.as_es && !ngg)
+   hw_stage = HWStage::ES;
+else if (sw_stage == SWStage::VS && !args->shader_info->vs.as_ls && !ngg)
+   hw_stage = HWStage::VS;
+else if (sw_stage == SWStage::VS && ngg)
+   hw_stage = HWStage::NGG_GS; /* GFX10/NGG: VS without GS uses the HW GS stage */
+else if (sw_stage == SWStage::GS)
+   hw_stage = HWStage::GS;
+else if (sw_stage == SWStage::FS)
+   hw_stage = HWStage::FS;
+else if (sw_stage == SWStage::CS)
+   hw_stage = HWStage::CS;
+else if (sw_stage == SWStage::GSCopy)
+   hw_stage = HWStage::VS;
+else if (sw_stage == SWStage::VS_GS && gfx9_plus && !ngg)
+   hw_stage = HWStage::GS; /* GFX6-9: VS+GS merged into a GS (and GFX10/legacy) */
+else if (sw_stage == SWStage::VS_GS && ngg)
+   hw_stage = HWStage::NGG_GS; /* GFX10+: VS+GS merged into an NGG GS */
+else if (sw_stage == SWStage::VS && args->shader_info->vs.as_ls)
+   hw_stage = HWStage::LS; /* GFX6-8: VS is a Local Shader, when tessellation is used */
+else if (sw_stage == SWStage::TCS)
+   hw_stage = HWStage::HS; /* GFX6-8: TCS is a Hull Shader */
+else if (sw_stage == SWStage::VS_TCS)
+   hw_stage = HWStage::HS; /* GFX9-10: VS+TCS merged into a Hull Shader */
+else if (sw_stage == SWStage::TES && !args->shader_info->tes.as_es && !ngg)
+   hw_stage = HWStage::VS; /* GFX6-9: TES without GS uses the HW VS stage (and GFX10/legacy) */
+else if (sw_stage == SWStage::TES && !args->shader_info->tes.as_es && ngg)
+   hw_stage = HWStage::NGG_GS; /* GFX10/NGG: TES without GS uses the HW GS stage */
+else if (sw_stage == SWStage::TES && args->shader_info->tes.as_es && !ngg)
+   hw_stage = HWStage::ES; /* GFX6-8: TES is an Export Shader */
+else if (sw_stage == SWStage::TES_GS && gfx9_plus && !ngg)
+   hw_stage = HWStage::GS; /* GFX9: TES+GS merged into a GS (and GFX10/legacy) */
+else if (sw_stage == SWStage::TES_GS && ngg)
+   hw_stage = HWStage::NGG_GS; /* GFX10+: TES+GS merged into an NGG GS */
 else
 unreachable("Shader stage not implemented");
-init_program(program, stage, args->shader_info,
+init_program(program, Stage { hw_stage, sw_stage }, args->shader_info,
     args->options->chip_class, args->options->family, config);
 isel_context ctx = {};
@@ -1096,7 +1097,7 @@ setup_isel_context(Program* program,
 ctx.stage = program->stage;
 /* TODO: Check if we need to adjust min_waves for unknown workgroup sizes. */
-if (program->stage & (hw_vs | hw_fs)) {
+if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS) {
 /* PS and legacy VS have separate waves, no workgroups */
 program->workgroup_size = program->wave_size;
 } else if (program->stage == compute_cs) {
@@ -1104,10 +1105,10 @@ setup_isel_context(Program* program,
 program->workgroup_size = shaders[0]->info.cs.local_size[0] *
     shaders[0]->info.cs.local_size[1] *
     shaders[0]->info.cs.local_size[2];
-} else if ((program->stage & hw_es) || program->stage == geometry_gs) {
+} else if (program->stage.hw == HWStage::ES || program->stage == geometry_gs) {
 /* Unmerged ESGS operate in workgroups if on-chip GS (LDS rings) are enabled on GFX7-8 (not implemented in Mesa) */
 program->workgroup_size = program->wave_size;
-} else if (program->stage & hw_gs) {
+} else if (program->stage.hw == HWStage::GS) {
 /* If on-chip GS (LDS rings) are enabled on GFX9 or later, merged GS operates in workgroups */
 assert(program->chip_class >= GFX9);
 uint32_t es_verts_per_subgrp = G_028A44_ES_VERTS_PER_SUBGRP(program->info->gs_ring_info.vgt_gs_onchip_cntl);
@@ -1125,7 +1126,7 @@ setup_isel_context(Program* program,
 /* Merged LSHS operates in workgroups, but can still have a different number of LS and HS invocations */
 setup_tcs_info(&ctx, shaders[1], shaders[0]);
 program->workgroup_size = ctx.tcs_num_patches * MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices);
-} else if (program->stage & hw_ngg_gs) {
+} else if (program->stage.hw == HWStage::NGG_GS) {
 gfx10_ngg_info &ngg_info = args->shader_info->ngg_info;
 /* Max ES (SW VS) threads */
View File

@@ -1500,50 +1500,101 @@ struct Block {
 Block() : index(0) {}
 };
-using Stage = uint16_t;
+/*
+ * Shader stages as provided in Vulkan by the application. Contrast this to HWStage.
+ */
+enum class SWStage : uint8_t {
+   None = 0,
+   VS = 1 << 0, /* Vertex Shader */
+   GS = 1 << 1, /* Geometry Shader */
+   TCS = 1 << 2, /* Tessellation Control aka Hull Shader */
+   TES = 1 << 3, /* Tessellation Evaluation aka Domain Shader */
+   FS = 1 << 4, /* Fragment aka Pixel Shader */
+   CS = 1 << 5, /* Compute Shader */
+   GSCopy = 1 << 6, /* GS Copy Shader (internal) */
-/* software stages */
-static constexpr Stage sw_vs = 1 << 0;
-static constexpr Stage sw_gs = 1 << 1;
-static constexpr Stage sw_tcs = 1 << 2;
-static constexpr Stage sw_tes = 1 << 3;
-static constexpr Stage sw_fs = 1 << 4;
-static constexpr Stage sw_cs = 1 << 5;
-static constexpr Stage sw_gs_copy = 1 << 6;
-static constexpr Stage sw_mask = 0x7f;
+   /* Stage combinations merged to run on a single HWStage */
+   VS_GS = VS | GS,
+   VS_TCS = VS | TCS,
+   TES_GS = TES | GS,
+};
-/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
-static constexpr Stage hw_vs = 1 << 7;
-static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
-static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
-static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
-static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
-static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
-static constexpr Stage hw_fs = 1 << 13;
-static constexpr Stage hw_cs = 1 << 14;
-static constexpr Stage hw_mask = 0xff << 7;
+constexpr SWStage operator|(SWStage a, SWStage b) {
+   return static_cast<SWStage>(static_cast<uint8_t>(a) | static_cast<uint8_t>(b));
+}
+/*
+ * Shader stages as running on the AMD GPU.
+ *
+ * The relation between HWStages and SWStages is not a one-to-one mapping:
+ * Some SWStages are merged by ACO to run on a single HWStage.
+ * See README.md for details.
+ */
+enum class HWStage : uint8_t {
+   VS,
+   ES, /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
+   GS, /* Geometry shader on GFX10/legacy and GFX6-9. */
+   NGG_GS, /* Geometry shader on GFX10/NGG. */
+   LS, /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
+   HS, /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
+   FS,
+   CS,
+};
+/*
+ * Set of SWStages to be merged into a single shader paired with the
+ * HWStage it will run on.
+ */
+struct Stage {
+   constexpr Stage() = default;
+   explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) { }
+   /* Check if the given SWStage is included */
+   constexpr bool has(SWStage stage) const {
+      return (static_cast<uint8_t>(sw) & static_cast<uint8_t>(stage));
+   }
+   unsigned num_sw_stages() const {
+      return util_bitcount(static_cast<uint8_t>(sw));
+   }
+   constexpr bool operator==(const Stage& other) const {
+      return sw == other.sw && hw == other.hw;
+   }
+   constexpr bool operator!=(const Stage& other) const {
+      return sw != other.sw || hw != other.hw;
+   }
+   /* Mask of merged software stages */
+   SWStage sw = SWStage::None;
+   /* Active hardware stage */
+   HWStage hw {};
+};
 /* possible settings of Program::stage */
-static constexpr Stage vertex_vs = sw_vs | hw_vs;
-static constexpr Stage fragment_fs = sw_fs | hw_fs;
-static constexpr Stage compute_cs = sw_cs | hw_cs;
-static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
-static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
+static constexpr Stage vertex_vs(HWStage::VS, SWStage::VS);
+static constexpr Stage fragment_fs(HWStage::FS, SWStage::FS);
+static constexpr Stage compute_cs(HWStage::CS, SWStage::CS);
+static constexpr Stage tess_eval_vs(HWStage::VS, SWStage::TES);
+static constexpr Stage gs_copy_vs(HWStage::VS, SWStage::GSCopy);
 /* GFX10/NGG */
-static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
-static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
-static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
-static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
+static constexpr Stage ngg_vertex_gs(HWStage::NGG_GS, SWStage::VS);
+static constexpr Stage ngg_vertex_geometry_gs(HWStage::NGG_GS, SWStage::VS_GS);
+static constexpr Stage ngg_tess_eval_gs(HWStage::NGG_GS, SWStage::TES);
+static constexpr Stage ngg_tess_eval_geometry_gs(HWStage::NGG_GS, SWStage::TES_GS);
 /* GFX9 (and GFX10 if NGG isn't used) */
-static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
-static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
-static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
+static constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
+static constexpr Stage vertex_tess_control_hs(HWStage::HS, SWStage::VS_TCS);
+static constexpr Stage tess_eval_geometry_gs(HWStage::GS, SWStage::TES_GS);
 /* pre-GFX9 */
-static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
-static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
-static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
-static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
-static constexpr Stage geometry_gs = sw_gs | hw_gs;
+static constexpr Stage vertex_ls(HWStage::LS, SWStage::VS); /* vertex before tesselation control */
+static constexpr Stage vertex_es(HWStage::ES, SWStage::VS); /* vertex before geometry */
+static constexpr Stage tess_control_hs(HWStage::HS, SWStage::TCS);
+static constexpr Stage tess_eval_es(HWStage::ES, SWStage::TES); /* tesselation evaluation before geometry */
+static constexpr Stage geometry_gs(HWStage::GS, SWStage::GS);
 enum statistic {
 statistic_hash,
@@ -1574,7 +1625,7 @@ public:
 enum radeon_family family;
 unsigned wave_size;
 RegClass lane_mask;
-Stage stage; /* Stage */
+Stage stage;
 bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
 bool needs_wqm = false; /* there exists a p_wqm instruction */
 bool wb_smem_l1_on_end = false;
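
Taken together, a standalone sketch of how the new types behave. This is an approximation for illustration only: util_bitcount is replaced by GCC/Clang's __builtin_popcount and just a subset of the stages is reproduced, but the Stage logic mirrors the aco_ir.h hunk above.

    #include <cassert>
    #include <cstdint>

    enum class SWStage : uint8_t {
       None = 0,
       VS = 1 << 0,
       GS = 1 << 1,
       VS_GS = VS | GS, /* legal: enumerators have the underlying type until the closing brace */
    };
    enum class HWStage : uint8_t { VS, GS, NGG_GS };

    struct Stage {
       constexpr Stage() = default;
       explicit constexpr Stage(HWStage hw_, SWStage sw_) : sw(sw_), hw(hw_) { }
       constexpr bool has(SWStage s) const {
          return static_cast<uint8_t>(sw) & static_cast<uint8_t>(s);
       }
       unsigned num_sw_stages() const { return __builtin_popcount(static_cast<uint8_t>(sw)); }
       constexpr bool operator==(const Stage& o) const { return sw == o.sw && hw == o.hw; }
       SWStage sw = SWStage::None;
       HWStage hw {};
    };

    int main() {
       constexpr Stage vertex_geometry_gs(HWStage::GS, SWStage::VS_GS);
       assert(vertex_geometry_gs.has(SWStage::VS));     /* SW queries test mask membership */
       assert(vertex_geometry_gs.hw == HWStage::GS);    /* HW queries compare a single value */
       assert(vertex_geometry_gs.num_sw_stages() == 2); /* two SW stages merged into one shader */
       /* Unlike the old bitmask, mixing the two kinds of stage no longer
        * type-checks: e.g. "vertex_geometry_gs.hw == SWStage::VS" will not compile. */
       return 0;
    }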

View File

@@ -1726,7 +1726,7 @@ void lower_to_hw_instr(Program* program)
 /* don't bother with an early exit near the end of the program */
 if ((block->instructions.size() - 1 - j) <= 4 &&
     block->instructions.back()->opcode == aco_opcode::s_endpgm) {
-unsigned null_exp_dest = (ctx.program->stage & hw_fs) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
+unsigned null_exp_dest = (ctx.program->stage.hw == HWStage::FS) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
 bool ignore_early_exit = true;
 for (unsigned k = j + 1; k < block->instructions.size(); ++k) {

View File

@@ -862,7 +862,8 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v
 }
 }
-if ((program->stage & (hw_vs | hw_ngg_gs)) && (block->kind & block_kind_export_end)) {
+if ((program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG_GS) &&
+    (block->kind & block_kind_export_end)) {
 /* Try to move position exports as far up as possible, to reduce register
  * usage and because ISA reference guides say so. */
 for (unsigned idx = 0; idx < block->instructions.size(); idx++) {