aco/nops: add State

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12720>
This commit is contained in:
Rhys Perry 2021-09-03 15:29:08 +01:00 committed by Marge Bot
parent bdf7eed045
commit a8cc911aaf
1 changed file with 36 additions and 30 deletions

View File

@@ -33,6 +33,12 @@
namespace aco {
namespace {
struct State {
Program* program;
Block* block;
std::vector<aco_ptr<Instruction>> old_instructions;
};
struct NOP_ctx_gfx6 {
void join(const NOP_ctx_gfx6& other)
{
@@ -228,8 +234,7 @@ handle_raw_hazard_instr(aco_ptr<Instruction>& pred, PhysReg reg, int* nops_neede
template <bool Valu, bool Vintrp, bool Salu>
int
handle_raw_hazard_internal(Program* program, Block* block, int nops_needed, PhysReg reg,
uint32_t mask)
handle_raw_hazard_internal(State& state, Block* block, int nops_needed, PhysReg reg, uint32_t mask)
{
for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) {
if (handle_raw_hazard_instr<Valu, Vintrp, Salu>(block->instructions[pred_idx], reg,
@@ -244,19 +249,19 @@ handle_raw_hazard_internal(Program* program, Block* block, int nops_needed, Phys
* huge value. */
for (unsigned lin_pred : block->linear_preds) {
res = std::max(res, handle_raw_hazard_internal<Valu, Vintrp, Salu>(
program, &program->blocks[lin_pred], nops_needed, reg, mask));
state, &state.program->blocks[lin_pred], nops_needed, reg, mask));
}
return res;
}
template <bool Valu, bool Vintrp, bool Salu>
void
handle_raw_hazard(Program* program, Block* cur_block, int* NOPs, int min_states, Operand op)
handle_raw_hazard(State& state, int* NOPs, int min_states, Operand op)
{
if (*NOPs >= min_states)
return;
int res = handle_raw_hazard_internal<Valu, Vintrp, Salu>(
program, cur_block, min_states, op.physReg(), u_bit_consecutive(0, op.size()));
state, state.block, min_states, op.physReg(), u_bit_consecutive(0, op.size()));
*NOPs = MAX2(*NOPs, res);
}
@@ -332,15 +337,14 @@ handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruct
/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
void
handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
aco_ptr<Instruction>& instr,
handle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
std::vector<aco_ptr<Instruction>>& new_instructions)
{
/* check hazards */
int NOPs = 0;
if (instr->isSMEM()) {
if (program->chip_class == GFX6) {
if (state.program->chip_class == GFX6) {
/* A read of an SGPR by SMRD instruction requires 4 wait states
* when the SGPR was written by a VALU instruction. According to LLVM,
* there is also an undocumented hardware behavior when the buffer
@@ -352,13 +356,13 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
bool is_buffer_desc = i == 0 && op.size() > 2;
if (is_buffer_desc)
handle_valu_salu_then_read_hazard(program, cur_block, &NOPs, 4, op);
handle_valu_salu_then_read_hazard(state, &NOPs, 4, op);
else
handle_valu_then_read_hazard(program, cur_block, &NOPs, 4, op);
handle_valu_then_read_hazard(state, &NOPs, 4, op);
}
}
handle_smem_clause_hazards(program, ctx, instr, &NOPs);
handle_smem_clause_hazards(state.program, ctx, instr, &NOPs);
} else if (instr->isSALU()) {
if (instr->opcode == aco_opcode::s_setreg_b32 ||
instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
@@ -366,7 +370,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
}
if (program->chip_class == GFX9) {
if (state.program->chip_class == GFX9) {
if (instr->opcode == aco_opcode::s_movrels_b32 ||
instr->opcode == aco_opcode::s_movrels_b64 ||
instr->opcode == aco_opcode::s_movreld_b32 ||
@@ -389,7 +393,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
if (instr->isDPP()) {
NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
handle_valu_then_read_hazard(program, cur_block, &NOPs, 2, instr->operands[0]);
handle_valu_then_read_hazard(state, &NOPs, 2, instr->operands[0]);
}
for (Definition def : instr->definitions) {
@@ -404,7 +408,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
instr->opcode == aco_opcode::v_writelane_b32 ||
instr->opcode == aco_opcode::v_writelane_b32_e64) &&
!instr->operands[1].isConstant()) {
handle_valu_then_read_hazard(program, cur_block, &NOPs, 4, instr->operands[1]);
handle_valu_then_read_hazard(state, &NOPs, 4, instr->operands[1]);
}
/* It's required to insert 1 wait state if the dst VGPR of any v_interp_*
@@ -412,10 +416,10 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
* hangs on GFX6. Note that v_writelane_* is apparently not affected.
* This hazard isn't documented anywhere but AMD confirmed that hazard.
*/
if (program->chip_class == GFX6 &&
if (state.program->chip_class == GFX6 &&
(instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */
instr->opcode == aco_opcode::v_readfirstlane_b32)) {
handle_vintrp_then_read_hazard(program, cur_block, &NOPs, 1, instr->operands[0]);
handle_vintrp_then_read_hazard(state, &NOPs, 1, instr->operands[0]);
}
if (instr->opcode == aco_opcode::v_div_fmas_f32 ||
@@ -425,14 +429,14 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
/* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
for (Operand op : instr->operands) {
if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr)
handle_valu_then_read_hazard(program, cur_block, &NOPs, 5, op);
handle_valu_then_read_hazard(state, &NOPs, 5, op);
}
}
if (!instr->isSALU() && instr->format != Format::SMEM)
NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);
if (program->chip_class == GFX9) {
if (state.program->chip_class == GFX9) {
bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds;
if (instr->isVINTRP() || lds_scratch_global ||
instr->opcode == aco_opcode::ds_read_addtid_b32 ||
@@ -459,7 +463,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
ctx.smem_clause = false;
ctx.smem_write = false;
if (program->dev.xnack_enabled) {
if (state.program->dev.xnack_enabled) {
BITSET_ZERO(ctx.smem_clause_read_write);
BITSET_ZERO(ctx.smem_clause_write);
}
@@ -471,7 +475,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
} else {
ctx.smem_clause = true;
if (program->dev.xnack_enabled) {
if (state.program->dev.xnack_enabled) {
for (Operand op : instr->operands) {
if (!op.isConstant()) {
set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
@@ -616,8 +620,7 @@ instr_is_branch(const aco_ptr<Instruction>& instr)
}
void
handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx,
aco_ptr<Instruction>& instr,
handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr,
std::vector<aco_ptr<Instruction>>& new_instructions)
{
// TODO: s_dcache_inv needs to be in it's own group on GFX10
@@ -630,7 +633,7 @@ handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx,
/* Remember all SGPRs that are read by the VMEM instruction */
mark_read_regs(instr, ctx.sgprs_read_by_VMEM);
ctx.sgprs_read_by_VMEM.set(exec);
if (program->wave_size == 64)
if (state.program->wave_size == 64)
ctx.sgprs_read_by_VMEM.set(exec_hi);
} else if (instr->isSALU() || instr->isSMEM()) {
if (instr->opcode == aco_opcode::s_waitcnt) {
@@ -786,7 +789,7 @@ handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx,
if (instr->isMUBUF() || instr->isMTBUF()) {
uint32_t offset = instr->isMUBUF() ? instr->mubuf().offset : instr->mtbuf().offset;
if (offset & 6)
Builder(program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0);
Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0);
}
}
@@ -798,12 +801,12 @@ handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx,
} else if (ctx.has_writelane) {
ctx.has_writelane = false;
if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0)
Builder(program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0);
Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0);
}
}
template <typename Ctx>
using HandleInstr = void (*)(Program*, Block* block, Ctx&, aco_ptr<Instruction>&,
using HandleInstr = void (*)(State& state, Ctx&, aco_ptr<Instruction>&,
std::vector<aco_ptr<Instruction>>&);
template <typename Ctx, HandleInstr<Ctx> Handle>
@@ -813,13 +816,16 @@ handle_block(Program* program, Ctx& ctx, Block& block)
if (block.instructions.empty())
return;
std::vector<aco_ptr<Instruction>> old_instructions = std::move(block.instructions);
State state;
state.program = program;
state.block = &block;
state.old_instructions = std::move(block.instructions);
block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning
block.instructions.reserve(old_instructions.size());
block.instructions.reserve(state.old_instructions.size());
for (aco_ptr<Instruction>& instr : old_instructions) {
Handle(program, &block, ctx, instr, block.instructions);
for (aco_ptr<Instruction>& instr : state.old_instructions) {
Handle(state, ctx, instr, block.instructions);
block.instructions.emplace_back(std::move(instr));
}
}