diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp
index dac9452421a..b8e4e6688e1 100644
--- a/src/amd/compiler/aco_insert_NOPs.cpp
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
@@ -33,6 +33,12 @@
 namespace aco {
 namespace {
 
+struct State { /* Per-block state handed to the hazard handlers instead of separate program/block arguments. */
+   Program* program; /* program being processed; its blocks are indexed when walking linear predecessors */
+   Block* block; /* block currently being processed; starting point of the backwards hazard search */
+   std::vector<aco_ptr<Instruction>> old_instructions; /* instructions moved out of the block before it is rebuilt */
+};
+
 struct NOP_ctx_gfx6 {
    void join(const NOP_ctx_gfx6& other)
    {
@@ -228,8 +234,7 @@ handle_raw_hazard_instr(aco_ptr<Instruction>& pred, PhysReg reg, int* nops_neede
 
 template <bool Valu, bool Vintrp, bool Salu>
 int
-handle_raw_hazard_internal(Program* program, Block* block, int nops_needed, PhysReg reg,
-                           uint32_t mask)
+handle_raw_hazard_internal(State& state, Block* block, int nops_needed, PhysReg reg, uint32_t mask)
 {
    for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) {
       if (handle_raw_hazard_instr<Valu, Vintrp, Salu>(block->instructions[pred_idx], reg,
@@ -244,19 +249,19 @@ handle_raw_hazard_internal(Program* program, Block* block, int nops_needed, Phys
     * huge value. */
    for (unsigned lin_pred : block->linear_preds) {
       res = std::max(res, handle_raw_hazard_internal<Valu, Vintrp, Salu>(
-                             program, &program->blocks[lin_pred], nops_needed, reg, mask));
+                             state, &state.program->blocks[lin_pred], nops_needed, reg, mask));
    }
    return res;
 }
 
 template <bool Valu, bool Vintrp, bool Salu>
 void
-handle_raw_hazard(Program* program, Block* cur_block, int* NOPs, int min_states, Operand op)
+handle_raw_hazard(State& state, int* NOPs, int min_states, Operand op) /* no-op if *NOPs >= min_states; otherwise *NOPs = max(*NOPs, result of the backwards search for op) */
 {
    if (*NOPs >= min_states)
       return;
    int res = handle_raw_hazard_internal<Valu, Vintrp, Salu>(
-      program, cur_block, min_states, op.physReg(), u_bit_consecutive(0, op.size()));
+      state, state.block, min_states, op.physReg(), u_bit_consecutive(0, op.size())); /* search starts in the current block (state.block) */
    *NOPs = MAX2(*NOPs, res);
 }
 
@@ -332,15 +337,14 @@ handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruct
 
 /* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
 void
-handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
-                        aco_ptr<Instruction>& instr,
+handle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
                         std::vector<aco_ptr<Instruction>>& new_instructions)
 {
    /* check hazards */
    int NOPs = 0;
 
    if (instr->isSMEM()) {
-      if (program->chip_class == GFX6) {
+      if (state.program->chip_class == GFX6) {
          /* A read of an SGPR by SMRD instruction requires 4 wait states
           * when the SGPR was written by a VALU instruction. According to LLVM,
           * there is also an undocumented hardware behavior when the buffer
@@ -352,13 +356,13 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
 
             bool is_buffer_desc = i == 0 && op.size() > 2;
             if (is_buffer_desc)
-               handle_valu_salu_then_read_hazard(program, cur_block, &NOPs, 4, op);
+               handle_valu_salu_then_read_hazard(state, &NOPs, 4, op);
             else
-               handle_valu_then_read_hazard(program, cur_block, &NOPs, 4, op);
+               handle_valu_then_read_hazard(state, &NOPs, 4, op);
          }
       }
 
-      handle_smem_clause_hazards(program, ctx, instr, &NOPs);
+      handle_smem_clause_hazards(state.program, ctx, instr, &NOPs);
    } else if (instr->isSALU()) {
       if (instr->opcode == aco_opcode::s_setreg_b32 ||
           instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
@@ -366,7 +370,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
          NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
       }
 
-      if (program->chip_class == GFX9) {
+      if (state.program->chip_class == GFX9) {
          if (instr->opcode == aco_opcode::s_movrels_b32 ||
              instr->opcode == aco_opcode::s_movrels_b64 ||
              instr->opcode == aco_opcode::s_movreld_b32 ||
@@ -389,7 +393,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
 
       if (instr->isDPP()) {
          NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp);
-         handle_valu_then_read_hazard(program, cur_block, &NOPs, 2, instr->operands[0]);
+         handle_valu_then_read_hazard(state, &NOPs, 2, instr->operands[0]);
       }
 
       for (Definition def : instr->definitions) {
@@ -404,7 +408,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
            instr->opcode == aco_opcode::v_writelane_b32 ||
            instr->opcode == aco_opcode::v_writelane_b32_e64) &&
           !instr->operands[1].isConstant()) {
-         handle_valu_then_read_hazard(program, cur_block, &NOPs, 4, instr->operands[1]);
+         handle_valu_then_read_hazard(state, &NOPs, 4, instr->operands[1]);
       }
 
       /* It's required to insert 1 wait state if the dst VGPR of any v_interp_*
@@ -412,10 +416,10 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
        * hangs on GFX6. Note that v_writelane_* is apparently not affected.
        * This hazard isn't documented anywhere but AMD confirmed that hazard.
        */
-      if (program->chip_class == GFX6 &&
+      if (state.program->chip_class == GFX6 &&
           (instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */
            instr->opcode == aco_opcode::v_readfirstlane_b32)) {
-         handle_vintrp_then_read_hazard(program, cur_block, &NOPs, 1, instr->operands[0]);
+         handle_vintrp_then_read_hazard(state, &NOPs, 1, instr->operands[0]);
       }
 
       if (instr->opcode == aco_opcode::v_div_fmas_f32 ||
@@ -425,14 +429,14 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
       /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
       for (Operand op : instr->operands) {
          if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr)
-            handle_valu_then_read_hazard(program, cur_block, &NOPs, 5, op);
+            handle_valu_then_read_hazard(state, &NOPs, 5, op);
       }
    }
 
    if (!instr->isSALU() && instr->format != Format::SMEM)
       NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);
 
-   if (program->chip_class == GFX9) {
+   if (state.program->chip_class == GFX9) {
       bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds;
       if (instr->isVINTRP() || lds_scratch_global ||
           instr->opcode == aco_opcode::ds_read_addtid_b32 ||
@@ -459,7 +463,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
       ctx.smem_clause = false;
       ctx.smem_write = false;
 
-      if (program->dev.xnack_enabled) {
+      if (state.program->dev.xnack_enabled) {
          BITSET_ZERO(ctx.smem_clause_read_write);
          BITSET_ZERO(ctx.smem_clause_write);
       }
@@ -471,7 +475,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
       } else {
          ctx.smem_clause = true;
 
-         if (program->dev.xnack_enabled) {
+         if (state.program->dev.xnack_enabled) {
             for (Operand op : instr->operands) {
                if (!op.isConstant()) {
                   set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size());
@@ -616,8 +620,7 @@ instr_is_branch(const aco_ptr<Instruction>& instr)
 }
 
 void
-handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx,
-                         aco_ptr<Instruction>& instr,
+handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>& instr,
                          std::vector<aco_ptr<Instruction>>& new_instructions)
 {
    // TODO: s_dcache_inv needs to be in it's own group on GFX10
@@ -630,7 +633,7 @@ handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx,
       /* Remember all SGPRs that are read by the VMEM instruction */
       mark_read_regs(instr, ctx.sgprs_read_by_VMEM);
       ctx.sgprs_read_by_VMEM.set(exec);
-      if (program->wave_size == 64)
+      if (state.program->wave_size == 64)
          ctx.sgprs_read_by_VMEM.set(exec_hi);
    } else if (instr->isSALU() || instr->isSMEM()) {
       if (instr->opcode == aco_opcode::s_waitcnt) {
@@ -786,7 +789,7 @@ handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx,
       if (instr->isMUBUF() || instr->isMTBUF()) {
          uint32_t offset = instr->isMUBUF() ? instr->mubuf().offset : instr->mtbuf().offset;
          if (offset & 6)
-            Builder(program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0);
+            Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0);
       }
    }
 
@@ -798,12 +801,12 @@ handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx,
    } else if (ctx.has_writelane) {
       ctx.has_writelane = false;
       if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0)
-         Builder(program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0);
+         Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0);
    }
 }
 
 template <typename Ctx>
-using HandleInstr = void (*)(Program*, Block* block, Ctx&, aco_ptr<Instruction>&,
+using HandleInstr = void (*)(State& state, Ctx&, aco_ptr<Instruction>&, /* per-instruction handler: (state, hazard ctx, instruction, output instruction list) */
                              std::vector<aco_ptr<Instruction>>&);
 
 template <typename Ctx, HandleInstr<Ctx> Handle>
@@ -813,13 +816,16 @@ handle_block(Program* program, Ctx& ctx, Block& block)
    if (block.instructions.empty())
       return;
 
-   std::vector<aco_ptr<Instruction>> old_instructions = std::move(block.instructions);
+   State state;
+   state.program = program;
+   state.block = &block;
+   state.old_instructions = std::move(block.instructions);
 
    block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning
-   block.instructions.reserve(old_instructions.size());
+   block.instructions.reserve(state.old_instructions.size());
 
-   for (aco_ptr<Instruction>& instr : old_instructions) {
-      Handle(program, &block, ctx, instr, block.instructions);
+   for (aco_ptr<Instruction>& instr : state.old_instructions) {
+      Handle(state, ctx, instr, block.instructions);
       block.instructions.emplace_back(std::move(instr));
    }
 }