diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp index dac9452421a..b8e4e6688e1 100644 --- a/src/amd/compiler/aco_insert_NOPs.cpp +++ b/src/amd/compiler/aco_insert_NOPs.cpp @@ -33,6 +33,12 @@ namespace aco { namespace { +struct State { + Program* program; + Block* block; + std::vector> old_instructions; +}; + struct NOP_ctx_gfx6 { void join(const NOP_ctx_gfx6& other) { @@ -228,8 +234,7 @@ handle_raw_hazard_instr(aco_ptr& pred, PhysReg reg, int* nops_neede template int -handle_raw_hazard_internal(Program* program, Block* block, int nops_needed, PhysReg reg, - uint32_t mask) +handle_raw_hazard_internal(State& state, Block* block, int nops_needed, PhysReg reg, uint32_t mask) { for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) { if (handle_raw_hazard_instr(block->instructions[pred_idx], reg, @@ -244,19 +249,19 @@ handle_raw_hazard_internal(Program* program, Block* block, int nops_needed, Phys * huge value. */ for (unsigned lin_pred : block->linear_preds) { res = std::max(res, handle_raw_hazard_internal( - program, &program->blocks[lin_pred], nops_needed, reg, mask)); + state, &state.program->blocks[lin_pred], nops_needed, reg, mask)); } return res; } template void -handle_raw_hazard(Program* program, Block* cur_block, int* NOPs, int min_states, Operand op) +handle_raw_hazard(State& state, int* NOPs, int min_states, Operand op) { if (*NOPs >= min_states) return; int res = handle_raw_hazard_internal( - program, cur_block, min_states, op.physReg(), u_bit_consecutive(0, op.size())); + state, state.block, min_states, op.physReg(), u_bit_consecutive(0, op.size())); *NOPs = MAX2(*NOPs, res); } @@ -332,15 +337,14 @@ handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr& instr, +handle_instruction_gfx6(State& state, NOP_ctx_gfx6& ctx, aco_ptr& instr, std::vector>& new_instructions) { /* check hazards */ int NOPs = 0; if (instr->isSMEM()) { - if (program->chip_class == GFX6) { + if (state.program->chip_class == GFX6) { /* A read of an SGPR by SMRD instruction requires 4 wait states * when the SGPR was written by a VALU instruction. According to LLVM, * there is also an undocumented hardware behavior when the buffer @@ -352,13 +356,13 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, bool is_buffer_desc = i == 0 && op.size() > 2; if (is_buffer_desc) - handle_valu_salu_then_read_hazard(program, cur_block, &NOPs, 4, op); + handle_valu_salu_then_read_hazard(state, &NOPs, 4, op); else - handle_valu_then_read_hazard(program, cur_block, &NOPs, 4, op); + handle_valu_then_read_hazard(state, &NOPs, 4, op); } } - handle_smem_clause_hazards(program, ctx, instr, &NOPs); + handle_smem_clause_hazards(state.program, ctx, instr, &NOPs); } else if (instr->isSALU()) { if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32 || @@ -366,7 +370,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg); } - if (program->chip_class == GFX9) { + if (state.program->chip_class == GFX9) { if (instr->opcode == aco_opcode::s_movrels_b32 || instr->opcode == aco_opcode::s_movrels_b64 || instr->opcode == aco_opcode::s_movreld_b32 || @@ -389,7 +393,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, if (instr->isDPP()) { NOPs = MAX2(NOPs, ctx.valu_wr_exec_then_dpp); - handle_valu_then_read_hazard(program, cur_block, &NOPs, 2, instr->operands[0]); + handle_valu_then_read_hazard(state, &NOPs, 2, instr->operands[0]); } for (Definition def : instr->definitions) { @@ -404,7 +408,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64) && !instr->operands[1].isConstant()) { - handle_valu_then_read_hazard(program, cur_block, &NOPs, 4, instr->operands[1]); + handle_valu_then_read_hazard(state, &NOPs, 4, instr->operands[1]); } /* It's required to insert 1 wait state if the dst VGPR of any v_interp_* @@ -412,10 +416,10 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, * hangs on GFX6. Note that v_writelane_* is apparently not affected. * This hazard isn't documented anywhere but AMD confirmed that hazard. */ - if (program->chip_class == GFX6 && + if (state.program->chip_class == GFX6 && (instr->opcode == aco_opcode::v_readlane_b32 || /* GFX6 doesn't have v_readlane_b32_e64 */ instr->opcode == aco_opcode::v_readfirstlane_b32)) { - handle_vintrp_then_read_hazard(program, cur_block, &NOPs, 1, instr->operands[0]); + handle_vintrp_then_read_hazard(state, &NOPs, 1, instr->operands[0]); } if (instr->opcode == aco_opcode::v_div_fmas_f32 || @@ -425,14 +429,14 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, /* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */ for (Operand op : instr->operands) { if (!op.isConstant() && !op.isUndefined() && op.regClass().type() == RegType::sgpr) - handle_valu_then_read_hazard(program, cur_block, &NOPs, 5, op); + handle_valu_then_read_hazard(state, &NOPs, 5, op); } } if (!instr->isSALU() && instr->format != Format::SMEM) NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector); - if (program->chip_class == GFX9) { + if (state.program->chip_class == GFX9) { bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds; if (instr->isVINTRP() || lds_scratch_global || instr->opcode == aco_opcode::ds_read_addtid_b32 || @@ -459,7 +463,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, ctx.smem_clause = false; ctx.smem_write = false; - if (program->dev.xnack_enabled) { + if (state.program->dev.xnack_enabled) { BITSET_ZERO(ctx.smem_clause_read_write); BITSET_ZERO(ctx.smem_clause_write); } @@ -471,7 +475,7 @@ handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx, } else { ctx.smem_clause = true; - if (program->dev.xnack_enabled) { + if (state.program->dev.xnack_enabled) { for (Operand op : instr->operands) { if (!op.isConstant()) { set_bitset_range(ctx.smem_clause_read_write, op.physReg(), op.size()); @@ -616,8 +620,7 @@ instr_is_branch(const aco_ptr& instr) } void -handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx, - aco_ptr& instr, +handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr& instr, std::vector>& new_instructions) { // TODO: s_dcache_inv needs to be in it's own group on GFX10 @@ -630,7 +633,7 @@ handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx, /* Remember all SGPRs that are read by the VMEM instruction */ mark_read_regs(instr, ctx.sgprs_read_by_VMEM); ctx.sgprs_read_by_VMEM.set(exec); - if (program->wave_size == 64) + if (state.program->wave_size == 64) ctx.sgprs_read_by_VMEM.set(exec_hi); } else if (instr->isSALU() || instr->isSMEM()) { if (instr->opcode == aco_opcode::s_waitcnt) { @@ -786,7 +789,7 @@ handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx, if (instr->isMUBUF() || instr->isMTBUF()) { uint32_t offset = instr->isMUBUF() ? instr->mubuf().offset : instr->mtbuf().offset; if (offset & 6) - Builder(program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0); + Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0); } } @@ -798,12 +801,12 @@ handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx, } else if (ctx.has_writelane) { ctx.has_writelane = false; if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0) - Builder(program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0); + Builder(state.program, &new_instructions).sopp(aco_opcode::s_nop, -1, 0); } } template -using HandleInstr = void (*)(Program*, Block* block, Ctx&, aco_ptr&, +using HandleInstr = void (*)(State& state, Ctx&, aco_ptr&, std::vector>&); template Handle> @@ -813,13 +816,16 @@ handle_block(Program* program, Ctx& ctx, Block& block) if (block.instructions.empty()) return; - std::vector> old_instructions = std::move(block.instructions); + State state; + state.program = program; + state.block = █ + state.old_instructions = std::move(block.instructions); block.instructions.clear(); // Silence clang-analyzer-cplusplus.Move warning - block.instructions.reserve(old_instructions.size()); + block.instructions.reserve(state.old_instructions.size()); - for (aco_ptr& instr : old_instructions) { - Handle(program, &block, ctx, instr, block.instructions); + for (aco_ptr& instr : state.old_instructions) { + Handle(state, ctx, instr, block.instructions); block.instructions.emplace_back(std::move(instr)); } }