diff --git a/src/amd/compiler/aco_ssa_elimination.cpp b/src/amd/compiler/aco_ssa_elimination.cpp index f4884a42875..ca63ed87c89 100644 --- a/src/amd/compiler/aco_ssa_elimination.cpp +++ b/src/amd/compiler/aco_ssa_elimination.cpp @@ -40,12 +40,14 @@ struct ssa_elimination_ctx { std::vector> logical_phi_info; std::vector> linear_phi_info; std::vector empty_blocks; + std::vector blocks_incoming_exec_used; Program* program; ssa_elimination_ctx(Program* program_) : logical_phi_info(program_->blocks.size()) , linear_phi_info(program_->blocks.size()) , empty_blocks(program_->blocks.size(), true) + , blocks_incoming_exec_used(program_->blocks.size(), true) , program(program_) {} }; @@ -274,10 +276,84 @@ void try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block) block->linear_succs.clear(); } +bool instr_writes_exec(Instruction* instr) +{ + for (Definition& def : instr->definitions) + if (def.physReg() == exec || def.physReg() == exec_hi) + return true; + + return false; +} + +void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block) +{ + /* Check if any successor needs the outgoing exec mask from the current block. */ + + bool exec_write_used; + + if (!ctx.logical_phi_info[block.index].empty()) { + exec_write_used = true; + } else { + bool will_insert_exec_copy = false; + bool will_inserted_exec_copy_need_exec = false; + + for (const auto& successor_phi_info : ctx.linear_phi_info[block.index]) { + if (successor_phi_info.def.physReg() == exec) + will_insert_exec_copy = true; + if (successor_phi_info.op.physReg() == exec) + will_inserted_exec_copy_need_exec = true; + } + + if (will_insert_exec_copy && !will_inserted_exec_copy_need_exec) + exec_write_used = false; + else + /* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */ + exec_write_used = std::any_of(block.linear_succs.begin(), block.linear_succs.end(), + [&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; }); + } + + /* Go through all instructions and eliminate useless exec writes. */ + + for (int i = block.instructions.size() - 1; i >= 0; --i) { + aco_ptr& instr = block.instructions[i]; + + /* We already take information from phis into account before the loop, so let's just break on phis. */ + if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi) + break; + + /* See if the current instruction needs or writes exec. */ + bool needs_exec = needs_exec_mask(instr.get()); + bool writes_exec = instr_writes_exec(instr.get()); + + /* See if we found an unused exec write. */ + if (writes_exec && !exec_write_used) { + instr.reset(); + continue; + } + + /* For a newly encountered exec write, clear the used flag. */ + if (writes_exec) + exec_write_used = false; + + /* If the current instruction needs exec, mark it as used. */ + exec_write_used |= needs_exec; + } + + /* Remember if the current block needs an incoming exec mask from its predecessors. */ + + ctx.blocks_incoming_exec_used[block.index] = exec_write_used; + + /* Cleanup: remove deleted instructions from the vector. */ + + auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr); + block.instructions.resize(new_end - block.instructions.begin()); +} + void jump_threading(ssa_elimination_ctx& ctx) { for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) { Block* block = &ctx.program->blocks[i]; + eliminate_useless_exec_writes_in_block(ctx, *block); if (!ctx.empty_blocks[i]) continue;