aco: improve waitcnt insertion around loops
Do this by repeating processing of loops until no progress is made. Totals from affected shaders: SGPRS: 162576 -> 162576 (0.00 %) VGPRS: 145228 -> 145228 (0.00 %) Spilled SGPRs: 668 -> 668 (0.00 %) Spilled VGPRs: 0 -> 0 (0.00 %) Private memory VGPRs: 0 -> 0 (0.00 %) Scratch size: 0 -> 0 (0.00 %) dwords per thread Code Size: 15778640 -> 15771336 (-0.05 %) bytes LDS: 146 -> 146 (0.00 %) blocks Max Waves: 6087 -> 6087 (0.00 %) v2: use block_kind_loop_header/block_kind_loop_exit to repeat at the end of loops instead of at each continue Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
This commit is contained in:
parent
1a8c49d76c
commit
d1b9deeea8
|
@ -24,6 +24,7 @@
|
|||
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <stack>
|
||||
|
||||
#include "aco_ir.h"
|
||||
#include "vulkan/radv_shader.h"
|
||||
|
@ -34,8 +35,9 @@ namespace {
|
|||
|
||||
/**
|
||||
* The general idea of this pass is:
|
||||
* The CFG is traversed in reverse postorder (forward).
|
||||
* Per BB one wait_ctx is maintained.
|
||||
* The CFG is traversed in reverse postorder (forward) and loops are processed
|
||||
* several times until no progress is made.
|
||||
* Per BB two wait_ctx is maintained: an in-context and out-context.
|
||||
* The in-context is the joined out-contexts of the predecessors.
|
||||
* The context contains a map: gpr -> wait_entry
|
||||
* consisting of the information about the cnt values to be waited for.
|
||||
|
@ -114,6 +116,19 @@ struct wait_imm {
|
|||
wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) :
|
||||
vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {}
|
||||
|
||||
wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter)
|
||||
{
|
||||
vm = packed & 0xf;
|
||||
if (chip >= GFX9)
|
||||
vm |= (packed >> 10) & 0x30;
|
||||
|
||||
exp = (packed >> 4) & 0x7;
|
||||
|
||||
lgkm = (packed >> 8) & 0xf;
|
||||
if (chip >= GFX10)
|
||||
lgkm |= (packed >> 8) & 0x30;
|
||||
}
|
||||
|
||||
uint16_t pack(enum chip_class chip) const
|
||||
{
|
||||
uint16_t imm = 0;
|
||||
|
@ -142,12 +157,14 @@ struct wait_imm {
|
|||
return imm;
|
||||
}
|
||||
|
||||
void combine(const wait_imm& other)
|
||||
bool combine(const wait_imm& other)
|
||||
{
|
||||
bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
|
||||
vm = std::min(vm, other.vm);
|
||||
exp = std::min(exp, other.exp);
|
||||
lgkm = std::min(lgkm, other.lgkm);
|
||||
vs = std::min(vs, other.vs);
|
||||
return changed;
|
||||
}
|
||||
|
||||
bool empty() const
|
||||
|
@ -168,13 +185,17 @@ struct wait_entry {
|
|||
: imm(imm), events(event), counters(get_counters_for_event(event)),
|
||||
wait_on_read(wait_on_read), logical(logical) {}
|
||||
|
||||
void join(const wait_entry& other)
|
||||
bool join(const wait_entry& other)
|
||||
{
|
||||
bool changed = (other.events & ~events) ||
|
||||
(other.counters & ~counters) ||
|
||||
(other.wait_on_read && !wait_on_read);
|
||||
events |= other.events;
|
||||
counters |= other.counters;
|
||||
imm.combine(other.imm);
|
||||
changed |= imm.combine(other.imm);
|
||||
wait_on_read = wait_on_read || other.wait_on_read;
|
||||
assert(logical == other.logical);
|
||||
return changed;
|
||||
}
|
||||
|
||||
void remove_counter(counter_type counter)
|
||||
|
@ -237,8 +258,15 @@ struct wait_ctx {
|
|||
max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
|
||||
unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) {}
|
||||
|
||||
void join(const wait_ctx* other, bool logical)
|
||||
bool join(const wait_ctx* other, bool logical)
|
||||
{
|
||||
bool changed = other->exp_cnt > exp_cnt ||
|
||||
other->vm_cnt > vm_cnt ||
|
||||
other->lgkm_cnt > lgkm_cnt ||
|
||||
other->vs_cnt > vs_cnt ||
|
||||
(other->pending_flat_lgkm && !pending_flat_lgkm) ||
|
||||
(other->pending_flat_vm && !pending_flat_vm);
|
||||
|
||||
exp_cnt = std::max(exp_cnt, other->exp_cnt);
|
||||
vm_cnt = std::max(vm_cnt, other->vm_cnt);
|
||||
lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt);
|
||||
|
@ -253,14 +281,18 @@ struct wait_ctx {
|
|||
if (entry.second.logical != logical)
|
||||
continue;
|
||||
|
||||
if (it != gpr_map.end())
|
||||
it->second.join(entry.second);
|
||||
else
|
||||
if (it != gpr_map.end()) {
|
||||
changed |= it->second.join(entry.second);
|
||||
} else {
|
||||
gpr_map.insert(entry);
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < barrier_count; i++)
|
||||
barrier_imm[i].combine(other->barrier_imm[i]);
|
||||
changed |= barrier_imm[i].combine(other->barrier_imm[i]);
|
||||
|
||||
return changed;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -319,12 +351,27 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
|
|||
return wait;
|
||||
}
|
||||
|
||||
wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr)
|
||||
{
|
||||
if (instr->opcode == aco_opcode::s_waitcnt_vscnt &&
|
||||
instr->definitions[0].physReg() == sgpr_null) {
|
||||
wait_imm imm;
|
||||
imm.vs = std::min<uint8_t>(imm.vs, static_cast<SOPK_instruction*>(instr)->imm);
|
||||
return imm;
|
||||
} else if (instr->opcode == aco_opcode::s_waitcnt) {
|
||||
return wait_imm(ctx.chip_class, static_cast<SOPK_instruction*>(instr)->imm);
|
||||
}
|
||||
return wait_imm();
|
||||
}
|
||||
|
||||
wait_imm kill(Instruction* instr, wait_ctx& ctx)
|
||||
{
|
||||
wait_imm imm;
|
||||
if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
|
||||
imm.combine(check_instr(instr, ctx));
|
||||
|
||||
imm.combine(parse_wait_instr(ctx, instr));
|
||||
|
||||
if (ctx.chip_class >= GFX10) {
|
||||
/* Seems to be required on GFX10 to achieve correct behaviour.
|
||||
* It shouldn't cost anything anyways since we're about to do s_endpgm.
|
||||
|
@ -665,39 +712,23 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
|
|||
{
|
||||
std::vector<aco_ptr<Instruction>> new_instructions;
|
||||
|
||||
wait_imm queued_imm;
|
||||
for (aco_ptr<Instruction>& instr : block.instructions) {
|
||||
wait_imm imm = kill(instr.get(), ctx);
|
||||
bool is_wait = !parse_wait_instr(ctx, instr.get()).empty();
|
||||
|
||||
if (!imm.empty())
|
||||
emit_waitcnt(ctx, new_instructions, imm);
|
||||
queued_imm.combine(kill(instr.get(), ctx));
|
||||
|
||||
gen(instr.get(), ctx);
|
||||
|
||||
if (instr->format != Format::PSEUDO_BARRIER)
|
||||
if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {
|
||||
if (!queued_imm.empty()) {
|
||||
emit_waitcnt(ctx, new_instructions, queued_imm);
|
||||
queued_imm = wait_imm();
|
||||
}
|
||||
new_instructions.emplace_back(std::move(instr));
|
||||
}
|
||||
|
||||
/* check if this block is at the end of a loop */
|
||||
for (unsigned succ_idx : block.linear_succs) {
|
||||
/* eliminate any remaining counters */
|
||||
if (succ_idx <= block.index && (ctx.vm_cnt || ctx.exp_cnt || ctx.lgkm_cnt || ctx.vs_cnt)) {
|
||||
// TODO: we could do better if we only wait if the regs between the block and other predecessors differ
|
||||
|
||||
aco_ptr<Instruction> branch = std::move(new_instructions.back());
|
||||
new_instructions.pop_back();
|
||||
|
||||
wait_imm imm(ctx.vm_cnt ? 0 : wait_imm::unset_counter,
|
||||
ctx.exp_cnt ? 0 : wait_imm::unset_counter,
|
||||
ctx.lgkm_cnt ? 0 : wait_imm::unset_counter,
|
||||
ctx.vs_cnt ? 0 : wait_imm::unset_counter);
|
||||
emit_waitcnt(ctx, new_instructions, imm);
|
||||
|
||||
new_instructions.push_back(std::move(branch));
|
||||
|
||||
ctx = wait_ctx(program);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
block.instructions.swap(new_instructions);
|
||||
}
|
||||
|
||||
|
@ -705,23 +736,55 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
|
|||
|
||||
void insert_wait_states(Program* program)
|
||||
{
|
||||
wait_ctx out_ctx[program->blocks.size()]; /* per BB ctx */
|
||||
/* per BB ctx */
|
||||
std::vector<bool> done(program->blocks.size());
|
||||
wait_ctx in_ctx[program->blocks.size()];
|
||||
wait_ctx out_ctx[program->blocks.size()];
|
||||
for (unsigned i = 0; i < program->blocks.size(); i++)
|
||||
out_ctx[i] = wait_ctx(program);
|
||||
in_ctx[i] = wait_ctx(program);
|
||||
std::stack<unsigned> loop_header_indices;
|
||||
unsigned loop_progress = 0;
|
||||
|
||||
for (unsigned i = 0; i < program->blocks.size(); i++) {
|
||||
Block& current = program->blocks[i];
|
||||
wait_ctx& in = out_ctx[current.index];
|
||||
for (unsigned i = 0; i < program->blocks.size();) {
|
||||
Block& current = program->blocks[i++];
|
||||
wait_ctx ctx = in_ctx[current.index];
|
||||
|
||||
if (current.kind & block_kind_loop_header) {
|
||||
loop_header_indices.push(current.index);
|
||||
} else if (current.kind & block_kind_loop_exit) {
|
||||
bool repeat = false;
|
||||
if (loop_progress == loop_header_indices.size()) {
|
||||
i = loop_header_indices.top();
|
||||
repeat = true;
|
||||
}
|
||||
loop_header_indices.pop();
|
||||
loop_progress = std::min<unsigned>(loop_progress, loop_header_indices.size());
|
||||
if (repeat)
|
||||
continue;
|
||||
}
|
||||
|
||||
bool changed = false;
|
||||
for (unsigned b : current.linear_preds)
|
||||
in.join(&out_ctx[b], false);
|
||||
changed |= ctx.join(&out_ctx[b], false);
|
||||
for (unsigned b : current.logical_preds)
|
||||
in.join(&out_ctx[b], true);
|
||||
changed |= ctx.join(&out_ctx[b], true);
|
||||
|
||||
if (current.instructions.empty())
|
||||
in_ctx[current.index] = ctx;
|
||||
|
||||
if (done[current.index] && !changed)
|
||||
continue;
|
||||
|
||||
handle_block(program, current, in);
|
||||
if (current.instructions.empty()) {
|
||||
out_ctx[current.index] = ctx;
|
||||
continue;
|
||||
}
|
||||
|
||||
loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth);
|
||||
done[current.index] = true;
|
||||
|
||||
handle_block(program, current, ctx);
|
||||
|
||||
out_ctx[current.index] = ctx;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue