aco: add wait_imm::unpack and wait_imm::max

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28981>
This commit is contained in:
Rhys Perry 2024-05-03 11:19:57 +01:00 committed by Marge Bot
parent c894c9ab1b
commit 75532d8687
7 changed files with 104 additions and 86 deletions

View File

@ -898,14 +898,14 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>&
if (instr->isFlat() || instr->isDS())
mark_read_regs_exec(state, instr, ctx.sgprs_read_by_DS);
} else if (instr->isSALU() || instr->isSMEM()) {
if (instr->opcode == aco_opcode::s_waitcnt) {
wait_imm imm(state.program->gfx_level, instr->salu().imm);
wait_imm imm;
if (imm.unpack(state.program->gfx_level, instr.get())) {
if (imm.vm == 0)
ctx.sgprs_read_by_VMEM.reset();
if (imm.lgkm == 0)
ctx.sgprs_read_by_DS.reset();
} else if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->salu().imm == 0) {
ctx.sgprs_read_by_VMEM_store.reset();
if (imm.vs == 0)
ctx.sgprs_read_by_VMEM_store.reset();
} else if (vm_vsrc == 0) {
ctx.sgprs_read_by_VMEM.reset();
ctx.sgprs_read_by_DS.reset();
@ -981,15 +981,10 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>&
bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero());
}
} else if (instr->isSALU()) {
/* Reducing lgkmcnt count to 0 always mitigates the hazard. */
if (instr->opcode == aco_opcode::s_waitcnt_lgkmcnt) {
const SALU_instruction& sopk = instr->salu();
if (sopk.imm == 0 && sopk.operands[0].physReg() == sgpr_null)
ctx.sgprs_read_by_SMEM.reset();
} else if (instr->opcode == aco_opcode::s_waitcnt) {
wait_imm imm(state.program->gfx_level, instr->salu().imm);
if (imm.lgkm == 0)
ctx.sgprs_read_by_SMEM.reset();
wait_imm imm;
if (imm.unpack(state.program->gfx_level, instr.get()) && imm.lgkm == 0) {
/* Reducing lgkmcnt count to 0 always mitigates the hazard. */
ctx.sgprs_read_by_SMEM.reset();
} else if (instr->format != Format::SOPP && instr->definitions.size()) {
/* SALU can mitigate the hazard */
ctx.sgprs_read_by_SMEM.reset();
@ -1515,18 +1510,18 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
for (Operand& op : instr->operands)
fill_vgpr_bitset(ctx.vgpr_used_by_ds, op.physReg(), op.bytes());
}
wait_imm imm;
if (instr->isVALU() || instr->isEXP() || vm_vsrc == 0) {
ctx.vgpr_used_by_vmem_load.reset();
ctx.vgpr_used_by_vmem_store.reset();
ctx.vgpr_used_by_ds.reset();
} else if (instr->opcode == aco_opcode::s_waitcnt) {
wait_imm imm(GFX11, instr->salu().imm);
} else if (imm.unpack(state.program->gfx_level, instr.get())) {
if (imm.vm == 0)
ctx.vgpr_used_by_vmem_load.reset();
if (imm.lgkm == 0)
ctx.vgpr_used_by_ds.reset();
} else if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->salu().imm == 0) {
ctx.vgpr_used_by_vmem_store.reset();
if (imm.vs == 0)
ctx.vgpr_used_by_vmem_store.reset();
}
if (instr->isLDSDIR()) {
if (ctx.vgpr_used_by_vmem_load[instr->definitions[0].physReg().reg() - 256] ||

View File

@ -228,16 +228,15 @@ struct wait_entry {
};
struct target_info {
uint16_t max_cnt[wait_type_num] = {};
wait_imm max_cnt;
uint32_t events[wait_type_num] = {};
uint16_t unordered_events;
target_info(enum amd_gfx_level gfx_level)
{
max_cnt[wait_type_vm] = gfx_level >= GFX9 ? 62 : 14;
max_cnt[wait_type_exp] = 6;
max_cnt[wait_type_lgkm] = gfx_level >= GFX10 ? 62 : 14;
max_cnt[wait_type_vs] = gfx_level >= GFX10 ? 62 : 0;
max_cnt = wait_imm::max(gfx_level);
for (unsigned i = 0; i < wait_type_num; i++)
max_cnt[i] = max_cnt[i] ? max_cnt[i] - 1 : 0;
events[wait_type_exp] = event_exp_pos | event_exp_param | event_exp_mrt_null |
event_gds_gpr_lock | event_vmem_gpr_lock | event_ldsdir;
@ -402,19 +401,6 @@ check_instr(wait_ctx& ctx, wait_imm& wait, alu_delay_info& delay, Instruction* i
}
}
/* Folds an explicit wait instruction into `imm`.
 *
 * Recognizes two forms:
 *  - s_waitcnt_vscnt whose first operand is the null SGPR: the instruction's
 *    immediate is merged into imm.vs, keeping the smaller of the two values.
 *  - s_waitcnt: the packed immediate is decoded for ctx.gfx_level and merged
 *    into imm through wait_imm::combine().
 *
 * Returns true if `instr` was one of those forms, false otherwise (in which
 * case `imm` is left untouched).
 */
bool
parse_wait_instr(wait_ctx& ctx, wait_imm& imm, Instruction* instr)
{
if (instr->opcode == aco_opcode::s_waitcnt_vscnt && instr->operands[0].physReg() == sgpr_null) {
/* The immediate is narrowed to uint8_t before the comparison. */
imm.vs = std::min<uint8_t>(imm.vs, instr->salu().imm);
return true;
} else if (instr->opcode == aco_opcode::s_waitcnt) {
/* NOTE(review): combine() is defined elsewhere; presumably it keeps the
 * smaller count per counter - confirm against its definition. */
imm.combine(wait_imm(ctx.gfx_level, instr->salu().imm));
return true;
}
return false;
}
bool
parse_delay_alu(wait_ctx& ctx, alu_delay_info& delay, Instruction* instr)
{
@ -962,7 +948,7 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
for (size_t i = 0; i < block.instructions.size(); i++) {
aco_ptr<Instruction>& instr = block.instructions[i];
bool is_wait = parse_wait_instr(ctx, queued_imm, instr.get());
bool is_wait = queued_imm.unpack(ctx.gfx_level, instr.get());
bool is_delay_alu = parse_delay_alu(ctx, queued_delay, instr.get());
memory_sync_info sync_info = get_sync_info(instr.get());

View File

@ -1200,32 +1200,6 @@ wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
: exp(exp_), lgkm(lgkm_), vm(vm_), vs(vs_)
{}
/* Builds a wait_imm by decoding the packed immediate of an s_waitcnt.
 *
 * Only vm, exp and lgkm are taken from `packed`; vs is always initialized to
 * unset_counter. The bit layout depends on `gfx_level`:
 *  - GFX11+:  exp = bits [2:0], lgkm = bits [9:4], vm = bits [15:10].
 *  - earlier: vm = bits [3:0] (plus bits [15:14] as vm[5:4] on GFX9+),
 *             exp = bits [6:4],
 *             lgkm = bits [11:8] (plus bits [13:12] as lgkm[5:4] on GFX10+).
 *
 * A field whose bits are all ones (for the width valid on `gfx_level`) is
 * stored as unset_counter instead of its numeric value.
 */
wait_imm::wait_imm(enum amd_gfx_level gfx_level, uint16_t packed) : vs(unset_counter)
{
if (gfx_level >= GFX11) {
vm = (packed >> 10) & 0x3f;
lgkm = (packed >> 4) & 0x3f;
exp = packed & 0x7;
} else {
vm = packed & 0xf;
if (gfx_level >= GFX9)
vm |= (packed >> 10) & 0x30; /* bits [15:14] become vm[5:4] */
exp = (packed >> 4) & 0x7;
lgkm = (packed >> 8) & 0xf;
if (gfx_level >= GFX10)
lgkm |= (packed >> 8) & 0x30; /* bits [13:12] become lgkm[5:4] */
}
/* An all-ones field is mapped to unset_counter. */
if (vm == (gfx_level >= GFX9 ? 0x3f : 0xf))
vm = wait_imm::unset_counter;
if (exp == 0x7)
exp = wait_imm::unset_counter;
if (lgkm == (gfx_level >= GFX10 ? 0x3f : 0xf))
lgkm = wait_imm::unset_counter;
}
uint16_t
wait_imm::pack(enum amd_gfx_level gfx_level) const
{
@ -1257,6 +1231,68 @@ wait_imm::pack(enum amd_gfx_level gfx_level) const
return imm;
}
/* Returns a wait_imm whose fields hold the per-counter maximum for
 * `gfx_level`:
 *   vm   - 63 on GFX9+,  15 before
 *   exp  - 7 on every generation
 *   lgkm - 63 on GFX10+, 15 before
 *   vs   - 63 on GFX10+, 0 before
 */
wait_imm
wait_imm::max(enum amd_gfx_level gfx_level)
{
   const bool gfx9_plus = gfx_level >= GFX9;
   const bool gfx10_plus = gfx_level >= GFX10;

   wait_imm limits;
   limits.exp = 7;
   limits.vm = gfx9_plus ? 63 : 15;
   limits.lgkm = gfx10_plus ? 63 : 15;
   limits.vs = gfx10_plus ? 63 : 0;
   return limits;
}
bool
wait_imm::unpack(enum amd_gfx_level gfx_level, const Instruction* instr)
{
if (!instr->isSALU() || (!instr->operands.empty() && instr->operands[0].physReg() != sgpr_null))
return false;
aco_opcode op = instr->opcode;
uint16_t packed = instr->salu().imm;
if (op == aco_opcode::s_waitcnt_expcnt) {
exp = std::min<uint8_t>(exp, packed);
} else if (op == aco_opcode::s_waitcnt_lgkmcnt) {
lgkm = std::min<uint8_t>(lgkm, packed);
} else if (op == aco_opcode::s_waitcnt_vmcnt) {
vm = std::min<uint8_t>(vm, packed);
} else if (op == aco_opcode::s_waitcnt_vscnt) {
vs = std::min<uint8_t>(vs, packed);
} else if (op == aco_opcode::s_waitcnt) {
uint8_t vm2, lgkm2, exp2;
if (gfx_level >= GFX11) {
vm2 = (packed >> 10) & 0x3f;
lgkm2 = (packed >> 4) & 0x3f;
exp2 = packed & 0x7;
} else {
vm2 = packed & 0xf;
if (gfx_level >= GFX9)
vm2 |= (packed >> 10) & 0x30;
exp2 = (packed >> 4) & 0x7;
lgkm2 = (packed >> 8) & 0xf;
if (gfx_level >= GFX10)
lgkm2 |= (packed >> 8) & 0x30;
}
if (vm2 == (gfx_level >= GFX9 ? 0x3f : 0xf))
vm2 = wait_imm::unset_counter;
if (exp2 == 0x7)
exp2 = wait_imm::unset_counter;
if (lgkm2 == (gfx_level >= GFX10 ? 0x3f : 0xf))
lgkm2 = wait_imm::unset_counter;
vm = std::min(vm, vm2);
exp = std::min(exp, exp2);
lgkm = std::min(lgkm, lgkm2);
} else {
return false;
}
return true;
}
bool
wait_imm::combine(const wait_imm& other)
{

View File

@ -182,6 +182,8 @@ enum wait_type {
wait_type_num = 4,
};
struct Instruction;
struct wait_imm {
static const uint8_t unset_counter = 0xff;
@ -192,10 +194,13 @@ struct wait_imm {
wait_imm();
wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_);
wait_imm(enum amd_gfx_level chip, uint16_t packed);
uint16_t pack(enum amd_gfx_level chip) const;
static wait_imm max(enum amd_gfx_level gfx_level);
bool unpack(enum amd_gfx_level gfx_level, const Instruction* instr);
bool combine(const wait_imm& other);
bool empty() const;

View File

@ -275,13 +275,17 @@ print_instr_format_specific(enum amd_gfx_level gfx_level, const Instruction* ins
uint16_t imm = instr->salu().imm;
switch (instr->opcode) {
case aco_opcode::s_waitcnt: {
wait_imm unpacked(gfx_level, imm);
if (unpacked.vm != wait_imm::unset_counter)
fprintf(output, " vmcnt(%d)", unpacked.vm);
if (unpacked.exp != wait_imm::unset_counter)
fprintf(output, " expcnt(%d)", unpacked.exp);
if (unpacked.lgkm != wait_imm::unset_counter)
fprintf(output, " lgkmcnt(%d)", unpacked.lgkm);
wait_imm unpacked;
unpacked.unpack(gfx_level, instr);
const char* names[wait_type_num];
names[wait_type_exp] = "expcnt";
names[wait_type_vm] = "vmcnt";
names[wait_type_lgkm] = "lgkmcnt";
names[wait_type_vs] = "vscnt";
for (unsigned i = 0; i < wait_type_num; i++) {
if (unpacked[i] != wait_imm::unset_counter)
fprintf(output, " %s(%d)", names[i], unpacked[i]);
}
break;
}
case aco_opcode::s_waitcnt_depctr: {

View File

@ -318,28 +318,20 @@ get_wait_imm(Program* program, aco_ptr<Instruction>& instr)
if (instr->opcode == aco_opcode::s_endpgm) {
for (unsigned i = 0; i < wait_type_num; i++)
imm[i] = 0;
} else if (instr->opcode == aco_opcode::s_waitcnt) {
return wait_imm(program->gfx_level, instr->salu().imm);
} else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
imm.vs = instr->salu().imm;
return imm;
} else if (imm.unpack(program->gfx_level, instr.get())) {
} else if (instr->isVINTERP_INREG()) {
imm.exp = instr->vinterp_inreg().wait_exp;
if (imm.exp == 0x7)
imm.exp = wait_imm::unset_counter;
return imm;
} else {
unsigned max_lgkm_cnt = program->gfx_level >= GFX10 ? 62 : 14;
unsigned max_exp_cnt = 6;
unsigned max_vm_cnt = program->gfx_level >= GFX9 ? 62 : 14;
unsigned max_vs_cnt = 62;
/* If an instruction increases a counter, it waits for it to be below maximum first. */
std::array<unsigned, wait_type_num> wait_info =
get_wait_counter_info(program->gfx_level, instr);
imm.lgkm = wait_info[wait_type_lgkm] ? max_lgkm_cnt : wait_imm::unset_counter;
imm.exp = wait_info[wait_type_exp] ? max_exp_cnt : wait_imm::unset_counter;
imm.vm = wait_info[wait_type_vm] ? max_vm_cnt : wait_imm::unset_counter;
imm.vs = wait_info[wait_type_vs] ? max_vs_cnt : wait_imm::unset_counter;
wait_imm max = wait_imm::max(program->gfx_level);
for (unsigned i = 0; i < wait_type_num; i++) {
if (wait_info[i])
imm[i] = max[i] - 1;
}
}
return imm;
}

View File

@ -90,7 +90,7 @@ BEGIN_TEST(insert_waitcnt.clause)
Operand::zero());
//! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
//! s_waitcnt vmcnt(0) lgkmcnt(0)
//! s_waitcnt lgkmcnt(0) vmcnt(0)
//! v1: %0:v[5] = buffer_load_dword %0:s[4-7], %0:v[4], 0
bld.mubuf(aco_opcode::buffer_load_dword, def_v5, Operand(PhysReg(4), s4), op_v4, Operand::zero(),
0, false);