aco: initialize scratch base registers on GFX9-GFX10.3

fossil-db (navi21):
Totals from 1142 (0.70% of 162293) affected shaders:
Instrs: 271636 -> 271974 (+0.12%)
CodeSize: 1532020 -> 1533792 (+0.12%)
Latency: 7484066 -> 7485698 (+0.02%)
InvThroughput: 4048824 -> 4049579 (+0.02%)
SClause: 4171 -> 4212 (+0.98%)
PreSGPRs: 11203 -> 12276 (+9.58%)

fossil-db (vega10):
Totals from 3327 (2.06% of 161355) affected shaders:
Instrs: 257413 -> 257601 (+0.07%)
CodeSize: 1424244 -> 1425372 (+0.08%)
Latency: 8598402 -> 8600466 (+0.02%)
InvThroughput: 7906335 -> 7908234 (+0.02%)
SClause: 4932 -> 4973 (+0.83%)
PreSGPRs: 22010 -> 25405 (+15.42%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17079>
This commit is contained in:
Rhys Perry 2022-05-19 14:12:08 +01:00 committed by Marge Bot
parent 97e9e42e0d
commit d2d94b62f2
9 changed files with 72 additions and 10 deletions

View File

@ -83,7 +83,8 @@ process_block(dce_ctx& ctx, Block& block)
bool
is_dead(const std::vector<uint16_t>& uses, Instruction* instr)
{
if (instr->definitions.empty() || instr->isBranch())
if (instr->definitions.empty() || instr->isBranch() ||
instr->opcode == aco_opcode::p_init_scratch)
return false;
if (std::any_of(instr->definitions.begin(), instr->definitions.end(),
[&uses](const Definition& def) { return !def.isTemp() || uses[def.tempId()]; }))

View File

@ -249,6 +249,12 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
assert(startpgm->opcode == aco_opcode::p_startpgm);
bld.insert(std::move(startpgm));
unsigned count = 1;
if (block->instructions[1]->opcode == aco_opcode::p_init_scratch) {
bld.insert(std::move(block->instructions[1]));
count++;
}
Operand start_exec(bld.lm);
/* exec seems to need to be manually initialized with combined shaders */
@ -274,7 +280,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
ctx.info[0].exec.emplace_back(start_exec, mask);
}
return 1;
return count;
}
/* loop entry block */

View File

@ -11207,9 +11207,16 @@ add_startpgm(struct isel_context* ctx)
* handling spilling.
*/
ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
if (ctx->args->ac.scratch_offset.used) {
/* FIXME: Fix scratch loads/stores on GFX11. */
if (ctx->program->gfx_level <= GFX10_3) {
ctx->program->scratch_offset = get_arg(ctx, ctx->args->ac.scratch_offset);
if (ctx->program->gfx_level >= GFX9) {
Operand scratch_offset(ctx->program->scratch_offset);
scratch_offset.setLateKill(true);
Builder bld(ctx->program, ctx->block);
bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc),
ctx->program->private_segment_buffer, scratch_offset);
}
}
if (ctx->stage.has(SWStage::VS) && ctx->program->info.vs.dynamic_inputs) {

View File

@ -592,7 +592,8 @@ needs_exec_mask(const Instruction* instr)
case aco_opcode::p_end_linear_vgpr:
case aco_opcode::p_logical_start:
case aco_opcode::p_logical_end:
case aco_opcode::p_startpgm: return instr->reads_exec();
case aco_opcode::p_startpgm:
case aco_opcode::p_init_scratch: return instr->reads_exec();
default: break;
}
}

View File

@ -451,6 +451,8 @@ struct PhysReg {
/* helper expressions for special registers */
static constexpr PhysReg m0{124};
static constexpr PhysReg flat_scr_lo{102}; /* GFX8-GFX9, encoded differently on GFX6-7 */
static constexpr PhysReg flat_scr_hi{103}; /* GFX8-GFX9, encoded differently on GFX6-7 */
static constexpr PhysReg vcc{106};
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
@ -2104,7 +2106,6 @@ public:
bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */
bool needs_vcc = false;
bool needs_flat_scr = false;
CompilationProgress progress;

View File

@ -293,12 +293,14 @@ calc_waves_per_workgroup(Program* program)
uint16_t
get_extra_sgprs(Program* program)
{
/* We don't use this register on GFX6-8 and it's removed on GFX10+. */
bool needs_flat_scr = program->config->scratch_bytes_per_wave && program->gfx_level == GFX9;
if (program->gfx_level >= GFX10) {
assert(!program->needs_flat_scr);
assert(!program->dev.xnack_enabled);
return 0;
} else if (program->gfx_level >= GFX8) {
if (program->needs_flat_scr)
if (needs_flat_scr)
return 6;
else if (program->dev.xnack_enabled)
return 4;
@ -308,7 +310,7 @@ get_extra_sgprs(Program* program)
return 0;
} else {
assert(!program->dev.xnack_enabled);
if (program->needs_flat_scr)
if (needs_flat_scr)
return 4;
else if (program->needs_vcc)
return 2;

View File

@ -2327,6 +2327,47 @@ lower_to_hw_instr(Program* program)
}
break;
}
case aco_opcode::p_init_scratch: {
assert(program->gfx_level >= GFX8 && program->gfx_level <= GFX10_3);
if (!program->config->scratch_bytes_per_wave)
break;
Operand scratch_addr = instr->operands[0];
Operand scratch_addr_lo(scratch_addr.physReg(), s1);
if (program->stage != compute_cs) {
bld.smem(aco_opcode::s_load_dwordx2, instr->definitions[0], scratch_addr,
Operand::zero());
scratch_addr_lo.setFixed(instr->definitions[0].physReg());
}
Operand scratch_addr_hi(scratch_addr_lo.physReg().advance(4), s1);
/* Since we know what the high 16 bits of scratch_hi are, we can set all of the high 16
* bits in the same instruction that adds the carry.
*/
uint32_t hi_add = 0xffff0000 - S_008F04_SWIZZLE_ENABLE_GFX6(1);
if (program->gfx_level >= GFX10) {
Operand scratch_lo(instr->definitions[0].physReg(), s1);
Operand scratch_hi(instr->definitions[0].physReg().advance(4), s1);
bld.sop2(aco_opcode::s_add_u32, Definition(scratch_lo.physReg(), s1),
Definition(scc, s1), scratch_addr_lo, instr->operands[1]);
bld.sop2(aco_opcode::s_addc_u32, Definition(scratch_hi.physReg(), s1),
Definition(scc, s1), scratch_addr_hi, Operand::c32(hi_add),
Operand(scc, s1));
/* "((size - 1) << 11) | register" (FLAT_SCRATCH_LO/HI is encoded as register
* 20/21) */
bld.sopk(aco_opcode::s_setreg_b32, scratch_lo, (31 << 11) | 20);
bld.sopk(aco_opcode::s_setreg_b32, scratch_hi, (31 << 11) | 21);
} else {
bld.sop2(aco_opcode::s_add_u32, Definition(flat_scr_lo, s1), Definition(scc, s1),
scratch_addr_lo, instr->operands[1]);
bld.sop2(aco_opcode::s_addc_u32, Definition(flat_scr_hi, s1), Definition(scc, s1),
scratch_addr_hi, Operand::c32(hi_add), Operand(scc, s1));
}
break;
}
default: break;
}
} else if (instr->isBranch()) {

View File

@ -318,6 +318,8 @@ opcode("p_extract") # src1=index, src2=bits, src3=signext
# (src0 & ((1 << bits) - 1)) << (index * bits)
opcode("p_insert") # src1=index, src2=bits
opcode("p_init_scratch")
# SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
SOP2 = {

View File

@ -573,7 +573,8 @@ perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards)
/* don't move non-reorderable instructions */
if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime ||
instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32)
instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32 ||
instr->opcode == aco_opcode::p_init_scratch)
return hazard_fail_unreorderable;
memory_event_set instr_set;