aco: initialize scratch base registers on GFX9-GFX10.3
fossil-db (navi21):
Totals from 1142 (0.70% of 162293) affected shaders:
Instrs: 271636 -> 271974 (+0.12%)
CodeSize: 1532020 -> 1533792 (+0.12%)
Latency: 7484066 -> 7485698 (+0.02%)
InvThroughput: 4048824 -> 4049579 (+0.02%)
SClause: 4171 -> 4212 (+0.98%)
PreSGPRs: 11203 -> 12276 (+9.58%)

fossil-db (vega10):
Totals from 3327 (2.06% of 161355) affected shaders:
Instrs: 257413 -> 257601 (+0.07%)
CodeSize: 1424244 -> 1425372 (+0.08%)
Latency: 8598402 -> 8600466 (+0.02%)
InvThroughput: 7906335 -> 7908234 (+0.02%)
SClause: 4932 -> 4973 (+0.83%)
PreSGPRs: 22010 -> 25405 (+15.42%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17079>
This commit is contained in:
parent
97e9e42e0d
commit
d2d94b62f2
|
@ -83,7 +83,8 @@ process_block(dce_ctx& ctx, Block& block)
|
|||
bool
|
||||
is_dead(const std::vector<uint16_t>& uses, Instruction* instr)
|
||||
{
|
||||
if (instr->definitions.empty() || instr->isBranch())
|
||||
if (instr->definitions.empty() || instr->isBranch() ||
|
||||
instr->opcode == aco_opcode::p_init_scratch)
|
||||
return false;
|
||||
if (std::any_of(instr->definitions.begin(), instr->definitions.end(),
|
||||
[&uses](const Definition& def) { return !def.isTemp() || uses[def.tempId()]; }))
|
||||
|
|
|
@ -249,6 +249,12 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
|
|||
assert(startpgm->opcode == aco_opcode::p_startpgm);
|
||||
bld.insert(std::move(startpgm));
|
||||
|
||||
unsigned count = 1;
|
||||
if (block->instructions[1]->opcode == aco_opcode::p_init_scratch) {
|
||||
bld.insert(std::move(block->instructions[1]));
|
||||
count++;
|
||||
}
|
||||
|
||||
Operand start_exec(bld.lm);
|
||||
|
||||
/* exec seems to need to be manually initialized with combined shaders */
|
||||
|
@ -274,7 +280,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
|
|||
ctx.info[0].exec.emplace_back(start_exec, mask);
|
||||
}
|
||||
|
||||
return 1;
|
||||
return count;
|
||||
}
|
||||
|
||||
/* loop entry block */
|
||||
|
|
|
@ -11207,9 +11207,16 @@ add_startpgm(struct isel_context* ctx)
|
|||
* handling spilling.
|
||||
*/
|
||||
ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
|
||||
if (ctx->args->ac.scratch_offset.used) {
|
||||
/* FIXME: Fix scratch loads/stores on GFX11. */
|
||||
if (ctx->program->gfx_level <= GFX10_3) {
|
||||
ctx->program->scratch_offset = get_arg(ctx, ctx->args->ac.scratch_offset);
|
||||
|
||||
if (ctx->program->gfx_level >= GFX9) {
|
||||
Operand scratch_offset(ctx->program->scratch_offset);
|
||||
scratch_offset.setLateKill(true);
|
||||
Builder bld(ctx->program, ctx->block);
|
||||
bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc),
|
||||
ctx->program->private_segment_buffer, scratch_offset);
|
||||
}
|
||||
}
|
||||
|
||||
if (ctx->stage.has(SWStage::VS) && ctx->program->info.vs.dynamic_inputs) {
|
||||
|
|
|
@ -592,7 +592,8 @@ needs_exec_mask(const Instruction* instr)
|
|||
case aco_opcode::p_end_linear_vgpr:
|
||||
case aco_opcode::p_logical_start:
|
||||
case aco_opcode::p_logical_end:
|
||||
case aco_opcode::p_startpgm: return instr->reads_exec();
|
||||
case aco_opcode::p_startpgm:
|
||||
case aco_opcode::p_init_scratch: return instr->reads_exec();
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -451,6 +451,8 @@ struct PhysReg {
|
|||
|
||||
/* helper expressions for special registers */
|
||||
static constexpr PhysReg m0{124};
|
||||
static constexpr PhysReg flat_scr_lo{102}; /* GFX8-GFX9, encoded differently on GFX6-7 */
|
||||
static constexpr PhysReg flat_scr_hi{103}; /* GFX8-GFX9, encoded differently on GFX6-7 */
|
||||
static constexpr PhysReg vcc{106};
|
||||
static constexpr PhysReg vcc_hi{107};
|
||||
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
|
||||
|
@ -2104,7 +2106,6 @@ public:
|
|||
bool early_rast = false; /* whether rasterization can start as soon as the 1st DONE pos export */
|
||||
|
||||
bool needs_vcc = false;
|
||||
bool needs_flat_scr = false;
|
||||
|
||||
CompilationProgress progress;
|
||||
|
||||
|
|
|
@ -293,12 +293,14 @@ calc_waves_per_workgroup(Program* program)
|
|||
uint16_t
|
||||
get_extra_sgprs(Program* program)
|
||||
{
|
||||
/* We don't use this register on GFX6-8 and it's removed on GFX10+. */
|
||||
bool needs_flat_scr = program->config->scratch_bytes_per_wave && program->gfx_level == GFX9;
|
||||
|
||||
if (program->gfx_level >= GFX10) {
|
||||
assert(!program->needs_flat_scr);
|
||||
assert(!program->dev.xnack_enabled);
|
||||
return 0;
|
||||
} else if (program->gfx_level >= GFX8) {
|
||||
if (program->needs_flat_scr)
|
||||
if (needs_flat_scr)
|
||||
return 6;
|
||||
else if (program->dev.xnack_enabled)
|
||||
return 4;
|
||||
|
@ -308,7 +310,7 @@ get_extra_sgprs(Program* program)
|
|||
return 0;
|
||||
} else {
|
||||
assert(!program->dev.xnack_enabled);
|
||||
if (program->needs_flat_scr)
|
||||
if (needs_flat_scr)
|
||||
return 4;
|
||||
else if (program->needs_vcc)
|
||||
return 2;
|
||||
|
|
|
@ -2327,6 +2327,47 @@ lower_to_hw_instr(Program* program)
|
|||
}
|
||||
break;
|
||||
}
|
||||
case aco_opcode::p_init_scratch: {
|
||||
assert(program->gfx_level >= GFX8 && program->gfx_level <= GFX10_3);
|
||||
if (!program->config->scratch_bytes_per_wave)
|
||||
break;
|
||||
|
||||
Operand scratch_addr = instr->operands[0];
|
||||
Operand scratch_addr_lo(scratch_addr.physReg(), s1);
|
||||
if (program->stage != compute_cs) {
|
||||
bld.smem(aco_opcode::s_load_dwordx2, instr->definitions[0], scratch_addr,
|
||||
Operand::zero());
|
||||
scratch_addr_lo.setFixed(instr->definitions[0].physReg());
|
||||
}
|
||||
Operand scratch_addr_hi(scratch_addr_lo.physReg().advance(4), s1);
|
||||
|
||||
/* Since we know what the high 16 bits of scratch_hi is, we can set all the high 16
|
||||
* bits in the same instruction that we add the carry.
|
||||
*/
|
||||
uint32_t hi_add = 0xffff0000 - S_008F04_SWIZZLE_ENABLE_GFX6(1);
|
||||
|
||||
if (program->gfx_level >= GFX10) {
|
||||
Operand scratch_lo(instr->definitions[0].physReg(), s1);
|
||||
Operand scratch_hi(instr->definitions[0].physReg().advance(4), s1);
|
||||
|
||||
bld.sop2(aco_opcode::s_add_u32, Definition(scratch_lo.physReg(), s1),
|
||||
Definition(scc, s1), scratch_addr_lo, instr->operands[1]);
|
||||
bld.sop2(aco_opcode::s_addc_u32, Definition(scratch_hi.physReg(), s1),
|
||||
Definition(scc, s1), scratch_addr_hi, Operand::c32(hi_add),
|
||||
Operand(scc, s1));
|
||||
|
||||
/* "((size - 1) << 11) | register" (FLAT_SCRATCH_LO/HI is encoded as register
|
||||
* 20/21) */
|
||||
bld.sopk(aco_opcode::s_setreg_b32, scratch_lo, (31 << 11) | 20);
|
||||
bld.sopk(aco_opcode::s_setreg_b32, scratch_hi, (31 << 11) | 21);
|
||||
} else {
|
||||
bld.sop2(aco_opcode::s_add_u32, Definition(flat_scr_lo, s1), Definition(scc, s1),
|
||||
scratch_addr_lo, instr->operands[1]);
|
||||
bld.sop2(aco_opcode::s_addc_u32, Definition(flat_scr_hi, s1), Definition(scc, s1),
|
||||
scratch_addr_hi, Operand::c32(hi_add), Operand(scc, s1));
|
||||
}
|
||||
break;
|
||||
}
|
||||
default: break;
|
||||
}
|
||||
} else if (instr->isBranch()) {
|
||||
|
|
|
@ -318,6 +318,8 @@ opcode("p_extract") # src1=index, src2=bits, src3=signext
|
|||
# (src0 & ((1 << bits) - 1)) << (index * bits)
|
||||
opcode("p_insert") # src1=index, src2=bits
|
||||
|
||||
opcode("p_init_scratch")
|
||||
|
||||
|
||||
# SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
|
||||
SOP2 = {
|
||||
|
|
|
@ -573,7 +573,8 @@ perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards)
|
|||
|
||||
/* don't move non-reorderable instructions */
|
||||
if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime ||
|
||||
instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32)
|
||||
instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32 ||
|
||||
instr->opcode == aco_opcode::p_init_scratch)
|
||||
return hazard_fail_unreorderable;
|
||||
|
||||
memory_event_set instr_set;
|
||||
|
|
Loading…
Reference in New Issue