aco: use scratch_* for VGPR spill/reload on GFX9+
fossil-db (navi21):
Totals from 12 (0.01% of 162293) affected shaders:
Instrs: 122808 -> 122782 (-0.02%); split: -0.11%, +0.09%
CodeSize: 711248 -> 710788 (-0.06%); split: -0.16%, +0.10%
SpillSGPRs: 928 -> 831 (-10.45%)
SpillVGPRs: 1626 -> 1624 (-0.12%)
Latency: 4960285 -> 4932547 (-0.56%)
InvThroughput: 2574083 -> 2559953 (-0.55%)
VClause: 3404 -> 3402 (-0.06%)
Copies: 36992 -> 37181 (+0.51%); split: -0.05%, +0.56%
Branches: 3582 -> 3585 (+0.08%)
PreVGPRs: 3055 -> 3057 (+0.07%)

fossil-db (vega10):
Totals from 12 (0.01% of 161355) affected shaders:
Instrs: 124817 -> 124383 (-0.35%); split: -0.46%, +0.12%
CodeSize: 705116 -> 703664 (-0.21%); split: -0.44%, +0.23%
SpillSGPRs: 1012 -> 898 (-11.26%)
SpillVGPRs: 1632 -> 1624 (-0.49%)
Scratch: 201728 -> 200704 (-0.51%)
Latency: 6160115 -> 6266025 (+1.72%); split: -0.34%, +2.06%
InvThroughput: 6440203 -> 6544595 (+1.62%); split: -0.35%, +1.97%
VClause: 3409 -> 3423 (+0.41%)
Copies: 37929 -> 37748 (-0.48%); split: -1.16%, +0.69%
Branches: 3851 -> 3855 (+0.10%); split: -0.13%, +0.23%
PreVGPRs: 3053 -> 3055 (+0.07%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17079>
This commit is contained in:
parent
0e783d687a
commit
98a65eafb7
|
@ -1408,6 +1408,10 @@ load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset, Block& block,
|
|||
}
|
||||
}
|
||||
|
||||
/* GFX9+ uses scratch_* instructions, which don't use a resource. Return a SADDR instead. */
|
||||
if (ctx.program->gfx_level >= GFX9)
|
||||
return bld.copy(bld.def(s1), Operand::c32(offset));
|
||||
|
||||
Temp private_segment_buffer = ctx.program->private_segment_buffer;
|
||||
if (ctx.program->stage.hw != HWStage::CS)
|
||||
private_segment_buffer =
|
||||
|
@ -1445,17 +1449,29 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block,
|
|||
Temp scratch_offset = ctx.program->scratch_offset;
|
||||
|
||||
*offset = spill_slot * 4;
|
||||
if (ctx.program->gfx_level >= GFX9) {
|
||||
*offset += ctx.program->dev.scratch_global_offset_min;
|
||||
|
||||
bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size +
|
||||
ctx.vgpr_spill_slots * 4 >
|
||||
4096;
|
||||
if (!add_offset_to_sgpr)
|
||||
*offset += ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
|
||||
if (ctx.scratch_rsrc == Temp()) {
|
||||
int32_t saddr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size -
|
||||
ctx.program->dev.scratch_global_offset_min;
|
||||
ctx.scratch_rsrc =
|
||||
load_scratch_resource(ctx, scratch_offset, block, instructions, saddr);
|
||||
}
|
||||
} else {
|
||||
bool add_offset_to_sgpr =
|
||||
ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size +
|
||||
ctx.vgpr_spill_slots * 4 >
|
||||
4096;
|
||||
if (!add_offset_to_sgpr)
|
||||
*offset += ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
|
||||
|
||||
if (ctx.scratch_rsrc == Temp()) {
|
||||
unsigned rsrc_offset = add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
|
||||
ctx.scratch_rsrc =
|
||||
load_scratch_resource(ctx, scratch_offset, block, instructions, rsrc_offset);
|
||||
if (ctx.scratch_rsrc == Temp()) {
|
||||
unsigned rsrc_offset =
|
||||
add_offset_to_sgpr ? ctx.program->config->scratch_bytes_per_wave : 0;
|
||||
ctx.scratch_rsrc =
|
||||
load_scratch_resource(ctx, scratch_offset, block, instructions, rsrc_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1485,11 +1501,19 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& inst
|
|||
bld.insert(split);
|
||||
for (unsigned i = 0; i < temp.size(); i++, offset += 4) {
|
||||
Temp elem = split->definitions[i].getTemp();
|
||||
Instruction* instr =
|
||||
bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc, Operand(v1),
|
||||
ctx.program->scratch_offset, elem, offset, false, true);
|
||||
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
|
||||
if (ctx.program->gfx_level >= GFX9) {
|
||||
bld.scratch(aco_opcode::scratch_store_dword, Operand(v1), ctx.scratch_rsrc, elem,
|
||||
offset, memory_sync_info(storage_vgpr_spill, semantic_private));
|
||||
} else {
|
||||
Instruction* instr =
|
||||
bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc, Operand(v1),
|
||||
ctx.program->scratch_offset, elem, offset, false, true);
|
||||
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
|
||||
}
|
||||
}
|
||||
} else if (ctx.program->gfx_level >= GFX9) {
|
||||
bld.scratch(aco_opcode::scratch_store_dword, Operand(v1), ctx.scratch_rsrc, temp, offset,
|
||||
memory_sync_info(storage_vgpr_spill, semantic_private));
|
||||
} else {
|
||||
Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc, Operand(v1),
|
||||
ctx.program->scratch_offset, temp, offset, false, true);
|
||||
|
@ -1517,12 +1541,21 @@ reload_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& ins
|
|||
for (unsigned i = 0; i < def.size(); i++, offset += 4) {
|
||||
Temp tmp = bld.tmp(v1);
|
||||
vec->operands[i] = Operand(tmp);
|
||||
Instruction* instr =
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, Definition(tmp), ctx.scratch_rsrc, Operand(v1),
|
||||
ctx.program->scratch_offset, offset, false, true);
|
||||
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
|
||||
if (ctx.program->gfx_level >= GFX9) {
|
||||
bld.scratch(aco_opcode::scratch_load_dword, Definition(tmp), Operand(v1),
|
||||
ctx.scratch_rsrc, offset,
|
||||
memory_sync_info(storage_vgpr_spill, semantic_private));
|
||||
} else {
|
||||
Instruction* instr =
|
||||
bld.mubuf(aco_opcode::buffer_load_dword, Definition(tmp), ctx.scratch_rsrc,
|
||||
Operand(v1), ctx.program->scratch_offset, offset, false, true);
|
||||
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
|
||||
}
|
||||
}
|
||||
bld.insert(vec);
|
||||
} else if (ctx.program->gfx_level >= GFX9) {
|
||||
bld.scratch(aco_opcode::scratch_load_dword, def, Operand(v1), ctx.scratch_rsrc, offset,
|
||||
memory_sync_info(storage_vgpr_spill, semantic_private));
|
||||
} else {
|
||||
Instruction* instr = bld.mubuf(aco_opcode::buffer_load_dword, def, ctx.scratch_rsrc,
|
||||
Operand(v1), ctx.program->scratch_offset, offset, false, true);
|
||||
|
@ -1907,7 +1940,10 @@ spill(Program* program, live& live_vars)
|
|||
}
|
||||
/* add extra SGPRs required for spilling VGPRs */
|
||||
if (demand.vgpr + extra_vgprs > vgpr_limit) {
|
||||
extra_sgprs = 5; /* scratch_resource (s4) + scratch_offset (s1) */
|
||||
if (program->gfx_level >= GFX9)
|
||||
extra_sgprs = 1; /* SADDR */
|
||||
else
|
||||
extra_sgprs = 5; /* scratch_resource (s4) + scratch_offset (s1) */
|
||||
if (demand.sgpr + extra_sgprs > sgpr_limit) {
|
||||
/* re-calculate in case something has changed */
|
||||
unsigned sgpr_spills = demand.sgpr + extra_sgprs - sgpr_limit;
|
||||
|
|
Loading…
Reference in New Issue