aco: refactor GPR limit calculation

This patch delays the calculation of GPR limits in order to
precisely incorporate extra registers (VCC etc.) and shared VGPRs.

Additionally, the allocation granularity is used to set the config.
This has some effect on the reported SGPR stats.

Totals (Navi10):
SGPRs: 6971787 -> 17753642 (+154.65%)

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8921>
This commit is contained in:
Daniel Schürmann 2021-02-05 14:36:39 +01:00 committed by Marge Bot
parent eaf681724e
commit b98a4d4dd7
7 changed files with 44 additions and 34 deletions

View File

@ -1180,8 +1180,6 @@ setup_isel_context(Program* program,
}
calc_min_waves(program);
program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
unsigned scratch_size = 0;
if (program->stage == gs_copy_vs) {

View File

@ -115,10 +115,8 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
program->physical_sgprs = 800;
program->sgpr_alloc_granule = 16;
program->sgpr_limit = 102;
if (family == CHIP_TONGA || family == CHIP_ICELAND) {
program->sgpr_alloc_granule = 96;
program->sgpr_limit = 94; /* workaround hardware bug */
}
if (family == CHIP_TONGA || family == CHIP_ICELAND)
program->sgpr_alloc_granule = 96; /* workaround hardware bug */
} else {
program->physical_sgprs = 512;
program->sgpr_alloc_granule = 8;

View File

@ -285,17 +285,23 @@ uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs)
return align(std::max(addressable_vgprs, granule), granule);
}
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves)
unsigned round_down(unsigned a, unsigned b)
{
uint16_t sgprs = (program->physical_sgprs / max_waves) - program->sgpr_alloc_granule + 1;
sgprs = get_sgpr_alloc(program, sgprs);
return a - (a % b);
}
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves)
{
/* it's not possible to allocate more than 128 SGPRs */
uint16_t sgprs = std::min(program->physical_sgprs / waves, 128);
sgprs = round_down(sgprs, program->sgpr_alloc_granule);
sgprs -= get_extra_sgprs(program);
return std::min(sgprs, program->sgpr_limit);
}
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves)
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t waves)
{
uint16_t vgprs = program->physical_vgprs / max_waves & ~(program->vgpr_alloc_granule - 1);
uint16_t vgprs = program->physical_vgprs / waves & ~(program->vgpr_alloc_granule - 1);
return std::min(vgprs, program->vgpr_limit);
}
@ -326,8 +332,12 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu;
unsigned lds_limit = wgp ? program->lds_limit * 2 : program->lds_limit;
assert(program->min_waves >= 1);
uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
/* this won't compile, register pressure reduction necessary */
if (new_demand.vgpr > program->vgpr_limit || new_demand.sgpr > program->sgpr_limit) {
if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) {
program->num_waves = 0;
program->max_reg_demand = new_demand;
} else {

View File

@ -73,8 +73,10 @@ struct ra_ctx {
std::unordered_map<unsigned, Instruction*> vectors;
std::unordered_map<unsigned, Instruction*> split_vectors;
aco_ptr<Instruction> pseudo_dummy;
unsigned max_used_sgpr = 0;
unsigned max_used_vgpr = 0;
uint16_t max_used_sgpr = 0;
uint16_t max_used_vgpr = 0;
uint16_t sgpr_limit;
uint16_t vgpr_limit;
std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
ra_test_policy policy;
@ -89,6 +91,8 @@ struct ra_ctx {
policy(policy_)
{
pseudo_dummy.reset(create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
vgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
}
};
@ -650,14 +654,14 @@ void add_subdword_definition(Program *program, aco_ptr<Instruction>& instr, unsi
void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
{
unsigned max_addressible_sgpr = ctx.program->sgpr_limit;
uint16_t max_addressible_sgpr = ctx.sgpr_limit;
unsigned size = rc.size();
if (rc.type() == RegType::vgpr) {
assert(reg >= 256);
unsigned hi = reg - 256 + size - 1;
uint16_t hi = reg - 256 + size - 1;
ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
} else if (reg + rc.size() <= max_addressible_sgpr) {
unsigned hi = reg + size - 1;
uint16_t hi = reg + size - 1;
ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
}
}
@ -1241,11 +1245,9 @@ bool get_reg_specified(ra_ctx& ctx,
}
bool increase_register_file(ra_ctx& ctx, RegType type) {
uint16_t max_addressible_sgpr = ctx.program->sgpr_limit;
uint16_t max_addressible_vgpr = ctx.program->vgpr_limit;
if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) {
update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr));
} else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
} else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1));
} else {
return false;
@ -2677,11 +2679,8 @@ void register_allocation(Program *program, std::vector<IDSet>& live_out_per_bloc
}
/* num_gpr = rnd_up(max_used_gpr + 1) */
program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4);
if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */
program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit);
else
program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8);
program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1);
program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);
}
}

View File

@ -1774,14 +1774,16 @@ void spill(Program* program, live& live_vars)
/* calculate target register demand */
RegisterDemand register_target = program->max_reg_demand;
if (register_target.sgpr > program->sgpr_limit)
register_target.vgpr += (register_target.sgpr - program->sgpr_limit + program->wave_size - 1 + 32) / program->wave_size;
register_target.sgpr = program->sgpr_limit;
uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
if (register_target.sgpr > sgpr_limit)
register_target.vgpr += (register_target.sgpr - sgpr_limit + program->wave_size - 1 + 32) / program->wave_size;
register_target.sgpr = sgpr_limit;
if (register_target.vgpr > program->vgpr_limit)
register_target.sgpr = program->sgpr_limit - 5;
if (register_target.vgpr > vgpr_limit)
register_target.sgpr = sgpr_limit - 5;
int spills_to_vgpr = (program->max_reg_demand.sgpr - register_target.sgpr + program->wave_size - 1 + 32) / program->wave_size;
register_target.vgpr = program->vgpr_limit - spills_to_vgpr;
register_target.vgpr = vgpr_limit - spills_to_vgpr;
/* initialize ctx */
spill_ctx ctx(register_target, program, live_vars.register_demand);

View File

@ -679,6 +679,7 @@ bool validate_ra(Program *program) {
bool err = false;
aco::live live_vars = aco::live_var_analysis(program);
std::vector<std::vector<Temp>> phi_sgpr_ops(program->blocks.size());
uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->num_waves);
std::map<unsigned, Assignment> assignments;
for (Block& block : program->blocks) {
@ -704,7 +705,7 @@ bool validate_ra(Program *program) {
if (assignments.count(op.tempId()) && assignments[op.tempId()].reg != op.physReg())
err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an inconsistent register assignment with instruction", i);
if ((op.getTemp().type() == RegType::vgpr && op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
(op.getTemp().type() == RegType::sgpr && op.physReg() + op.size() > program->config->num_sgprs && op.physReg() < program->sgpr_limit))
(op.getTemp().type() == RegType::sgpr && op.physReg() + op.size() > program->config->num_sgprs && op.physReg() < sgpr_limit))
err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an out-of-bounds register assignment", i);
if (op.physReg() == vcc && !program->needs_vcc)
err |= ra_fail(program, loc, Location(), "Operand %d fixed to vcc but needs_vcc=false", i);
@ -725,7 +726,7 @@ bool validate_ra(Program *program) {
if (assignments[def.tempId()].defloc.block)
err |= ra_fail(program, loc, assignments.at(def.tempId()).defloc, "Temporary %%%d also defined by instruction", def.tempId());
if ((def.getTemp().type() == RegType::vgpr && def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
(def.getTemp().type() == RegType::sgpr && def.physReg() + def.size() > program->config->num_sgprs && def.physReg() < program->sgpr_limit))
(def.getTemp().type() == RegType::sgpr && def.physReg() + def.size() > program->config->num_sgprs && def.physReg() < sgpr_limit))
err |= ra_fail(program, loc, assignments.at(def.tempId()).firstloc, "Definition %d has an out-of-bounds register assignment", i);
if (def.physReg() == vcc && !program->needs_vcc)
err |= ra_fail(program, loc, Location(), "Definition %d fixed to vcc but needs_vcc=false", i);

View File

@ -80,6 +80,8 @@ void create_program(enum chip_class chip_class, Stage stage, unsigned wave_size,
program.reset(new Program);
aco::init_program(program.get(), stage, &info, chip_class, family, &config);
program->workgroup_size = UINT_MAX;
calc_min_waves(program.get());
program->debug.func = nullptr;
program->debug.private_data = nullptr;