aco: refactor GPR limit calculation
This patch delays the calculation of GPR limits in order to precisely incorporate extra registers (VCC etc.) and shared VGPRs. Additionally, the allocation granularity is used to set the config. This has some effect on the reported SGPR stats. Totals (Navi10): SGPRs: 6971787 -> 17753642 (+154.65%) Reviewed-by: Rhys Perry <pendingchaos02@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8921>
This commit is contained in:
parent
eaf681724e
commit
b98a4d4dd7
|
@ -1180,8 +1180,6 @@ setup_isel_context(Program* program,
|
|||
}
|
||||
|
||||
calc_min_waves(program);
|
||||
program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
|
||||
program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
|
||||
|
||||
unsigned scratch_size = 0;
|
||||
if (program->stage == gs_copy_vs) {
|
||||
|
|
|
@ -115,10 +115,8 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
|
|||
program->physical_sgprs = 800;
|
||||
program->sgpr_alloc_granule = 16;
|
||||
program->sgpr_limit = 102;
|
||||
if (family == CHIP_TONGA || family == CHIP_ICELAND) {
|
||||
program->sgpr_alloc_granule = 96;
|
||||
program->sgpr_limit = 94; /* workaround hardware bug */
|
||||
}
|
||||
if (family == CHIP_TONGA || family == CHIP_ICELAND)
|
||||
program->sgpr_alloc_granule = 96; /* workaround hardware bug */
|
||||
} else {
|
||||
program->physical_sgprs = 512;
|
||||
program->sgpr_alloc_granule = 8;
|
||||
|
|
|
@ -285,17 +285,23 @@ uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs)
|
|||
return align(std::max(addressable_vgprs, granule), granule);
|
||||
}
|
||||
|
||||
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves)
|
||||
unsigned round_down(unsigned a, unsigned b)
|
||||
{
|
||||
uint16_t sgprs = (program->physical_sgprs / max_waves) - program->sgpr_alloc_granule + 1;
|
||||
sgprs = get_sgpr_alloc(program, sgprs);
|
||||
return a - (a % b);
|
||||
}
|
||||
|
||||
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves)
|
||||
{
|
||||
/* it's not possible to allocate more than 128 SGPRs */
|
||||
uint16_t sgprs = std::min(program->physical_sgprs / waves, 128);
|
||||
sgprs = round_down(sgprs, program->sgpr_alloc_granule);
|
||||
sgprs -= get_extra_sgprs(program);
|
||||
return std::min(sgprs, program->sgpr_limit);
|
||||
}
|
||||
|
||||
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves)
|
||||
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t waves)
|
||||
{
|
||||
uint16_t vgprs = program->physical_vgprs / max_waves & ~(program->vgpr_alloc_granule - 1);
|
||||
uint16_t vgprs = program->physical_vgprs / waves & ~(program->vgpr_alloc_granule - 1);
|
||||
return std::min(vgprs, program->vgpr_limit);
|
||||
}
|
||||
|
||||
|
@ -326,8 +332,12 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
|
|||
unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu;
|
||||
unsigned lds_limit = wgp ? program->lds_limit * 2 : program->lds_limit;
|
||||
|
||||
assert(program->min_waves >= 1);
|
||||
uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
|
||||
uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
|
||||
|
||||
/* this won't compile, register pressure reduction necessary */
|
||||
if (new_demand.vgpr > program->vgpr_limit || new_demand.sgpr > program->sgpr_limit) {
|
||||
if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) {
|
||||
program->num_waves = 0;
|
||||
program->max_reg_demand = new_demand;
|
||||
} else {
|
||||
|
|
|
@ -73,8 +73,10 @@ struct ra_ctx {
|
|||
std::unordered_map<unsigned, Instruction*> vectors;
|
||||
std::unordered_map<unsigned, Instruction*> split_vectors;
|
||||
aco_ptr<Instruction> pseudo_dummy;
|
||||
unsigned max_used_sgpr = 0;
|
||||
unsigned max_used_vgpr = 0;
|
||||
uint16_t max_used_sgpr = 0;
|
||||
uint16_t max_used_vgpr = 0;
|
||||
uint16_t sgpr_limit;
|
||||
uint16_t vgpr_limit;
|
||||
std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
|
||||
|
||||
ra_test_policy policy;
|
||||
|
@ -89,6 +91,8 @@ struct ra_ctx {
|
|||
policy(policy_)
|
||||
{
|
||||
pseudo_dummy.reset(create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
|
||||
sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
|
||||
vgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -650,14 +654,14 @@ void add_subdword_definition(Program *program, aco_ptr<Instruction>& instr, unsi
|
|||
|
||||
void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
|
||||
{
|
||||
unsigned max_addressible_sgpr = ctx.program->sgpr_limit;
|
||||
uint16_t max_addressible_sgpr = ctx.sgpr_limit;
|
||||
unsigned size = rc.size();
|
||||
if (rc.type() == RegType::vgpr) {
|
||||
assert(reg >= 256);
|
||||
unsigned hi = reg - 256 + size - 1;
|
||||
uint16_t hi = reg - 256 + size - 1;
|
||||
ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
|
||||
} else if (reg + rc.size() <= max_addressible_sgpr) {
|
||||
unsigned hi = reg + size - 1;
|
||||
uint16_t hi = reg + size - 1;
|
||||
ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
|
||||
}
|
||||
}
|
||||
|
@ -1241,11 +1245,9 @@ bool get_reg_specified(ra_ctx& ctx,
|
|||
}
|
||||
|
||||
bool increase_register_file(ra_ctx& ctx, RegType type) {
|
||||
uint16_t max_addressible_sgpr = ctx.program->sgpr_limit;
|
||||
uint16_t max_addressible_vgpr = ctx.program->vgpr_limit;
|
||||
if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
|
||||
if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) {
|
||||
update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr));
|
||||
} else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
|
||||
} else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
|
||||
update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1));
|
||||
} else {
|
||||
return false;
|
||||
|
@ -2677,11 +2679,8 @@ void register_allocation(Program *program, std::vector<IDSet>& live_out_per_bloc
|
|||
}
|
||||
|
||||
/* num_gpr = rnd_up(max_used_gpr + 1) */
|
||||
program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4);
|
||||
if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */
|
||||
program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit);
|
||||
else
|
||||
program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8);
|
||||
program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1);
|
||||
program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1774,14 +1774,16 @@ void spill(Program* program, live& live_vars)
|
|||
|
||||
/* calculate target register demand */
|
||||
RegisterDemand register_target = program->max_reg_demand;
|
||||
if (register_target.sgpr > program->sgpr_limit)
|
||||
register_target.vgpr += (register_target.sgpr - program->sgpr_limit + program->wave_size - 1 + 32) / program->wave_size;
|
||||
register_target.sgpr = program->sgpr_limit;
|
||||
uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
|
||||
uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
|
||||
if (register_target.sgpr > sgpr_limit)
|
||||
register_target.vgpr += (register_target.sgpr - sgpr_limit + program->wave_size - 1 + 32) / program->wave_size;
|
||||
register_target.sgpr = sgpr_limit;
|
||||
|
||||
if (register_target.vgpr > program->vgpr_limit)
|
||||
register_target.sgpr = program->sgpr_limit - 5;
|
||||
if (register_target.vgpr > vgpr_limit)
|
||||
register_target.sgpr = sgpr_limit - 5;
|
||||
int spills_to_vgpr = (program->max_reg_demand.sgpr - register_target.sgpr + program->wave_size - 1 + 32) / program->wave_size;
|
||||
register_target.vgpr = program->vgpr_limit - spills_to_vgpr;
|
||||
register_target.vgpr = vgpr_limit - spills_to_vgpr;
|
||||
|
||||
/* initialize ctx */
|
||||
spill_ctx ctx(register_target, program, live_vars.register_demand);
|
||||
|
|
|
@ -679,6 +679,7 @@ bool validate_ra(Program *program) {
|
|||
bool err = false;
|
||||
aco::live live_vars = aco::live_var_analysis(program);
|
||||
std::vector<std::vector<Temp>> phi_sgpr_ops(program->blocks.size());
|
||||
uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->num_waves);
|
||||
|
||||
std::map<unsigned, Assignment> assignments;
|
||||
for (Block& block : program->blocks) {
|
||||
|
@ -704,7 +705,7 @@ bool validate_ra(Program *program) {
|
|||
if (assignments.count(op.tempId()) && assignments[op.tempId()].reg != op.physReg())
|
||||
err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an inconsistent register assignment with instruction", i);
|
||||
if ((op.getTemp().type() == RegType::vgpr && op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
|
||||
(op.getTemp().type() == RegType::sgpr && op.physReg() + op.size() > program->config->num_sgprs && op.physReg() < program->sgpr_limit))
|
||||
(op.getTemp().type() == RegType::sgpr && op.physReg() + op.size() > program->config->num_sgprs && op.physReg() < sgpr_limit))
|
||||
err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an out-of-bounds register assignment", i);
|
||||
if (op.physReg() == vcc && !program->needs_vcc)
|
||||
err |= ra_fail(program, loc, Location(), "Operand %d fixed to vcc but needs_vcc=false", i);
|
||||
|
@ -725,7 +726,7 @@ bool validate_ra(Program *program) {
|
|||
if (assignments[def.tempId()].defloc.block)
|
||||
err |= ra_fail(program, loc, assignments.at(def.tempId()).defloc, "Temporary %%%d also defined by instruction", def.tempId());
|
||||
if ((def.getTemp().type() == RegType::vgpr && def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
|
||||
(def.getTemp().type() == RegType::sgpr && def.physReg() + def.size() > program->config->num_sgprs && def.physReg() < program->sgpr_limit))
|
||||
(def.getTemp().type() == RegType::sgpr && def.physReg() + def.size() > program->config->num_sgprs && def.physReg() < sgpr_limit))
|
||||
err |= ra_fail(program, loc, assignments.at(def.tempId()).firstloc, "Definition %d has an out-of-bounds register assignment", i);
|
||||
if (def.physReg() == vcc && !program->needs_vcc)
|
||||
err |= ra_fail(program, loc, Location(), "Definition %d fixed to vcc but needs_vcc=false", i);
|
||||
|
|
|
@ -80,6 +80,8 @@ void create_program(enum chip_class chip_class, Stage stage, unsigned wave_size,
|
|||
|
||||
program.reset(new Program);
|
||||
aco::init_program(program.get(), stage, &info, chip_class, family, &config);
|
||||
program->workgroup_size = UINT_MAX;
|
||||
calc_min_waves(program.get());
|
||||
|
||||
program->debug.func = nullptr;
|
||||
program->debug.private_data = nullptr;
|
||||
|
|
Loading…
Reference in New Issue