aco: increase accuracy of SGPR limits
SGPRs are allocated in groups of 16 on GFX8/GFX9. GFX10 allocates a fixed number of SGPRs and has 106 addressable SGPRs. pipeline-db (Vega): SGPRS: 5912 -> 6232 (5.41 %) VGPRS: 1772 -> 1780 (0.45 %) Spilled SGPRs: 0 -> 0 (0.00 %) Spilled VGPRs: 0 -> 0 (0.00 %) Private memory VGPRs: 0 -> 0 (0.00 %) Scratch size: 0 -> 0 (0.00 %) dwords per thread Code Size: 88228 -> 87904 (-0.37 %) bytes LDS: 0 -> 0 (0.00 %) blocks Max Waves: 559 -> 571 (2.15 %) pipeline-db (Navi): SGPRS: 341256 -> 363384 (6.48 %) VGPRS: 171536 -> 170960 (-0.34 %) Spilled SGPRs: 832 -> 581 (-30.17 %) Spilled VGPRs: 0 -> 0 (0.00 %) Private memory VGPRs: 0 -> 0 (0.00 %) Scratch size: 0 -> 0 (0.00 %) dwords per thread Code Size: 14207332 -> 14190872 (-0.12 %) bytes LDS: 33 -> 33 (0.00 %) blocks Max Waves: 18072 -> 18251 (0.99 %) v2: unconditionally count vcc as an extra sgpr on GFX10+ v3: pass SGPRs rounded to 8 Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
This commit is contained in:
parent
7453c1adff
commit
08d510010b
|
@ -1254,9 +1254,25 @@ setup_isel_context(Program* program,
|
|||
program->chip_class = options->chip_class;
|
||||
program->family = options->family;
|
||||
program->wave_size = options->wave_size;
|
||||
program->sgpr_limit = options->chip_class >= GFX8 ? 102 : 104;
|
||||
if (options->family == CHIP_TONGA || options->family == CHIP_ICELAND)
|
||||
program->sgpr_limit = 94; /* workaround hardware bug */
|
||||
|
||||
/* Per-generation SGPR file parameters:
 *   physical_sgprs     - size of the SGPR file that is divided between waves
 *   sgpr_alloc_granule - allocation granularity minus one (power of two minus one)
 *   sgpr_limit         - highest number of SGPRs a shader may address
 */
if (options->chip_class >= GFX10) {
   program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */
   program->sgpr_alloc_granule = 127;
   program->sgpr_limit = 106;
} else if (program->chip_class >= GFX8) {
   program->physical_sgprs = 800;
   program->sgpr_alloc_granule = 15;
   /* Tonga and Iceland are GFX8 parts, so the workaround has to be applied
    * in this branch; in the pre-GFX8 branch it could never be reached and
    * these chips would wrongly get the regular limit of 102. */
   if (options->family == CHIP_TONGA || options->family == CHIP_ICELAND)
      program->sgpr_limit = 94; /* workaround hardware bug */
   else
      program->sgpr_limit = 102;
} else {
   program->physical_sgprs = 512;
   program->sgpr_alloc_granule = 7;
   program->sgpr_limit = 104;
}
|
||||
/* TODO: we don't have to allocate VCC if we don't need it */
|
||||
program->needs_vcc = true;
|
||||
|
||||
for (unsigned i = 0; i < MAX_SETS; ++i)
|
||||
program->info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
|
||||
|
|
|
@ -1062,7 +1062,6 @@ class Program final {
|
|||
public:
|
||||
std::vector<Block> blocks;
|
||||
RegisterDemand max_reg_demand = RegisterDemand();
|
||||
uint16_t sgpr_limit = 0;
|
||||
uint16_t num_waves = 0;
|
||||
ac_shader_config* config;
|
||||
struct radv_shader_info *info;
|
||||
|
@ -1076,6 +1075,13 @@ public:
|
|||
|
||||
std::vector<uint8_t> constant_data;
|
||||
|
||||
uint16_t physical_sgprs;
|
||||
uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
|
||||
uint16_t sgpr_limit;
|
||||
bool needs_vcc = false;
|
||||
bool needs_xnack_mask = false;
|
||||
bool needs_flat_scr = false;
|
||||
|
||||
uint32_t allocateId()
|
||||
{
|
||||
assert(allocationID <= 16777215);
|
||||
|
@ -1154,6 +1160,15 @@ void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
|
|||
void aco_print_instr(Instruction *instr, FILE *output);
|
||||
void aco_print_program(Program *program, FILE *output);
|
||||
|
||||
/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */
|
||||
uint16_t get_extra_sgprs(Program *program);
|
||||
|
||||
/* get the number of sgprs that must be allocated in order to address the given number of sgprs */
|
||||
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs);
|
||||
|
||||
/* return number of addressable SGPRs for max_waves */
|
||||
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves);
|
||||
|
||||
typedef struct {
|
||||
const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
|
||||
const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
*/
|
||||
|
||||
#include "aco_ir.h"
|
||||
#include "util/u_math.h"
|
||||
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
@ -190,25 +191,62 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
|
|||
}
|
||||
} /* end namespace */
|
||||
|
||||
/* Returns how many SGPRs have to be allocated on top of the ones the shader
 * addresses directly. The count depends on the chip generation and on which
 * of needs_vcc / needs_xnack_mask / needs_flat_scr are set:
 *   GFX10+   : always 2 (flat_scr and xnack_mask must not be requested)
 *   GFX8-GFX9: 6 with flat_scr, else 4 with xnack_mask, else 2 with vcc, else 0
 *   older    : 4 with flat_scr, else 2 with vcc, else 0 (xnack_mask must not be requested)
 */
uint16_t get_extra_sgprs(Program *program)
{
   const bool wants_flat_scr = program->needs_flat_scr;
   const bool wants_xnack_mask = program->needs_xnack_mask;

   if (program->chip_class >= GFX10) {
      assert(!wants_flat_scr);
      assert(!wants_xnack_mask);
      return 2;
   }

   if (program->chip_class >= GFX8) {
      if (wants_flat_scr)
         return 6;
      if (wants_xnack_mask)
         return 4;
      return program->needs_vcc ? 2 : 0;
   }

   /* generations before GFX8 */
   assert(!wants_xnack_mask);
   if (wants_flat_scr)
      return 4;
   return program->needs_vcc ? 2 : 0;
}
|
||||
|
||||
/* Returns the number of SGPRs that has to be allocated so that
 * `addressable_sgprs` registers can be addressed: the extra SGPRs reported by
 * get_extra_sgprs() are added, and the sum is rounded up to a whole multiple
 * of the allocation granule, with one granule as the minimum.
 *
 * `addressable_sgprs` must not exceed program->sgpr_limit. */
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs)
{
   assert(addressable_sgprs <= program->sgpr_limit);

   /* sgpr_alloc_granule is stored minus one */
   const uint16_t alloc_unit = program->sgpr_alloc_granule + 1;
   const uint16_t required = addressable_sgprs + get_extra_sgprs(program);
   const uint16_t at_least_one_unit = required < alloc_unit ? alloc_unit : required;

   return align(at_least_one_unit, alloc_unit);
}
|
||||
|
||||
/* Returns how many SGPRs a shader may address when `max_waves` waves share
 * the SGPR file. `max_waves` must be non-zero. */
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves)
{
   /* per-wave share of the SGPR file, rounded down to the allocation granule
    * (sgpr_alloc_granule is stored minus one, so it doubles as the mask) */
   const uint16_t per_wave = (program->physical_sgprs / max_waves) & ~program->sgpr_alloc_granule;

   /* the extra SGPRs are part of the allocation but not of the addressable range */
   const uint16_t addressable = per_wave - get_extra_sgprs(program);

   return std::min(addressable, program->sgpr_limit);
}
|
||||
|
||||
/* Recomputes program->num_waves and program->max_reg_demand for the register
 * demand `new_demand`.
 *
 * If the demand exceeds 256 VGPRs or program->sgpr_limit SGPRs, num_waves is
 * set to 0 and max_reg_demand to `new_demand`; register pressure has to be
 * reduced before the program can be compiled.
 *
 * Otherwise num_waves becomes the smallest of the wave counts allowed by the
 * SGPR allocation, by the VGPR allocation and the cap of 10, and
 * max_reg_demand becomes the per-wave register budget at that wave count. */
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
{
   // TODO: also take shared mem into account
   /* VGPRs are allocated in chunks of 4 */
   const int16_t vgpr_alloc = std::max<int16_t>(4, (new_demand.vgpr + 3) & ~3);

   /* this won't compile, register pressure reduction necessary */
   if (new_demand.vgpr > 256 || new_demand.sgpr > program->sgpr_limit) {
      program->num_waves = 0;
      program->max_reg_demand = new_demand;
   } else {
      /* waves limited by the SGPR file, then by the VGPR file, then by the cap of 10 */
      program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
      program->num_waves = std::min<uint16_t>(program->num_waves, 256 / vgpr_alloc);
      program->num_waves = std::min<uint16_t>(program->num_waves, 10);

      program->max_reg_demand.vgpr = int16_t((256 / program->num_waves) & ~3);
      program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
   }
}
|
||||
|
||||
|
|
|
@ -34,6 +34,7 @@
|
|||
|
||||
#include "aco_ir.h"
|
||||
#include "sid.h"
|
||||
#include "util/u_math.h"
|
||||
|
||||
namespace aco {
|
||||
namespace {
|
||||
|
@ -1914,12 +1915,11 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
|
|||
}
|
||||
|
||||
/* num_gpr = rnd_up(max_used_gpr + 1) */
|
||||
program->config->num_vgprs = (ctx.max_used_vgpr + 1 + 3) & ~3;
|
||||
if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) {
|
||||
assert(ctx.max_used_sgpr <= 93);
|
||||
ctx.max_used_sgpr = 93; /* workaround hardware bug */
|
||||
}
|
||||
program->config->num_sgprs = (ctx.max_used_sgpr + 1 + 2 + 7) & ~7; /* + 2 sgprs for vcc */
|
||||
program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4);
|
||||
if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */
|
||||
program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit);
|
||||
else
|
||||
program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -806,9 +806,9 @@ void schedule_program(Program *program, live& live_vars)
|
|||
//TODO: this also increases window-size/max-moves? did I realize that at the time?
|
||||
ctx.num_waves = std::min<uint16_t>(program->num_waves, 5);
|
||||
assert(ctx.num_waves);
|
||||
uint16_t total_sgpr_regs = program->chip_class >= GFX8 ? 800 : 512;
|
||||
uint16_t total_sgpr_regs = program->physical_sgprs;
|
||||
uint16_t max_addressible_sgpr = program->sgpr_limit;
|
||||
ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), std::min<int16_t>(((total_sgpr_regs / ctx.num_waves) & ~7) - 2, max_addressible_sgpr)};
|
||||
ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), std::min<int16_t>(((total_sgpr_regs / ctx.num_waves) & ~program->sgpr_alloc_granule) - 2, max_addressible_sgpr)};
|
||||
|
||||
for (Block& block : program->blocks)
|
||||
schedule_block(ctx, program, &block, live_vars);
|
||||
|
|
|
@ -1568,8 +1568,6 @@ void spill(Program* program, live& live_vars, const struct radv_nir_compiler_opt
|
|||
return;
|
||||
|
||||
/* else, we check if we can improve things a bit */
|
||||
uint16_t total_sgpr_regs = options->chip_class >= GFX8 ? 800 : 512;
|
||||
uint16_t max_addressible_sgpr = program->sgpr_limit;
|
||||
|
||||
/* calculate target register demand */
|
||||
RegisterDemand max_reg_demand;
|
||||
|
@ -1577,14 +1575,14 @@ void spill(Program* program, live& live_vars, const struct radv_nir_compiler_opt
|
|||
max_reg_demand.update(block.register_demand);
|
||||
}
|
||||
|
||||
RegisterDemand target_pressure = {256, int16_t(max_addressible_sgpr)};
|
||||
RegisterDemand target_pressure = {256, int16_t(program->sgpr_limit)};
|
||||
unsigned num_waves = 1;
|
||||
int spills_to_vgpr = (max_reg_demand.sgpr - max_addressible_sgpr + 63) / 64;
|
||||
int spills_to_vgpr = (max_reg_demand.sgpr - program->sgpr_limit + 63) / 64;
|
||||
|
||||
/* test if it possible to increase occupancy with little spilling */
|
||||
for (unsigned num_waves_next = 2; num_waves_next <= 8; num_waves_next++) {
|
||||
RegisterDemand target_pressure_next = {int16_t((256 / num_waves_next) & ~3),
|
||||
int16_t(std::min<uint16_t>(((total_sgpr_regs / num_waves_next) & ~7) - 2, max_addressible_sgpr))};
|
||||
int16_t(get_addr_sgpr_from_waves(program, num_waves_next))};
|
||||
|
||||
/* Currently no vgpr spilling supported.
|
||||
* Spill as many sgprs as necessary to not hinder occupancy */
|
||||
|
|
Loading…
Reference in New Issue