broadcom/compiler: change register allocation policy for accumulators

The current policy is to always favor accumulators if possible; however,
this is not always optimal.

Particularly, accumulators play a crucial role in enabling QPU instruction
merges, since these are limited to both the ADD and the MUL instructions
addressing at most 2 physical registers. For 2-src instructions, this means
that to be able to merge we need them to address at least 2 accumulators.

While favoring accumulators does help the case for instruction merges in
general, it is risky to assign accumulators to variables that have
long life spans. Doing so will make the accumulator unavailable for
any other instructions during that life span, and since we only have a few
accumulators, we can quickly run out and lose our capacity to merge
instructions for large parts of the qpu program.

On the other hand, we also want to avoid the extreme case where we keep
allocating physical registers to the point we run out, even if we have
accumulators available, since accumulators have additional restrictions
and may not be suitable for everything.

This change continues the policy of favoring accumulators, but it only
does so if the life span of the temps is short, to ensure that we can
recycle accumulators often across instructions and avoid running out
for sections of the QPU code, unless we are already running out of
physical registers.

total instructions in shared programs: 13654647 -> 13336921 (-2.33%)
instructions in affected programs: 11015919 -> 10698193 (-2.88%)
helped: 39758
HURT: 17325
Instructions are helped.

total threads in shared programs: 412046 -> 412038 (<.01%)
threads in affected programs: 16 -> 8 (-50.00%)
helped: 0
HURT: 4
Threads are HURT.

total uniforms in shared programs: 3745726 -> 3746003 (<.01%)
uniforms in affected programs: 17296 -> 17573 (1.60%)
helped: 76
HURT: 99
Uniforms are HURT.

total max-temps in shared programs: 2364430 -> 2359942 (-0.19%)
max-temps in affected programs: 109117 -> 104629 (-4.11%)
helped: 2893
HURT: 772
Max-temps are helped.

total spills in shared programs: 5727 -> 5746 (0.33%)
spills in affected programs: 221 -> 240 (8.60%)
helped: 1
HURT: 2

total fills in shared programs: 13121 -> 13139 (0.14%)
fills in affected programs: 466 -> 484 (3.86%)
helped: 1
HURT: 2

total sfu-stalls in shared programs: 33432 -> 34491 (3.17%)
sfu-stalls in affected programs: 18219 -> 19278 (5.81%)
helped: 4459
HURT: 5087
Inconclusive result

total inst-and-stalls in shared programs: 13688079 -> 13371412 (-2.31%)
inst-and-stalls in affected programs: 11030017 -> 10713350 (-2.87%)
helped: 39630
HURT: 17429
Inst-and-stalls are helped.

total nops in shared programs: 335753 -> 333708 (-0.61%)
nops in affected programs: 112659 -> 110614 (-1.82%)
helped: 8726
HURT: 7383
Inconclusive result

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10686>
This commit is contained in:
Iago Toral Quiroga 2021-05-06 11:32:46 +02:00
parent 24043d215f
commit d81a6e5f1d
1 changed files with 100 additions and 27 deletions

View File

@ -372,11 +372,97 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
c->disable_ldunif_opt = had_disable_ldunif_opt;
}
/* One entry of the per-node table consulted during register selection:
 * v3d_ra_select_callback() reads map[n].priority for the RA node n it is
 * asked to place.
 */
struct node_to_temp_map {
/* Presumably the index of the temp this node stands for -- TODO confirm
 * against the code that fills in the map.
 */
uint32_t temp;
/* Value passed to v3d_ra_favor_accum(); that helper's comment describes
 * it as representing liveness, and it compares it against a threshold.
 */
uint32_t priority;
};
/* State carried between invocations of the register-selection helpers
 * (v3d_ra_select_accum(), v3d_ra_select_rf(), v3d_ra_favor_accum()).
 */
struct v3d_ra_select_callback_data {
/* Offset, relative to ACC_INDEX, at which the next accumulator search
 * starts. v3d_ra_select_accum() sets it to one past the offset it just
 * handed out; it is reduced modulo ACC_COUNT on use.
 */
uint32_t next_acc;
/* Same rotating start offset for physical registers, relative to
 * PHYS_INDEX and reduced modulo PHYS_COUNT on use.
 */
uint32_t next_phys;
/* Table indexed by RA node number; v3d_ra_select_callback() reads the
 * node's priority from it.
 */
struct node_to_temp_map *map;
};
/* Decide whether the node being placed should try an accumulator first.
 *
 * Accumulators improve the chances of merging QPU instructions, because a
 * merge requires that the add and mul instructions together use at most 2
 * rf registers.
 *
 * v3d_ra:   selection state (not consulted by this helper).
 * regs:     bitset of the registers the node may be assigned.
 * priority: the node's priority, which represents its liveness.
 *
 * Returns true when an accumulator should be preferred, false when a
 * physical register should be tried first.
 */
static bool
v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
                   BITSET_WORD *regs,
                   int priority)
{
        /* With fewer than this many physical registers left we always
         * favor accumulators: they carry extra restrictions (such as being
         * invalidated through thrsw), so exhausting the physical registers
         * while accumulators remain can still make allocation fail.
         */
        static const int min_free_rf = 5;

        /* Temps at or below this priority count as short-lived. Keeping
         * long-lived temps out of the accumulators stops them from hogging
         * one across large parts of the shader, where follow-up
         * instructions could then no longer use accumulators and merge.
         */
        static const int max_short_lived_priority = 20;

        /* Count free physical registers, stopping once we know there are
         * enough of them.
         */
        int free_rf = 0;
        for (int rf = 0; rf < PHYS_COUNT && free_rf < min_free_rf; rf++) {
                if (BITSET_TEST(regs, PHYS_INDEX + rf))
                        free_rf++;
        }

        return free_rf < min_free_rf ||
               priority <= max_short_lived_priority;
}
/* Pick an available accumulator for the node being placed.
 *
 * The scan starts at v3d_ra->next_acc and wraps around, so successive
 * calls rotate through the accumulators; this round-robin gives post-RA
 * instruction selection more options.
 *
 * On success stores the chosen register in *out, advances
 * v3d_ra->next_acc past it and returns true. Returns false, leaving *out
 * and the selection state untouched, when regs contains no accumulator.
 */
static bool
v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
                    BITSET_WORD *regs,
                    unsigned int *out)
{
        for (int step = 0; step < ACC_COUNT; step++) {
                const int offset = (v3d_ra->next_acc + step) % ACC_COUNT;
                const int candidate = ACC_INDEX + offset;

                if (!BITSET_TEST(regs, candidate))
                        continue;

                /* May equal ACC_COUNT; the modulo above wraps it on the
                 * next call.
                 */
                v3d_ra->next_acc = offset + 1;
                *out = candidate;
                return true;
        }

        return false;
}
/* Pick an available physical register for the node being placed.
 *
 * The scan starts at v3d_ra->next_phys and wraps around, so successive
 * calls rotate through the physical registers.
 *
 * On success stores the chosen register in *out, advances
 * v3d_ra->next_phys past it and returns true. Returns false, leaving *out
 * and the selection state untouched, when regs contains no physical
 * register.
 */
static bool
v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
                 BITSET_WORD *regs,
                 unsigned int *out)
{
        for (int step = 0; step < PHYS_COUNT; step++) {
                const int offset = (v3d_ra->next_phys + step) % PHYS_COUNT;
                const int candidate = PHYS_INDEX + offset;

                if (!BITSET_TEST(regs, candidate))
                        continue;

                /* May equal PHYS_COUNT; the modulo above wraps it on the
                 * next call.
                 */
                v3d_ra->next_phys = offset + 1;
                *out = candidate;
                return true;
        }

        return false;
}
static unsigned int
v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
{
@ -390,29 +476,20 @@ v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
if (BITSET_TEST(regs, r5))
return r5;
/* Choose an accumulator if possible (I think it's lower power than
* phys regs), but round-robin through them to give post-RA
* instruction selection more options.
unsigned int reg;
if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->map[n].priority) &&
v3d_ra_select_accum(v3d_ra, regs, &reg)) {
return reg;
}
if (v3d_ra_select_rf(v3d_ra, regs, &reg))
return reg;
/* If we ran out of physical registers try to assign an accumulator
* if we didn't favor that option earlier.
*/
for (int i = 0; i < ACC_COUNT; i++) {
int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
int acc = ACC_INDEX + acc_off;
if (BITSET_TEST(regs, acc)) {
v3d_ra->next_acc = acc_off + 1;
return acc;
}
}
for (int i = 0; i < PHYS_COUNT; i++) {
int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
int phys = PHYS_INDEX + phys_off;
if (BITSET_TEST(regs, phys)) {
v3d_ra->next_phys = phys_off + 1;
return phys;
}
}
if (v3d_ra_select_accum(v3d_ra, regs, &reg))
return reg;
unreachable("RA must pass us at least one possible reg.");
}
@ -472,11 +549,6 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
return true;
}
struct node_to_temp_map {
uint32_t temp;
uint32_t priority;
};
static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
@ -542,6 +614,7 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
* RF0-2.
*/
.next_phys = 3,
.map = map,
};
*spilled = false;