vc4: Prioritize allocating accumulators to short-lived values.

The register allocator walks from the end of the nodes array looking for
trivially-allocatable things to put on the stack, meaning (assuming
everything is trivially colorable and gets put on the stack in a single
pass) the low node numbers get allocated first.  The things allocated
first happen to get the lower-numbered registers, which is to say the fast
accumulators that can be paired more easily.

When we previously made the nodes match the temporary register numbers,
we'd end up putting the shader inputs (VS or FS) in the accumulators,
which are often long-lived values.  By prioritizing the shortest-lived
values for allocation, we can get a lot more instructions that involve
accumulators, and thus fewer conflicts for raddr and WS.

total instructions in shared programs: 52870 -> 46428 (-12.18%)
instructions in affected programs:     52260 -> 45818 (-12.33%)
This commit is contained in:
Eric Anholt 2014-12-08 17:43:29 -08:00
parent 0d4272cd8e
commit ab1b1fa6fb
1 changed file with 79 additions and 34 deletions

View File

@ -139,6 +139,20 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
ra_set_finalize(vc4->regs, NULL);
}
/* Pairing of a temporary register index with its allocation priority
 * (the length of its live range, use[i] - def[i]).  Sorted so the
 * shortest-lived temps get the lowest node numbers and therefore the
 * accumulator registers.
 */
struct node_to_temp_map {
        uint32_t temp;      /* QFILE_TEMP index */
        uint32_t priority;  /* live-range length; smaller = allocate first */
};

/* qsort() comparator ordering entries by ascending priority.
 *
 * Note: returning "a->priority - b->priority" directly would be wrong:
 * both fields are uint32_t, so the subtraction wraps modulo 2^32 and
 * the conversion of an out-of-range unsigned result to int is
 * implementation-defined, which can misorder entries with large
 * priority differences.  Compare explicitly instead.
 */
static int
node_to_temp_priority(const void *in_a, const void *in_b)
{
        const struct node_to_temp_map *a = in_a;
        const struct node_to_temp_map *b = in_b;

        if (a->priority < b->priority)
                return -1;
        if (a->priority > b->priority)
                return 1;
        return 0;
}
/**
* Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
*
@ -148,6 +162,8 @@ struct qpu_reg *
vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
{
struct simple_node *node;
struct node_to_temp_map map[c->num_temps];
uint32_t temp_to_node[c->num_temps];
uint32_t def[c->num_temps];
uint32_t use[c->num_temps];
struct qpu_reg *temp_registers = calloc(c->num_temps,
@ -166,11 +182,11 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
struct ra_graph *g = ra_alloc_interference_graph(vc4->regs,
c->num_temps);
for (uint32_t i = 0; i < c->num_temps; i++)
for (uint32_t i = 0; i < c->num_temps; i++) {
ra_set_node_class(g, i, vc4->reg_class_any);
}
/* Compute the live ranges so we can figure out interference, and
* figure out our register classes and preallocated registers.
/* Compute the live ranges so we can figure out interference.
*/
uint32_t ip = 0;
foreach(node, &c->instructions) {
@ -188,37 +204,12 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
switch (inst->op) {
case QOP_FRAG_Z:
def[inst->dst.index] = 0;
ra_set_node_reg(g, inst->dst.index,
AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2 + 1);
break;
case QOP_FRAG_W:
/* The payload registers have values implicitly loaded
* at the start of the program.
*/
def[inst->dst.index] = 0;
ra_set_node_reg(g, inst->dst.index,
AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2);
break;
case QOP_TEX_RESULT:
case QOP_TLB_COLOR_READ:
assert(vc4_regs[ACC_INDEX + 4].mux == QPU_MUX_R4);
ra_set_node_reg(g, inst->dst.index,
ACC_INDEX + 4);
break;
case QOP_PACK_SCALED:
/* The pack flags require an A-file dst register. */
ra_set_node_class(g, inst->dst.index, vc4->reg_class_a);
break;
case QOP_UNPACK_8A:
case QOP_UNPACK_8B:
case QOP_UNPACK_8C:
case QOP_UNPACK_8D:
/* The unpack flags require an A-file src register. */
ra_set_node_class(g, inst->src[0].index, vc4->reg_class_a);
break;
default:
break;
}
@ -226,10 +217,64 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
ip++;
}
for (uint32_t i = 0; i < c->num_temps; i++) {
map[i].temp = i;
map[i].priority = use[i] - def[i];
}
qsort(map, c->num_temps, sizeof(map[0]), node_to_temp_priority);
for (uint32_t i = 0; i < c->num_temps; i++) {
temp_to_node[map[i].temp] = i;
}
/* Figure out our register classes and preallocated registers. */
foreach(node, &c->instructions) {
struct qinst *inst = (struct qinst *)node;
switch (inst->op) {
case QOP_FRAG_Z:
ra_set_node_reg(g, temp_to_node[inst->dst.index],
AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2 + 1);
break;
case QOP_FRAG_W:
ra_set_node_reg(g, temp_to_node[inst->dst.index],
AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2);
break;
case QOP_TEX_RESULT:
case QOP_TLB_COLOR_READ:
assert(vc4_regs[ACC_INDEX + 4].mux == QPU_MUX_R4);
ra_set_node_reg(g, temp_to_node[inst->dst.index],
ACC_INDEX + 4);
break;
case QOP_PACK_SCALED:
/* The pack flags require an A-file dst register. */
ra_set_node_class(g, temp_to_node[inst->dst.index],
vc4->reg_class_a);
break;
case QOP_UNPACK_8A:
case QOP_UNPACK_8B:
case QOP_UNPACK_8C:
case QOP_UNPACK_8D:
/* The unpack flags require an A-file src register. */
ra_set_node_class(g, temp_to_node[inst->src[0].index],
vc4->reg_class_a);
break;
default:
break;
}
}
for (uint32_t i = 0; i < c->num_temps; i++) {
for (uint32_t j = i + 1; j < c->num_temps; j++) {
if (!(def[i] >= use[j] || def[j] >= use[i]))
ra_add_node_interference(g, i, j);
if (!(def[i] >= use[j] || def[j] >= use[i])) {
ra_add_node_interference(g,
temp_to_node[i],
temp_to_node[j]);
}
}
}
@ -237,7 +282,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
assert(ok);
for (uint32_t i = 0; i < c->num_temps; i++) {
temp_registers[i] = vc4_regs[ra_get_node_reg(g, i)];
temp_registers[i] = vc4_regs[ra_get_node_reg(g, temp_to_node[i])];
/* If the value's never used, just write to the NOP register
* for clarity in debug output.