util/ra: Improve the performance of ra_simplify
The most expensive part of register allocation is the ra_simplify step which is a fixed-point algorithm with a worst-case complexity of O(n^2) which adds the registers to a stack which we then use later to do the actual allocation. This commit uses bit sets and changes the core loop of ra_simplify to first walk 32-node chunks and then walk each chunk. This lets us skip whole 32-node chunks in one go based on bit operations and compute the minimum q value potentially 32x as fast. Of course, the algorithm still has the same fundamental O(n^2) run-time but the constant is now much lower. In the nasty Aztec Ruins compute shader, this shaves a full four seconds off the 30s compile time for a release build of mesa. In a debug build (needed for accurate stack traces), perf says that ra_select takes 20% of runtime before this patch and only 5-6% of runtime after this patch. It also makes shader-db runs faster. Shader-db results on Kaby Lake: total instructions in shared programs: 15311100 -> 15311100 (0.00%) instructions in affected programs: 0 -> 0 helped: 0 HURT: 0 total cycles in shared programs: 355468050 -> 355468050 (0.00%) cycles in affected programs: 0 -> 0 helped: 0 HURT: 0 Total CPU time (seconds): 2602.37 -> 2524.31 (-3.00%) Reviewed-by: Eric Anholt <eric@anholt.net>
This commit is contained in:
parent
e1511f1d4c
commit
41b310e219
|
@ -163,6 +163,21 @@ struct ra_graph {
|
||||||
/** Bit-set indicating, for each register, if it's in the stack */
|
/** Bit-set indicating, for each register, if it's in the stack */
|
||||||
BITSET_WORD *in_stack;
|
BITSET_WORD *in_stack;
|
||||||
|
|
||||||
|
/** Bit-set indicating, for each register, if it pre-assigned */
|
||||||
|
BITSET_WORD *reg_assigned;
|
||||||
|
|
||||||
|
/** Bit-set indicating, for each register, the value of the pq test */
|
||||||
|
BITSET_WORD *pq_test;
|
||||||
|
|
||||||
|
/** For each BITSET_WORD, the minimum q value or ~0 if unknown */
|
||||||
|
unsigned int *min_q_total;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* * For each BITSET_WORD, the node with the minimum q_total if
|
||||||
|
* min_q_total[i] != ~0.
|
||||||
|
*/
|
||||||
|
unsigned int *min_q_node;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tracks the start of the set of optimistically-colored registers in the
|
* Tracks the start of the set of optimistically-colored registers in the
|
||||||
* stack.
|
* stack.
|
||||||
|
@ -424,6 +439,11 @@ ra_alloc_interference_graph(struct ra_regs *regs, unsigned int count)
|
||||||
int bitset_count = BITSET_WORDS(count);
|
int bitset_count = BITSET_WORDS(count);
|
||||||
g->in_stack = rzalloc_array(g, BITSET_WORD, bitset_count);
|
g->in_stack = rzalloc_array(g, BITSET_WORD, bitset_count);
|
||||||
|
|
||||||
|
g->reg_assigned = rzalloc_array(g, BITSET_WORD, bitset_count);
|
||||||
|
g->pq_test = rzalloc_array(g, BITSET_WORD, bitset_count);
|
||||||
|
g->min_q_total = rzalloc_array(g, unsigned int, bitset_count);
|
||||||
|
g->min_q_node = rzalloc_array(g, unsigned int, bitset_count);
|
||||||
|
|
||||||
for (i = 0; i < count; i++) {
|
for (i = 0; i < count; i++) {
|
||||||
g->nodes[i].adjacency = rzalloc_array(g, BITSET_WORD, bitset_count);
|
g->nodes[i].adjacency = rzalloc_array(g, BITSET_WORD, bitset_count);
|
||||||
|
|
||||||
|
@ -466,29 +486,54 @@ ra_add_node_interference(struct ra_graph *g,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool
|
static void
|
||||||
pq_test(struct ra_graph *g, unsigned int n)
|
update_pq_info(struct ra_graph *g, unsigned int n)
|
||||||
{
|
{
|
||||||
|
int i = n / BITSET_WORDBITS;
|
||||||
int n_class = g->nodes[n].class;
|
int n_class = g->nodes[n].class;
|
||||||
|
if (g->nodes[n].q_total < g->regs->classes[n_class]->p) {
|
||||||
return g->nodes[n].q_total < g->regs->classes[n_class]->p;
|
BITSET_SET(g->pq_test, n);
|
||||||
|
} else if (g->min_q_total[i] != UINT_MAX) {
|
||||||
|
/* Only update min_q_total and min_q_node if min_q_total != UINT_MAX so
|
||||||
|
* that we don't update while we have stale data and accidentally mark
|
||||||
|
* it as non-stale. Also, in order to remain consistent with the old
|
||||||
|
* naive implementation of the algorithm, we do a lexicographical sort
|
||||||
|
* to ensure that we always choose the node with the highest node index.
|
||||||
|
*/
|
||||||
|
if (g->nodes[n].q_total < g->min_q_total[i] ||
|
||||||
|
(g->nodes[n].q_total == g->min_q_total[i] &&
|
||||||
|
n > g->min_q_node[i])) {
|
||||||
|
g->min_q_total[i] = g->nodes[n].q_total;
|
||||||
|
g->min_q_node[i] = n;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
decrement_q(struct ra_graph *g, unsigned int n)
|
add_node_to_stack(struct ra_graph *g, unsigned int n)
|
||||||
{
|
{
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
int n_class = g->nodes[n].class;
|
int n_class = g->nodes[n].class;
|
||||||
|
|
||||||
|
assert(!BITSET_TEST(g->in_stack, n));
|
||||||
|
|
||||||
for (i = 0; i < g->nodes[n].adjacency_count; i++) {
|
for (i = 0; i < g->nodes[n].adjacency_count; i++) {
|
||||||
unsigned int n2 = g->nodes[n].adjacency_list[i];
|
unsigned int n2 = g->nodes[n].adjacency_list[i];
|
||||||
unsigned int n2_class = g->nodes[n2].class;
|
unsigned int n2_class = g->nodes[n2].class;
|
||||||
|
|
||||||
if (!BITSET_TEST(g->in_stack, n2) && g->nodes[n2].reg == NO_REG) {
|
if (!BITSET_TEST(g->in_stack, n2) && !BITSET_TEST(g->reg_assigned, n2)) {
|
||||||
assert(g->nodes[n2].q_total >= g->regs->classes[n2_class]->q[n_class]);
|
assert(g->nodes[n2].q_total >= g->regs->classes[n2_class]->q[n_class]);
|
||||||
g->nodes[n2].q_total -= g->regs->classes[n2_class]->q[n_class];
|
g->nodes[n2].q_total -= g->regs->classes[n2_class]->q[n_class];
|
||||||
|
update_pq_info(g, n2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
g->stack[g->stack_count] = n;
|
||||||
|
g->stack_count++;
|
||||||
|
BITSET_SET(g->in_stack, n);
|
||||||
|
|
||||||
|
/* Flag the min_q_total for n's block as dirty so it gets recalculated */
|
||||||
|
g->min_q_total[n / BITSET_WORDBITS] = UINT_MAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -506,45 +551,89 @@ ra_simplify(struct ra_graph *g)
|
||||||
{
|
{
|
||||||
bool progress = true;
|
bool progress = true;
|
||||||
unsigned int stack_optimistic_start = UINT_MAX;
|
unsigned int stack_optimistic_start = UINT_MAX;
|
||||||
int i;
|
|
||||||
|
/* Figure out the high bit and bit mask for the first iteration of a loop
|
||||||
|
* over BITSET_WORDs.
|
||||||
|
*/
|
||||||
|
const unsigned int top_word_high_bit = (g->count - 1) % BITSET_WORDBITS;
|
||||||
|
|
||||||
|
/* Do a quick pre-pass to set things up */
|
||||||
|
for (int i = BITSET_WORDS(g->count) - 1, high_bit = top_word_high_bit;
|
||||||
|
i >= 0; i--, high_bit = BITSET_WORDBITS - 1) {
|
||||||
|
g->min_q_total[i] = UINT_MAX;
|
||||||
|
g->min_q_node[i] = UINT_MAX;
|
||||||
|
for (int j = high_bit; j >= 0; j--) {
|
||||||
|
unsigned int n = i * BITSET_WORDBITS + j;
|
||||||
|
if (g->nodes[n].reg != NO_REG)
|
||||||
|
g->reg_assigned[i] |= BITSET_BIT(j);
|
||||||
|
update_pq_info(g, n);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
while (progress) {
|
while (progress) {
|
||||||
unsigned int best_optimistic_node = ~0;
|
unsigned int min_q_total = UINT_MAX;
|
||||||
unsigned int lowest_q_total = ~0;
|
unsigned int min_q_node = UINT_MAX;
|
||||||
|
|
||||||
progress = false;
|
progress = false;
|
||||||
|
|
||||||
for (i = g->count - 1; i >= 0; i--) {
|
for (int i = BITSET_WORDS(g->count) - 1, high_bit = top_word_high_bit;
|
||||||
if (BITSET_TEST(g->in_stack, i) || g->nodes[i].reg != NO_REG)
|
i >= 0; i--, high_bit = BITSET_WORDBITS - 1) {
|
||||||
|
BITSET_WORD mask = ~(BITSET_WORD)0 >> (31 - high_bit);
|
||||||
|
|
||||||
|
BITSET_WORD skip = g->in_stack[i] | g->reg_assigned[i];
|
||||||
|
if (skip == mask)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (pq_test(g, i)) {
|
BITSET_WORD pq = g->pq_test[i] & ~skip;
|
||||||
decrement_q(g, i);
|
if (pq) {
|
||||||
g->stack[g->stack_count] = i;
|
/* In this case, we have stuff we can immediately take off the
|
||||||
g->stack_count++;
|
* stack. This also means that we're guaranteed to make progress
|
||||||
BITSET_SET(g->in_stack, i);
|
* and we don't need to bother updating lowest_q_total because we
|
||||||
progress = true;
|
* know we're going to loop again before attempting to do anything
|
||||||
} else if (!progress) {
|
* optimistic.
|
||||||
/* We only need to do this if we haven't made progress. If we
|
|
||||||
* have made progress, we'll throw the data away and loop again
|
|
||||||
* anyway.
|
|
||||||
*/
|
*/
|
||||||
unsigned int new_q_total = g->nodes[i].q_total;
|
for (int j = high_bit; j >= 0; j--) {
|
||||||
if (new_q_total < lowest_q_total) {
|
if (pq & BITSET_BIT(j)) {
|
||||||
best_optimistic_node = i;
|
unsigned int n = i * BITSET_WORDBITS + j;
|
||||||
lowest_q_total = new_q_total;
|
assert(n < g->count);
|
||||||
|
add_node_to_stack(g, n);
|
||||||
|
/* add_node_to_stack() may update pq_test for this word so
|
||||||
|
* we need to update our local copy.
|
||||||
|
*/
|
||||||
|
pq = g->pq_test[i] & ~skip;
|
||||||
|
progress = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (!progress) {
|
||||||
|
if (g->min_q_total[i] == UINT_MAX) {
|
||||||
|
/* The min_q_total and min_q_node are dirty because we added
|
||||||
|
* one of these nodes to the stack. It needs to be
|
||||||
|
* recalculated.
|
||||||
|
*/
|
||||||
|
for (int j = high_bit; j >= 0; j--) {
|
||||||
|
if (skip & BITSET_BIT(j))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
unsigned int n = i * BITSET_WORDBITS + j;
|
||||||
|
assert(n < g->count);
|
||||||
|
if (g->nodes[n].q_total < g->min_q_total[i]) {
|
||||||
|
g->min_q_total[i] = g->nodes[n].q_total;
|
||||||
|
g->min_q_node[i] = n;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (g->min_q_total[i] < min_q_total) {
|
||||||
|
min_q_node = g->min_q_node[i];
|
||||||
|
min_q_total = g->min_q_total[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!progress && best_optimistic_node != ~0U) {
|
if (!progress && min_q_total != UINT_MAX) {
|
||||||
if (stack_optimistic_start == UINT_MAX)
|
if (stack_optimistic_start == UINT_MAX)
|
||||||
stack_optimistic_start = g->stack_count;
|
stack_optimistic_start = g->stack_count;
|
||||||
|
|
||||||
decrement_q(g, best_optimistic_node);
|
add_node_to_stack(g, min_q_node);
|
||||||
g->stack[g->stack_count] = best_optimistic_node;
|
|
||||||
g->stack_count++;
|
|
||||||
BITSET_SET(g->in_stack, best_optimistic_node);
|
|
||||||
progress = true;
|
progress = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue