mesa/src/panfrost/bifrost/bi_ra.c

1132 lines
41 KiB
C

/*
* Copyright (C) 2020 Collabora Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* Authors (Collabora):
* Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
*/
#include "compiler.h"
#include "nodearray.h"
#include "bi_builder.h"
#include "util/u_memory.h"
/* Solver state for linearly constrained register allocation. The affinity,
 * linear and solutions arrays each hold one entry per node. */
struct lcra_state {
/* Number of nodes, i.e. the length of each per-node array below */
unsigned node_count;
/* Per-node bit set of candidate registers: the solver tries register r for a
 * node only if bit r is set in that node's mask */
uint64_t *affinity;
/* Linear constraints imposed. For each node there is a
* 'nodearray' structure, which changes between a sparse and dense
* array depending on the number of elements.
*
* Each element is itself a bit field denoting whether (c_j - c_i) bias
* is present or not, including negative biases.
*
* We support up to 8 components so the bias is in range
* [-7, 7] encoded by a 16-bit field
*/
nodearray *linear;
/* Before solving, forced registers; after solving, solutions. */
unsigned *solutions;
/** Node which caused register allocation to fail */
unsigned spill_node;
};
/* This module is an implementation of "Linearly Constrained
* Register Allocation". The paper is available in PDF form
* (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX
* (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md)
*/
/* Allocate solver state for node_count nodes. Every per-node array is
 * zero-initialised, except that every solution starts out as all-ones, the
 * marker for "no register assigned yet". Release with lcra_free(). */
static struct lcra_state *
lcra_alloc_equations(unsigned node_count)
{
   struct lcra_state *state = calloc(1, sizeof(*state));

   state->node_count = node_count;
   state->linear = calloc(node_count, sizeof(state->linear[0]));
   state->solutions = calloc(node_count, sizeof(state->solutions[0]));
   state->affinity = calloc(node_count, sizeof(state->affinity[0]));

   /* Filling with 0xFF bytes makes each entry compare equal to ~0 */
   memset(state->solutions, ~0, node_count * sizeof(state->solutions[0]));

   return state;
}
/* Release everything owned by the solver state, including the state itself.
 * Each node's constraint array is reset before the containing array is
 * freed. */
static void
lcra_free(struct lcra_state *l)
{
   for (unsigned node = 0; node != l->node_count; node++) {
      nodearray_reset(l->linear + node);
   }

   free(l->solutions);
   free(l->affinity);
   free(l->linear);
   free(l);
}
/* Record that nodes i and j interfere, given the component masks cmask_i and
 * cmask_j each of them occupies. For every shift 0..7 at which the two masks
 * overlap, the matching bias bit is set in both directions: the forward
 * constraint is OR'd into node j's row (keyed by i) and the backward
 * constraint into node i's row (keyed by j). A node never interferes with
 * itself. */
static void
lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i, unsigned j, unsigned cmask_j)
{
   if (i == j)
      return;

   nodearray_value fw = 0;
   nodearray_value bw = 0;

   /* The constraint bits are reversed from lcra.c so that register
    * allocation can be done in parallel for every possible solution,
    * with lower-order bits representing smaller registers. */
   for (unsigned shift = 0; shift < 8; ++shift) {
      const bool overlap_up = (cmask_i & (cmask_j << shift)) != 0;
      const bool overlap_down = (cmask_i & (cmask_j >> shift)) != 0;

      if (overlap_up) {
         fw |= (1 << (7 + shift));
         bw |= (1 << (7 - shift));
      }

      if (overlap_down) {
         bw |= (1 << (7 + shift));
         fw |= (1 << (7 - shift));
      }
   }

   /* Use dense arrays after adding 256 elements */
   nodearray_orr(&l->linear[j], i, fw, 256, l->node_count);
   nodearray_orr(&l->linear[i], j, bw, 256, l->node_count);
}
static bool
lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i)
{
signed constant = solutions[i];
if (nodearray_is_sparse(&l->linear[i])) {
nodearray_sparse_foreach(&l->linear[i], elem) {
unsigned j = nodearray_sparse_key(elem);
nodearray_value constraint = nodearray_sparse_value(elem);
if (solutions[j] == ~0) continue;
signed lhs = constant - solutions[j];
if (lhs < -7 || lhs > 7)
continue;
if (constraint & (1 << (lhs + 7)))
return false;
}
return true;
}
nodearray_value *row = l->linear[i].dense;
for (unsigned j = 0; j < l->node_count; ++j) {
if (solutions[j] == ~0) continue;
signed lhs = constant - solutions[j];
if (lhs < -7 || lhs > 7)
continue;
if (row[j] & (1 << (lhs + 7)))
return false;
}
return true;
}
static bool
lcra_solve(struct lcra_state *l)
{
for (unsigned step = 0; step < l->node_count; ++step) {
if (l->solutions[step] != ~0) continue;
if (l->affinity[step] == 0) continue;
bool succ = false;
u_foreach_bit64(r, l->affinity[step]) {
l->solutions[step] = r;
if (lcra_test_linear(l, l->solutions, step)) {
succ = true;
break;
}
}
/* Out of registers - prepare to spill */
if (!succ) {
l->spill_node = step;
return false;
}
}
return true;
}
/* Register spilling is implemented with a cost-benefit system. Costs are set
* by the user. Benefits are calculated from the constraints. */
static unsigned
lcra_count_constraints(struct lcra_state *l, unsigned i)
{
unsigned count = 0;
nodearray *constraints = &l->linear[i];
if (nodearray_is_sparse(constraints)) {
nodearray_sparse_foreach(constraints, elem)
count += util_bitcount(nodearray_sparse_value(elem));
} else {
nodearray_dense_foreach_64(constraints, elem)
count += util_bitcount64(*elem);
}
return count;
}
/* Liveness analysis is a backwards-may dataflow analysis pass. Within a block,
* we compute live_in from live_out. The intrablock pass is linear-time. The
* per-block update (liveness_block_update) returns whether progress was made. */
/* Step the per-node liveness masks backwards across one instruction: the
 * components written by each destination are killed, then the components
 * read by each SSA source are made live. `live` is updated in place. */
static void
bi_liveness_ins_update_ra(uint8_t *live, bi_instr *ins)
{
   /* live_in[s] = GEN[s] + (live_out[s] - KILL[s]) */

   /* KILL */
   bi_foreach_dest(ins, d) {
      unsigned written = ins->dest[d].value;

      live[written] &= ~bi_writemask(ins, d);
   }

   /* GEN */
   bi_foreach_ssa_src(ins, s) {
      unsigned nr_regs = bi_count_read_registers(ins, s);
      unsigned read_mask = BITFIELD_MASK(nr_regs);
      unsigned read = ins->src[s].value;

      live[read] |= (read_mask << ins->src[s].offset);
   }
}
/* Recompute liveness for one block. live_out accumulates the live_in of
 * every successor; a fresh live_in is then derived by walking the block's
 * instructions backwards and replaces the previous array. Returns true if
 * the new live_in differs from the old one. */
static bool
liveness_block_update(bi_block *blk, unsigned temp_count)
{
   /* live_out[s] = sum { p in succ[s] } ( live_in[p] ) */
   bi_foreach_successor(blk, succ) {
      for (unsigned n = 0; n < temp_count; ++n)
         blk->live_out[n] |= succ->live_in[n];
   }

   uint8_t *scratch = ralloc_array(blk, uint8_t, temp_count);
   memcpy(scratch, blk->live_out, temp_count);

   bi_foreach_instr_in_block_rev(blk, ins) {
      bi_liveness_ins_update_ra(scratch, ins);
   }

   /* To figure out progress, diff live_in */
   bool changed = false;

   for (unsigned n = 0; n < temp_count; ++n) {
      if (blk->live_in[n] != scratch[n]) {
         changed = true;
         break;
      }
   }

   ralloc_free(blk->live_in);
   blk->live_in = scratch;

   return changed;
}
/* Globally, liveness analysis uses a fixed-point algorithm based on a
* worklist. We initialize the work list with every block. We iterate the work
* list to compute live_in from live_out for each block on the work list,
* adding the predecessors of the block to the work list if we made progress.
*/
/* Compute live_in/live_out for every block of the shader. Any existing
 * arrays are discarded and replaced with zeroed ones sized by
 * ctx->ssa_alloc, every block is queued, and blocks are then processed until
 * the worklist drains. */
static void
bi_compute_liveness_ra(bi_context *ctx)
{
   u_worklist pending;
   bi_worklist_init(ctx, &pending);

   bi_foreach_block(ctx, cur) {
      if (cur->live_in)
         ralloc_free(cur->live_in);

      if (cur->live_out)
         ralloc_free(cur->live_out);

      cur->live_in = rzalloc_array(cur, uint8_t, ctx->ssa_alloc);
      cur->live_out = rzalloc_array(cur, uint8_t, ctx->ssa_alloc);

      bi_worklist_push_tail(&pending, cur);
   }

   while (!u_worklist_is_empty(&pending)) {
      /* Pop off in reverse order since liveness is backwards */
      bi_block *cur = bi_worklist_pop_tail(&pending);

      /* Nothing changed for this block, so its predecessors are
       * unaffected */
      if (!liveness_block_update(cur, ctx->ssa_alloc))
         continue;

      /* We made progress, so the predecessors must be reprocessed */
      bi_foreach_predecessor(cur, pred) {
         bi_worklist_push_head(&pending, *pred);
      }
   }

   u_worklist_fini(&pending);
}
/* Construct an affinity mask such that the vector with `count` elements does
* not intersect any of the registers in the bitset `clobber`. In other words,
* an allocated register r needs to satisfy for each i < count: a + i != b.
* Equivalently that's a != b - i, so we need a \ne { b - i : i < n }. For the
* entire clobber set B, we need a \ne union b \in B { b - i : i < n }, where
* that union is the desired clobber set. That may be written equivalently as
* the union over i < n of (B - i), where subtraction is defined elementwise
* and corresponds to a shift of the entire bitset.
*
* EVEN_BITS_MASK is an affinity mask for aligned register pairs. Interpreted
* as a bit set, it is { x : 0 <= x < 64 if x is even }
*/
#define EVEN_BITS_MASK (0x5555555555555555ull)
/* Build the set of legal base registers for a vector of `count` consecutive
 * registers that must avoid every register in `clobber`. When split_file is
 * set, registers 16-47 are excluded as well and a vector may not run past
 * register 15. Returns a bit mask with bit r set iff r is a permitted base
 * register. */
static uint64_t
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
{
   uint64_t forbidden = 0;

   /* Union over each element of the clobber set shifted down by that
    * element's position within the vector */
   for (unsigned elem = 0; elem < count; ++elem)
      forbidden |= (clobber >> elem);

   /* Don't allocate past the end of the register file */
   if (count > 1) {
      unsigned tail = count - 1;
      uint64_t tail_mask = BITFIELD_MASK(tail);

      forbidden |= tail_mask << (64 - tail);

      if (split_file)
         forbidden |= tail_mask << (16 - tail);
   }

   /* Don't allocate the middle if we split out the middle */
   if (split_file)
      forbidden |= BITFIELD64_MASK(32) << 16;

   /* We can use a register iff it's not clobbered */
   return ~forbidden;
}
/* Walk one block backwards, recording interference and narrowing affinities.
 *
 * block:        block to process; its reg_live_in is written on exit
 * l:            solver state receiving interference and affinity updates
 * live:         per-node liveness masks, on entry the block's live-out set;
 *               updated in place as the walk proceeds
 * preload_live: bit set of registers live after the block, stepped backwards
 *               with bi_postra_liveness_ins
 * node_count:   number of nodes, i.e. the length of `live`
 * is_blend:     whether the shader being compiled is a blend shader
 * split_file:   forwarded to bi_make_affinity
 * aligned_sr:   when set, multi-register reads and writes are restricted to
 *               even registers
 */
static void
bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live, uint64_t preload_live, unsigned node_count, bool is_blend, bool split_file, bool aligned_sr)
{
bi_foreach_instr_in_block_rev(block, ins) {
/* Mark all registers live after the instruction as
* interfering with the destination */
bi_foreach_dest(ins, d) {
unsigned node = ins->dest[d].value;
/* Don't allocate to anything that's read later as a
* preloaded register. The affinity is the intersection
* of affinity masks for each write. Since writes have
* offsets, but the affinity is for the whole node, we
* need to offset the affinity opposite the write
* offset, so we shift right. */
unsigned count = bi_count_write_registers(ins, d);
unsigned offset = ins->dest[d].offset;
uint64_t affinity = bi_make_affinity(preload_live, count, split_file) >> offset;
/* Valhall needs >= 64-bit staging writes to be pair-aligned */
if (aligned_sr && (count >= 2 || offset))
affinity &= EVEN_BITS_MASK;
l->affinity[node] &= affinity;
for (unsigned i = 0; i < node_count; ++i) {
uint8_t r = live[i];
/* Nodes only interfere if they occupy
* /different values/ at the same time
* (Boissinot). In particular, sources of
* moves do not interfere with their
* destinations. This enables a limited form of
* coalescing.
*/
if (ins->op == BI_OPCODE_MOV_I32 &&
bi_is_ssa(ins->src[0]) &&
i == ins->src[0].value) {
r &= ~BITFIELD_BIT(ins->src[0].offset);
}
if (r) {
lcra_add_node_interference(l, node,
bi_writemask(ins, d), i, r);
}
}
/* The second destination of an instruction also interferes with
 * the first destination of that same instruction */
unsigned node_first = ins->dest[0].value;
if (d == 1) {
lcra_add_node_interference(l, node, bi_writemask(ins, 1),
node_first, bi_writemask(ins, 0));
}
}
/* Valhall needs >= 64-bit reads to be pair-aligned */
if (aligned_sr) {
bi_foreach_ssa_src(ins, s) {
if (bi_count_read_registers(ins, s) >= 2)
l->affinity[ins->src[s].value] &= EVEN_BITS_MASK;
}
}
if (!is_blend && ins->op == BI_OPCODE_BLEND) {
/* Blend shaders might clobber r0-r15, r48. */
uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);
/* Every node live across the BLEND is kept out of those registers */
for (unsigned i = 0; i < node_count; ++i) {
if (live[i])
l->affinity[i] &= ~clobber;
}
}
/* Update live_in */
preload_live = bi_postra_liveness_ins(preload_live, ins);
bi_liveness_ins_update_ra(live, ins);
}
/* Having walked the whole block, preload_live is its register live-in set */
block->reg_live_in = preload_live;
}
/* Populate the solver's interference constraints and affinities for the
 * whole shader. Both liveness analyses are refreshed, then every block is
 * processed in reverse order starting from a private copy of its live-out
 * set. full_regs selects whether the whole register file is available. */
static void
bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)
{
   bi_compute_liveness_ra(ctx);
   bi_postra_liveness(ctx);

   const bool split_file = !full_regs;
   const bool aligned_sr = (ctx->arch >= 9);

   bi_foreach_block_rev(ctx, blk) {
      /* bi_mark_interference updates its liveness array in place, so
       * give it a scratch copy */
      uint8_t *scratch = mem_dup(blk->live_out, ctx->ssa_alloc);

      bi_mark_interference(blk, l, scratch, blk->reg_live_out,
                           ctx->ssa_alloc, ctx->inputs->is_blend,
                           split_file, aligned_sr);

      free(scratch);
   }
}
/* Build and solve one register allocation problem for the shader.
 *
 * Steps, in order: give every written node a default affinity, force the
 * registers of the BLEND/ATEST/ZS_EMIT operands handled below, compute
 * interference, try to coalesce moves from fixed registers, then run the
 * solver.
 *
 * ctx:       shader being compiled
 * success:   out parameter, set to the result of lcra_solve
 * full_regs: if false, the default affinity is restricted to R0-R15 and
 *            R48-R63 (for non-blend shaders)
 *
 * Returns the newly allocated solver state whether or not solving succeeded;
 * the caller releases it with lcra_free.
 */
static struct lcra_state *
bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
{
struct lcra_state *l = lcra_alloc_equations(ctx->ssa_alloc);
/* Blend shaders are restricted to R0-R15. Other shaders at full
* occupancy also can access R48-R63. At half occupancy they can access
* the whole file. */
uint64_t default_affinity =
ctx->inputs->is_blend ? BITFIELD64_MASK(16) :
full_regs ? BITFIELD64_MASK(64) :
(BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));
/* To test spilling, mimic a small register file */
if (bifrost_debug & BIFROST_DBG_SPILL && !ctx->inputs->is_blend)
default_affinity &= BITFIELD64_MASK(48) << 8;
bi_foreach_instr_global(ctx, ins) {
bi_foreach_dest(ins, d)
l->affinity[ins->dest[d].value] = default_affinity;
/* Blend shaders expect the src colour to be in r0-r3 */
if (ins->op == BI_OPCODE_BLEND &&
!ctx->inputs->is_blend) {
assert(bi_is_ssa(ins->src[0]));
l->solutions[ins->src[0].value] = 0;
/* Dual source blend input in r4-r7 */
if (bi_is_ssa(ins->src[4]))
l->solutions[ins->src[4].value] = 4;
/* Writes to R48 */
if (!bi_is_null(ins->dest[0]))
l->solutions[ins->dest[0].value] = 48;
}
/* Coverage mask writes stay in R60 */
if ((ins->op == BI_OPCODE_ATEST ||
ins->op == BI_OPCODE_ZS_EMIT) &&
!bi_is_null(ins->dest[0])) {
l->solutions[ins->dest[0].value] = 60;
}
/* Experimentally, it seems coverage masks inputs to ATEST must
* be in R60. Otherwise coverage mask writes do not work with
* early-ZS with pixel-frequency-shading (this combination of
* settings is legal if depth/stencil writes are disabled).
*/
if (ins->op == BI_OPCODE_ATEST) {
assert(bi_is_ssa(ins->src[0]));
l->solutions[ins->src[0].value] = 60;
}
}
bi_compute_interference(ctx, l, full_regs);
/* Coalesce register moves if we're allowed. We need to be careful due
* to the restricted affinity induced by the blend shader ABI.
*/
bi_foreach_instr_global(ctx, I) {
/* Only moves whose source is a fixed register are candidates */
if (I->op != BI_OPCODE_MOV_I32) continue;
if (I->src[0].type != BI_INDEX_REGISTER) continue;
unsigned reg = I->src[0].value;
unsigned node = I->dest[0].value;
/* Skip destinations that already have a forced register */
if (l->solutions[node] != ~0) continue;
uint64_t affinity = l->affinity[node];
if (ctx->inputs->is_blend) {
/* We're allowed to coalesce the moves to these */
affinity |= BITFIELD64_BIT(48);
affinity |= BITFIELD64_BIT(60);
}
/* Try to coalesce */
if (affinity & BITFIELD64_BIT(reg)) {
l->solutions[node] = reg;
/* Undo the tentative assignment if it violates a constraint */
if (!lcra_test_linear(l, l->solutions, node))
l->solutions[node] = ~0;
}
}
*success = lcra_solve(l);
return l;
}
/* Translate an index into the register the solver picked for it. Non-SSA
 * indices, and SSA indices the solver left unassigned (negative solution),
 * are returned unchanged. Otherwise the result is a register index at
 * solution + offset that carries over the swizzle, abs and neg modifiers. */
static bi_index
bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index)
{
   /* Offsets can only be applied when we register allocated an index, or
    * alternatively for FAU's encoding */
   ASSERTED bool is_offset = (index.type != BI_INDEX_FAU) &&
                             (index.offset > 0);

   /* Did we run RA for this index at all */
   if (!bi_is_ssa(index)) {
      assert(!is_offset);
      return index;
   }

   signed solution = l->solutions[index.value];

   /* LCRA didn't bother solving this index (how lazy!) */
   if (solution < 0) {
      assert(!is_offset);
      return index;
   }

   /* todo: do we want to compose with the subword swizzle? */
   bi_index reg = bi_register(solution + index.offset);

   reg.neg = index.neg;
   reg.abs = index.abs;
   reg.swizzle = index.swizzle;

   return reg;
}
/* Dual texture instructions write to two sets of staging registers, modeled as
* two destinations in the IR. The first set is communicated with the usual
* staging register mechanism. The second set is encoded in the texture
* operation descriptor. This is quite unusual, and requires the following late
* fixup.
*/
/* OR the register assigned to the second destination into the texture
 * operation descriptor held in source 3. Expects the second destination to
 * already be a register and source 3 to be a constant. */
static void
bi_fixup_dual_tex_register(bi_instr *I)
{
   assert(I->dest[1].type == BI_INDEX_REGISTER);
   assert(I->src[3].type == BI_INDEX_CONSTANT);

   struct bifrost_dual_texture_operation secondary = {
      .secondary_register = I->dest[1].value
   };

   I->src[3].value |= bi_dual_tex_as_u32(secondary);
}
/* Rewrite every destination and source of every instruction with the
 * register chosen by the solver, then apply the dual texture fixup to
 * TEXC_DUAL instructions. */
static void
bi_install_registers(bi_context *ctx, struct lcra_state *l)
{
   bi_foreach_instr_global(ctx, I) {
      bi_foreach_dest(I, d) {
         I->dest[d] = bi_reg_from_index(ctx, l, I->dest[d]);
      }

      bi_foreach_src(I, s) {
         I->src[s] = bi_reg_from_index(ctx, l, I->src[s]);
      }

      /* Needs the second destination to be a register already */
      if (I->op == BI_OPCODE_TEXC_DUAL) {
         bi_fixup_dual_tex_register(I);
      }
   }
}
/* Redirect every source of `ins` that is equivalent to `old` so that it
 * refers to `new`. Only the type and value fields are overwritten; all other
 * fields of the source are left as they were. */
static void
bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new)
{
   bi_foreach_src(ins, s) {
      if (!bi_is_equiv(ins->src[s], old))
         continue;

      ins->src[s].value = new.value;
      ins->src[s].type = new.type;
   }
}
/* If register allocation fails, find the best spill node */
/* Pick the node to spill after lcra_solve failed on l->spill_node.
 *
 * Candidates are the nodes that have a non-zero constraint in the failing
 * node's row, excluding any node written by an instruction that is flagged
 * no_spill, is an ATEST or ZS_EMIT, or is a move from register 60. Among the
 * candidates, the node with the highest constraint count is chosen; on a
 * tie, the first one encountered is kept.
 *
 * Returns the chosen node index, or -1 if no candidate has a positive
 * benefit.
 */
static signed
bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
{
   /* Pick a node satisfying bi_spill_register's preconditions */
   BITSET_WORD *no_spill = calloc(BITSET_WORDS(l->node_count), sizeof(BITSET_WORD));

   bi_foreach_instr_global(ctx, ins) {
      bi_foreach_dest(ins, d) {
         /* Don't allow spilling coverage mask writes because the
          * register preload logic assumes it will stay in R60.
          * This could be optimized.
          */
         if (ins->no_spill ||
             ins->op == BI_OPCODE_ATEST ||
             ins->op == BI_OPCODE_ZS_EMIT ||
             (ins->op == BI_OPCODE_MOV_I32 &&
              ins->src[0].type == BI_INDEX_REGISTER &&
              ins->src[0].value == 60)) {
            BITSET_SET(no_spill, ins->dest[d].value);
         }
      }
   }

   /* Benefits are integer bit counts, so initialise with an integer */
   unsigned best_benefit = 0;
   signed best_node = -1;

   if (nodearray_is_sparse(&l->linear[l->spill_node])) {
      nodearray_sparse_foreach(&l->linear[l->spill_node], elem) {
         unsigned i = nodearray_sparse_key(elem);
         unsigned constraint = nodearray_sparse_value(elem);

         /* Only spill nodes that interfere with the node failing
          * register allocation. It's pointless to spill anything else */
         if (!constraint) continue;
         if (BITSET_TEST(no_spill, i)) continue;

         unsigned benefit = lcra_count_constraints(l, i);

         if (benefit > best_benefit) {
            best_benefit = benefit;
            best_node = i;
         }
      }
   } else {
      nodearray_value *row = l->linear[l->spill_node].dense;

      for (unsigned i = 0; i < l->node_count; ++i) {
         /* Only spill nodes that interfere with the node failing
          * register allocation. It's pointless to spill anything else */
         if (!row[i]) continue;
         if (BITSET_TEST(no_spill, i)) continue;

         unsigned benefit = lcra_count_constraints(l, i);

         if (benefit > best_benefit) {
            best_benefit = benefit;
            best_node = i;
         }
      }
   }

   free(no_spill);
   return best_node;
}
/* Number of registers, counted from the base of `index`, that instruction I
 * reads through it: the largest (offset + registers read) over every source
 * equivalent to `index`, or 0 if no source matches. */
static unsigned
bi_count_read_index(bi_instr *I, bi_index index)
{
   unsigned extent = 0;

   bi_foreach_src(I, s) {
      if (!bi_is_equiv(I->src[s], index))
         continue;

      unsigned nr_regs = bi_count_read_registers(I, s);
      unsigned end = nr_regs + I->src[s].offset;

      if (end > extent)
         extent = end;
   }

   return extent;
}
/*
* Wrappers to emit loads/stores to thread-local storage in an appropriate way
* for the target, so the spill/fill code becomes architecture-independent.
*/
/* Index of the thread-local storage pointer in the FAU; `hi` is passed
 * through to bi_fau as its second argument. */
static bi_index
bi_tls_ptr(bool hi)
{
   bi_index ptr = bi_fau(BIR_FAU_TLS_PTR, hi);

   return ptr;
}
/* Emit a load of `bits` bits from thread-local storage at byte `offset`
 * into `src`. For arch >= 9 the TLS pointer pair is the address and `offset`
 * is passed as the trailing argument; otherwise `offset` becomes an
 * immediate address operand and the trailing argument is 0. Returns the
 * emitted instruction. */
static bi_instr *
bi_load_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
{
   if (b->shader->arch < 9) {
      return bi_load_to(b, bits, src, bi_imm_u32(offset), bi_zero(),
                        BI_SEG_TL, 0);
   }

   return bi_load_to(b, bits, src, bi_tls_ptr(false),
                     bi_tls_ptr(true), BI_SEG_TL, offset);
}
/* Emit a store of `bits` bits from `src` to thread-local storage at byte
 * `offset`. Addressing is chosen by architecture in the same way as
 * bi_load_tl. */
static void
bi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
{
   if (b->shader->arch < 9) {
      bi_store(b, bits, src, bi_imm_u32(offset), bi_zero(), BI_SEG_TL, 0);
      return;
   }

   bi_store(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true), BI_SEG_TL, offset);
}
/* Once we've chosen a spill node, spill it and return the number of bytes spilled */
/* Spill `index` to thread-local storage starting at byte `offset`.
 *
 * Every write of `index` is redirected to a fresh temporary that is stored
 * to TLS right after the instruction. Every instruction reading `index` has
 * those sources redirected to another fresh temporary that is loaded from
 * TLS right before it. The writing instructions and the inserted loads are
 * flagged no_spill. ctx->spills and ctx->fills are incremented accordingly.
 *
 * Returns 4 times the largest (write offset + registers written) seen, i.e.
 * the size in bytes of the TLS area used.
 */
static unsigned
bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
{
bi_builder b = { .shader = ctx };
unsigned channels = 0;
/* Spill after every store, fill before every load */
bi_foreach_instr_global_safe(ctx, I) {
bi_foreach_dest(I, d) {
if (!bi_is_equiv(I->dest[d], index)) continue;
/* Register offset of this write within the spilled node */
unsigned extra = I->dest[d].offset;
bi_index tmp = bi_temp(ctx);
I->dest[d] = bi_replace_index(I->dest[d], tmp);
I->no_spill = true;
unsigned count = bi_count_write_registers(I, d);
unsigned bits = count * 32;
b.cursor = bi_after_instr(I);
/* Each register occupies 4 bytes in the spill area */
bi_store_tl(&b, bits, tmp, offset + 4 * extra);
ctx->spills++;
channels = MAX2(channels, extra + count);
}
if (bi_has_arg(I, index)) {
b.cursor = bi_before_instr(I);
bi_index tmp = bi_temp(ctx);
/* Load enough registers to cover every read of index in I */
unsigned bits = bi_count_read_index(I, index) * 32;
bi_rewrite_index_src_single(I, index, tmp);
bi_instr *ld = bi_load_tl(&b, bits, tmp, offset);
ld->no_spill = true;
ctx->fills++;
}
}
return (channels * 4);
}
/*
* For transition, lower collects and splits before RA, rather than after RA.
* LCRA knows how to deal with offsets (broken SSA), but not how to coalesce
* these vector moves.
*/
/* Lower SPLIT_I32 and COLLECT_I32 into per-component MOV_I32 instructions,
 * then remove moves that became dead.
 *
 * ctx:       shader being compiled
 * first_reg: nodes with a value below this get their uses rewritten to read
 *            straight from the split's source component
 *
 * As a side effect, liveness is recomputed and each block's live_in is
 * replaced with the array produced by the clean-up walk.
 */
static void
bi_lower_vector(bi_context *ctx, unsigned first_reg)
{
/* remap[v] is the source component that replaces reads of node v, or a
 * null index (from calloc) when there is no replacement */
bi_index *remap = calloc(ctx->ssa_alloc, sizeof(bi_index));
bi_foreach_instr_global_safe(ctx, I) {
bi_builder b = bi_init_builder(ctx, bi_after_instr(I));
if (I->op == BI_OPCODE_SPLIT_I32) {
bi_index src = I->src[0];
assert(src.offset == 0);
/* One move per destination, reading successive source components */
bi_foreach_dest(I, i) {
src.offset = i;
bi_mov_i32_to(&b, I->dest[i], src);
if (I->dest[i].value < first_reg)
remap[I->dest[i].value] = src;
}
bi_remove_instruction(I);
} else if (I->op == BI_OPCODE_COLLECT_I32) {
bi_index dest = I->dest[0];
assert(dest.offset == 0);
assert(((dest.value < first_reg) || I->nr_srcs == 1) && "nir_lower_phis_to_scalar");
/* One move per non-null source, writing successive destination
 * components */
bi_foreach_src(I, i) {
if (bi_is_null(I->src[i]))
continue;
dest.offset = i;
bi_mov_i32_to(&b, dest, I->src[i]);
}
bi_remove_instruction(I);
}
}
/* Apply the recorded replacements to every SSA source */
bi_foreach_instr_global(ctx, I) {
bi_foreach_ssa_src(I, s) {
if (I->src[s].value < first_reg && !bi_is_null(remap[I->src[s].value]))
bi_replace_src(I, s, remap[I->src[s].value]);
}
}
free(remap);
/* After generating a pile of moves, clean up */
bi_compute_liveness_ra(ctx);
bi_foreach_block_rev(ctx, block) {
/* Start from the union of the successors' live_in sets */
uint8_t *live = rzalloc_array(block, uint8_t, ctx->ssa_alloc);
bi_foreach_successor(block, succ) {
for (unsigned i = 0; i < ctx->ssa_alloc; ++i)
live[i] |= succ->live_in[i];
}
bi_foreach_instr_in_block_safe_rev(block, ins) {
/* An instruction is dead if none of the components it writes are
 * live afterwards */
bool all_null = true;
bi_foreach_dest(ins, d) {
if (live[ins->dest[d].value] & bi_writemask(ins, d))
all_null = false;
}
if (all_null && !bi_side_effects(ins))
bi_remove_instruction(ins);
else
bi_liveness_ins_update_ra(live, ins);
}
ralloc_free(block->live_in);
block->live_in = live;
}
}
/*
* Check if the instruction requires a "tied" operand. Such instructions MUST
* allocate their source and destination to the same register. This is a
* constraint on RA, and may require extra moves.
*
* In particular, this is the case for Bifrost instructions that both read and
* write with the staging register mechanism.
*/
/* True when I is one of the opcodes that both reads and writes through the
 * staging register mechanism and its first source is not null. */
static bool
bi_is_tied(const bi_instr *I)
{
   switch (I->op) {
   case BI_OPCODE_TEXC:
   case BI_OPCODE_TEXC_DUAL:
   case BI_OPCODE_ATOM_RETURN_I32:
   case BI_OPCODE_AXCHG_I32:
   case BI_OPCODE_ACMPXCHG_I32:
      /* Only tied if there is actually something to read */
      return !bi_is_null(I->src[0]);
   default:
      return false;
   }
}
/*
* For transition, coalesce tied operands together, as LCRA knows how to handle
* non-SSA operands but doesn't know about tied operands.
*
* This breaks the SSA form of the program, but that doesn't matter for LCRA.
*/
/* For every tied instruction, copy its first source into its first
 * destination one register at a time just before the instruction, then make
 * the instruction read the destination instead of the original source. */
static void
bi_coalesce_tied(bi_context *ctx)
{
   bi_foreach_instr_global(ctx, I) {
      if (!bi_is_tied(I))
         continue;

      bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
      unsigned nr_regs = bi_count_read_registers(I, 0);

      for (unsigned reg = 0; reg < nr_regs; ++reg) {
         bi_index to = I->dest[0];
         bi_index from = I->src[0];

         assert(to.offset == 0 && from.offset == 0);

         to.offset = reg;
         from.offset = reg;

         bi_mov_i32_to(&b, to, from);
      }

      bi_replace_src(I, 0, I->dest[0]);
   }
}
/* Return the compacted index for `value`, allocating the next free one on
 * first sight. map[value] stores the assigned index plus one so that zero
 * can mean "not assigned yet"; *alloc counts the indices handed out so far
 * and is incremented on each new assignment. */
static unsigned
find_or_allocate_temp(unsigned *map, unsigned value, unsigned *alloc)
{
   unsigned *slot = &map[value];

   if (*slot == 0) {
      *alloc += 1;
      *slot = *alloc;
   }

   assert(*slot);

   return *slot - 1;
}
/* Reassigns numbering to get rid of gaps in the indices and to prioritize
* smaller register classes */
/* Renumber every destination and SSA source so that the indices in use are
 * contiguous from zero, in order of first appearance. ctx->ssa_alloc is
 * reset and ends up equal to the number of distinct indices seen. */
static void
squeeze_index(bi_context *ctx)
{
   /* Sized by the old index count; zero means "not renumbered yet" */
   unsigned *remap = rzalloc_array(ctx, unsigned, ctx->ssa_alloc);

   ctx->ssa_alloc = 0;

   bi_foreach_instr_global(ctx, I) {
      bi_foreach_dest(I, d) {
         unsigned old = I->dest[d].value;

         I->dest[d].value = find_or_allocate_temp(remap, old, &ctx->ssa_alloc);
      }

      bi_foreach_ssa_src(I, s) {
         unsigned old = I->src[s].value;

         I->src[s].value = find_or_allocate_temp(remap, old, &ctx->ssa_alloc);
      }
   }

   ralloc_free(remap);
}
/*
* Brainless out-of-SSA pass. The eventual goal is to go out-of-SSA after RA and
* coalesce implicitly with biased colouring in a tree scan allocator. For now,
* this should be good enough for LCRA.
*/
/* Lower phis to moves and locally propagate the moves that were created.
 *
 * Each phi gets a fresh temporary ("reg"). A move into reg is appended to
 * every predecessor, the phi itself is replaced with a move from reg, and
 * uses of the phi's destination within the block are rewritten to read reg.
 * A second pass then tries to fold "reg = ssa" moves into the instruction
 * that defines the SSA value, when that value has a single use.
 *
 * Returns the value of ctx->ssa_alloc on entry; every temporary created here
 * has an index at or above it.
 */
static unsigned
bi_out_of_ssa(bi_context *ctx)
{
bi_index zero = bi_fau(BIR_FAU_IMMEDIATE | 0, false);
unsigned first_reg = ctx->ssa_alloc;
/* Trivially lower phis */
bi_foreach_block(ctx, block) {
bi_foreach_instr_in_block_safe(block, I) {
/* Stop at the first instruction that is not a phi */
if (I->op != BI_OPCODE_PHI)
break;
/* Assign a register for the phi */
bi_index reg = bi_temp(ctx);
assert(reg.value >= first_reg);
/* Lower to a move in each predecessor. The destinations
* cannot interfere so these can be sequentialized
* in arbitrary order.
*/
bi_foreach_predecessor(block, pred) {
bi_builder b = bi_init_builder(ctx, bi_after_block_logical(*pred));
unsigned i = bi_predecessor_index(block, *pred);
assert(!I->src[i].abs);
assert(!I->src[i].neg);
assert(I->src[i].swizzle == BI_SWIZZLE_H01);
/* MOV of immediate needs lowering on Valhall */
if (ctx->arch >= 9 && I->src[i].type == BI_INDEX_CONSTANT)
bi_iadd_imm_i32_to(&b, reg, zero, I->src[i].value);
else
bi_mov_i32_to(&b, reg, I->src[i]);
}
/* Replace the phi with a move */
bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
bi_mov_i32_to(&b, I->dest[0], reg);
bi_remove_instruction(I);
/* Propagate that move within the block. The destination
* is SSA and the source is not written in this block,
* so this is legal. The move itself will be DCE'd if
* possible in the next pass.
*/
bi_foreach_instr_in_block_rev(block, prop) {
if (prop->op == BI_OPCODE_PHI)
break;
bi_foreach_src(prop, s) {
if (bi_is_equiv(prop->src[s], I->dest[0])) {
bi_replace_src(prop, s, reg);
}
}
}
}
}
/* Try to locally propagate the moves we created. We need to be extra
* careful because we're not in SSA at this point, as such this
* algorithm is quadratic. This will go away when we go out of SSA after
* RA.
*/
/* used: node is read at least once; multiple_uses: read more than once */
BITSET_WORD *used = calloc(sizeof(BITSET_WORD), BITSET_WORDS(ctx->ssa_alloc));
BITSET_WORD *multiple_uses = calloc(sizeof(BITSET_WORD), BITSET_WORDS(ctx->ssa_alloc));
bi_foreach_instr_global(ctx, I) {
bi_foreach_ssa_src(I, s) {
if (BITSET_TEST(used, I->src[s].value))
BITSET_SET(multiple_uses, I->src[s].value);
else
BITSET_SET(used, I->src[s].value);
}
}
bi_foreach_block(ctx, block) {
bi_foreach_instr_in_block_safe_rev(block, mov) {
/* Match "reg = ssa" */
if (mov->op != BI_OPCODE_MOV_I32) continue;
if (mov->dest[0].type != BI_INDEX_NORMAL) continue;
if (mov->dest[0].value < first_reg) continue;
if (!bi_is_ssa(mov->src[0])) continue;
if (mov->src[0].value >= first_reg) continue;
if (BITSET_TEST(multiple_uses, mov->src[0].value)) continue;
bool found = false;
/* Look locally for the write of the SSA */
/* The scan starts from the end of the block, so it also passes over
 * the instructions after mov and over mov itself */
bi_foreach_instr_in_block_rev(block, I) {
bool bail = false;
bi_foreach_src(I, s) {
/* Bail: write-after-read */
if (bi_is_equiv(I->src[s], mov->dest[0]))
bail = true;
}
if (bail)
break;
bi_foreach_dest(I, d) {
/* Bail: write-after-write */
/* NOTE(review): this break leaves only the destination loop;
 * the scan over earlier instructions continues */
if (bi_is_equiv(I->dest[d], mov->dest[0]))
break;
if (!bi_is_equiv(I->dest[d], mov->src[0]))
continue;
/* We found it, replace */
I->dest[d] = bi_replace_index(I->dest[d], mov->dest[0]);
found = true;
break;
}
if (found)
break;
}
/* The defining instruction now writes reg directly, so the move is
 * redundant */
if (found)
bi_remove_instruction(mov);
}
}
free(used);
free(multiple_uses);
return first_reg;
}
/* Entry point: allocate registers for the whole shader.
 *
 * The program is first prepared (64-bit split lowering for arch >= 9, phi
 * lowering, vector lowering, tied operand coalescing, index compaction). For
 * arch >= 7 an attempt is made with the reduced register file; on success
 * work_reg_count is 32. Otherwise allocation is retried with the full file,
 * spilling one node per failed attempt, for at most 1000 iterations; on
 * success work_reg_count is 64. Finally ctx->info.tls_size is updated with
 * the spill space used and the chosen registers are written into the
 * instructions.
 */
void
bi_register_allocate(bi_context *ctx)
{
struct lcra_state *l = NULL;
bool success = false;
unsigned iter_count = 1000; /* max iterations */
/* Number of bytes of memory we've spilled into */
unsigned spill_count = ctx->info.tls_size;
if (ctx->arch >= 9)
va_lower_split_64bit(ctx);
/* Lower tied operands. SSA is broken from here on. */
unsigned first_reg = bi_out_of_ssa(ctx);
bi_lower_vector(ctx, first_reg);
bi_coalesce_tied(ctx);
squeeze_index(ctx);
/* Try with reduced register pressure to improve thread count */
if (ctx->arch >= 7) {
l = bi_allocate_registers(ctx, &success, false);
if (success) {
ctx->info.work_reg_count = 32;
} else {
/* Discard the failed attempt; the loop below starts afresh */
lcra_free(l);
l = NULL;
}
}
/* Otherwise, use the register file and spill until we succeed */
while (!success && ((iter_count--) > 0)) {
l = bi_allocate_registers(ctx, &success, true);
if (success) {
ctx->info.work_reg_count = 64;
} else {
/* The spill node must be chosen before the solver state is freed */
signed spill_node = bi_choose_spill_node(ctx, l);
lcra_free(l);
l = NULL;
if (spill_node == -1)
unreachable("Failed to choose spill node\n");
if (ctx->inputs->is_blend)
unreachable("Blend shaders may not spill");
/* By default, we use packed TLS addressing on Valhall.
* We cannot cross 16 byte boundaries with packed TLS
* addressing. Align to ensure this doesn't happen. This
* could be optimized a bit.
*/
if (ctx->arch >= 9)
spill_count = ALIGN_POT(spill_count, 16);
/* The spilled node is placed at the current end of the spill area */
spill_count += bi_spill_register(ctx,
bi_get_index(spill_node), spill_count);
/* In case the spill affected an instruction with tied
* operands, we need to fix up.
*/
bi_coalesce_tied(ctx);
}
}
/* NOTE(review): if the iteration limit is reached, l is NULL here and only
 * these asserts guard the bi_install_registers call below */
assert(success);
assert(l != NULL);
ctx->info.tls_size = spill_count;
bi_install_registers(ctx, l);
lcra_free(l);
}