panfrost/midgard: Implement register spilling
Now that we run RA in a loop, before each iteration that follows a failed allocation we choose a spill node and spill it to Thread Local Storage, using st_int4 and ld_int4 instructions for the spills and fills respectively. This lets us compile complex shaders that would not otherwise fit within the 16-work-register limit, although it comes at a fairly steep performance penalty.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
parent 533d65786f
commit 21510c253c
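For orientation before the diff itself, the allocate-spill-retry loop this patch builds into schedule_program() looks roughly like the sketch below. This is an illustrative outline only, not the patch: pick_spill_node() and spill_node_to_tls() are hypothetical helpers standing in for the inline logic shown in the hunks that follow, and error handling is omitted.

        /* Sketch of the RA loop with spilling (illustrative, not the literal patch).
         * pick_spill_node() and spill_node_to_tls() are hypothetical stand-ins for
         * the inline code added to schedule_program() in the diff below. */
        struct ra_graph *g = NULL;
        bool spilled = false;
        int iter_count = 1000;     /* safety cap on retries */
        unsigned spill_count = 0;  /* 128-bit TLS slots allocated so far */

        do {
                if (g && spilled) {
                        /* Pick a victim that was not itself written by a fill
                         * (ld_int4 from TLS), then rewrite its defs into st_int4
                         * stores and its uses into ld_int4 loads from a TLS slot. */
                        int spill_node = pick_spill_node(ctx, g);
                        spill_node_to_tls(ctx, spill_node, spill_count++);
                }

                mir_squeeze_index(ctx);   /* renumber temps after rewriting */
                g = allocate_registers(ctx, &spilled);
        } while (spilled && (iter_count--) > 0);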
@@ -158,6 +158,7 @@ panfrost_emit_mfbd(struct panfrost_context *ctx, unsigned vertex_count)
        unsigned height = ctx->pipe_framebuffer.height;

        struct bifrost_framebuffer framebuffer = {
                .unk0 = 0x1e5, /* 1e4 if no spill */
                .width1 = MALI_POSITIVE(width),
                .height1 = MALI_POSITIVE(height),
                .width2 = MALI_POSITIVE(width),
@@ -2663,7 +2664,7 @@ panfrost_setup_hardware(struct panfrost_context *ctx)
        struct pipe_context *gallium = (struct pipe_context *) ctx;
        struct panfrost_screen *screen = pan_screen(gallium->screen);

        panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64, false, 0, 0, 0);
        panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64*4, false, 0, 0, 0);
        panfrost_drm_allocate_slab(screen, &ctx->shaders, 4096, true, PAN_ALLOCATE_EXECUTE, 0, 0);
        panfrost_drm_allocate_slab(screen, &ctx->tiler_heap, 4096, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
        panfrost_drm_allocate_slab(screen, &ctx->tiler_polygon_list, 128*128, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
@@ -429,9 +429,6 @@ mir_has_arg(midgard_instruction *ins, unsigned arg)
        return false;
}

midgard_instruction
v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store);

/* Scheduling */

void schedule_program(compiler_context *ctx);
@@ -113,31 +113,6 @@ compose_swizzle(unsigned swizzle, unsigned mask,
        return shifted;
}

/* When we're 'squeezing down' the values in the IR, we maintain a hash
 * as such */

static unsigned
find_or_allocate_temp(compiler_context *ctx, unsigned hash)
{
        if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
                return hash;

        unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
                                ctx->hash_to_temp, hash + 1);

        if (temp)
                return temp - 1;

        /* If no temp is found, allocate one */
        temp = ctx->temp_count++;
        ctx->max_hash = MAX2(ctx->max_hash, hash);

        _mesa_hash_table_u64_insert(ctx->hash_to_temp,
                                    hash + 1, (void *) ((uintptr_t) temp + 1));

        return temp;
}

/* Helper to return the default phys_reg for a given register */

static struct phys_reg
@@ -242,21 +217,7 @@ allocate_registers(compiler_context *ctx, bool *spilled)
        /* We're done setting up */
        ra_set_finalize(regs, NULL);

        /* Transform the MIR into squeezed index form */
        mir_foreach_block(ctx, block) {
                mir_foreach_instr_in_block(block, ins) {
                        if (ins->compact_branch) continue;

                        ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
                        ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);

                        if (!ins->ssa_args.inline_constant)
                                ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);

                }
        }

        /* No register allocation to do with no SSA */

        if (!ctx->temp_count)
                return NULL;
@@ -381,9 +342,13 @@ allocate_registers(compiler_context *ctx, bool *spilled)

        if (!ra_allocate(g)) {
                *spilled = true;
                return NULL;
        } else {
                *spilled = false;
        }

        /* Whether we were successful or not, report the graph so we can
         * compute spill nodes */

        return g;
}
@@ -24,6 +24,7 @@
#include "compiler.h"
#include "midgard_ops.h"
#include "util/u_memory.h"
#include "util/register_allocate.h"

/* Create a mask of accessed components from a swizzle to figure out vector
 * dependencies */
@@ -575,15 +576,66 @@ midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
        }
}

midgard_instruction
v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store)
/* When we're 'squeezing down' the values in the IR, we maintain a hash
 * as such */

static unsigned
find_or_allocate_temp(compiler_context *ctx, unsigned hash)
{
        if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
                return hash;

        unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
                                ctx->hash_to_temp, hash + 1);

        if (temp)
                return temp - 1;

        /* If no temp is found, allocate one */
        temp = ctx->temp_count++;
        ctx->max_hash = MAX2(ctx->max_hash, hash);

        _mesa_hash_table_u64_insert(ctx->hash_to_temp,
                                    hash + 1, (void *) ((uintptr_t) temp + 1));

        return temp;
}

/* Reassigns numbering to get rid of gaps in the indices */

static void
mir_squeeze_index(compiler_context *ctx)
{
        /* Reset */
        ctx->temp_count = 0;
        /* TODO don't leak old hash_to_temp */
        ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);

        mir_foreach_instr_global(ctx, ins) {
                if (ins->compact_branch) continue;

                ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
                ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);

                if (!ins->ssa_args.inline_constant)
                        ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);

        }
}

static midgard_instruction
v_load_store_scratch(
                unsigned srcdest,
                unsigned index,
                bool is_store,
                unsigned mask)
{
        /* We index by 32-bit vec4s */
        unsigned byte = (index * 4 * 4);

        midgard_instruction ins = {
                .type = TAG_LOAD_STORE_4,
                .mask = 0xF,
                .mask = mask,
                .ssa_args = {
                        .dest = -1,
                        .src0 = -1,
@@ -602,10 +654,10 @@ v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store)
                }
        };

        if (is_store) {
                /* r0 = r26, r1 = r27 */
                assert(srcdest == 26 || srcdest == 27);
                ins.ssa_args.src0 = SSA_FIXED_REGISTER(srcdest - 26);
                assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
                ins.ssa_args.src0 = (srcdest == SSA_FIXED_REGISTER(27)) ? SSA_FIXED_REGISTER(1) : SSA_FIXED_REGISTER(0);
        } else {
                ins.ssa_args.dest = srcdest;
        }
@@ -618,7 +670,10 @@ schedule_program(compiler_context *ctx)
{
        struct ra_graph *g = NULL;
        bool spilled = false;
        int iter_count = 10; /* max iterations */
        int iter_count = 1000; /* max iterations */

        /* Number of 128-bit slots in memory we've spilled into */
        unsigned spill_count = 0;

        midgard_promote_uniforms(ctx, 8);
@@ -627,18 +682,104 @@ schedule_program(compiler_context *ctx)
        }

        do {
                /* If we spill, find the best spill node and spill it */

                unsigned spill_index = ctx->temp_count;
                if (g && spilled) {
                        /* All nodes are equal in spill cost, but we can't
                         * spill nodes written to from an unspill */

                        for (unsigned i = 0; i < ctx->temp_count; ++i) {
                                ra_set_node_spill_cost(g, i, 1.0);
                        }

                        mir_foreach_instr_global(ctx, ins) {
                                if (ins->type != TAG_LOAD_STORE_4) continue;
                                if (ins->load_store.op != midgard_op_ld_int4) continue;
                                if (ins->load_store.unknown != 0x1EEA) continue;
                                ra_set_node_spill_cost(g, ins->ssa_args.dest, -1.0);
                        }

                        int spill_node = ra_get_best_spill_node(g);

                        if (spill_node < 0)
                                assert(0);

                        /* Allocate TLS slot */
                        unsigned spill_slot = spill_count++;

                        /* Replace all stores to the spilled node with stores
                         * to TLS */

                        mir_foreach_instr_global_safe(ctx, ins) {
                                if (ins->compact_branch) continue;
                                if (ins->ssa_args.dest != spill_node) continue;
                                ins->ssa_args.dest = SSA_FIXED_REGISTER(26);

                                midgard_instruction st = v_load_store_scratch(ins->ssa_args.dest, spill_slot, true, ins->mask);
                                mir_insert_instruction_before(mir_next_op(ins), st);
                        }

                        /* Insert a load from TLS before the first consecutive
                         * use of the node, rewriting to use spilled indices to
                         * break up the live range */

                        mir_foreach_block(ctx, block) {

                                bool consecutive_skip = false;
                                unsigned consecutive_index = 0;

                                mir_foreach_instr_in_block(block, ins) {
                                        if (ins->compact_branch) continue;

                                        if (!mir_has_arg(ins, spill_node)) {
                                                consecutive_skip = false;
                                                continue;
                                        }

                                        if (consecutive_skip) {
                                                /* Rewrite */
                                                mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
                                                continue;
                                        }

                                        consecutive_index = ++spill_index;
                                        midgard_instruction st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
                                        midgard_instruction *before = ins;

                                        /* For a csel, go back one more not to break up the bundle */
                                        if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
                                                before = mir_prev_op(before);

                                        mir_insert_instruction_before(before, st);
                                        // consecutive_skip = true;

                                        /* Rewrite to use */
                                        mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
                                }
                        }
                }

                mir_squeeze_index(ctx);

                g = NULL;
                g = allocate_registers(ctx, &spilled);
        } while(spilled && ((iter_count--) > 0));

        /* We would like to run RA after scheduling, but spilling can
         * complicate this */

        mir_foreach_block(ctx, block) {
                schedule_block(ctx, block);
        }
#if 0

        /* Pipeline registers creation is a prepass before RA */
        mir_create_pipeline_registers(ctx);
#endif

                g = allocate_registers(ctx, &spilled);
        } while(spilled && ((iter_count--) > 0));

        if (iter_count <= 0) {
                fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");