panfrost/midgard: Implement register spilling

Now that we run RA in a loop, after each failed allocation we choose a
spill node and spill it to Thread Local Storage using st_int4/ld_int4
instructions (for spills and fills, respectively) before retrying.

This allows us to compile complex shaders that would not otherwise fit
within the 16 work register limit, although it comes at a fairly steep
performance penalty.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Alyssa Rosenzweig 2019-07-19 13:21:11 -07:00
parent 533d65786f
commit 21510c253c
4 changed files with 158 additions and 54 deletions
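The heart of the change is the new allocate/spill retry loop in schedule_program. A minimal sketch of its shape, with the inline spill rewrite collapsed into a hypothetical mir_spill_register() helper (the real code, in the last diff below, performs that rewrite inline):

        struct ra_graph *g = NULL;
        bool spilled = false;
        int iter_count = 1000;    /* bail out eventually */
        unsigned spill_count = 0; /* 128-bit TLS slots allocated so far */

        do {
                /* On a failed allocation, pick a spill node and rewrite its
                 * writes/reads to go through TLS (st_int4/ld_int4) */
                if (g && spilled)
                        mir_spill_register(ctx, g, &spill_count); /* hypothetical helper */

                mir_squeeze_index(ctx);
                g = allocate_registers(ctx, &spilled);
        } while (spilled && (iter_count--) > 0);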

View File

@@ -158,6 +158,7 @@ panfrost_emit_mfbd(struct panfrost_context *ctx, unsigned vertex_count)
unsigned height = ctx->pipe_framebuffer.height;
struct bifrost_framebuffer framebuffer = {
.unk0 = 0x1e5, /* 1e4 if no spill */
.width1 = MALI_POSITIVE(width),
.height1 = MALI_POSITIVE(height),
.width2 = MALI_POSITIVE(width),
@@ -2663,7 +2664,7 @@ panfrost_setup_hardware(struct panfrost_context *ctx)
struct pipe_context *gallium = (struct pipe_context *) ctx;
struct panfrost_screen *screen = pan_screen(gallium->screen);
panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64, false, 0, 0, 0);
panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64*4, false, 0, 0, 0);
panfrost_drm_allocate_slab(screen, &ctx->shaders, 4096, true, PAN_ALLOCATE_EXECUTE, 0, 0);
panfrost_drm_allocate_slab(screen, &ctx->tiler_heap, 4096, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
panfrost_drm_allocate_slab(screen, &ctx->tiler_polygon_list, 128*128, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);

View File

@@ -429,9 +429,6 @@ mir_has_arg(midgard_instruction *ins, unsigned arg)
return false;
}
midgard_instruction
v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store);
/* Scheduling */
void schedule_program(compiler_context *ctx);

View File

@@ -113,31 +113,6 @@ compose_swizzle(unsigned swizzle, unsigned mask,
return shifted;
}
/* When we're 'squeezing down' the values in the IR, we maintain a hash
* as such */
static unsigned
find_or_allocate_temp(compiler_context *ctx, unsigned hash)
{
if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
return hash;
unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
ctx->hash_to_temp, hash + 1);
if (temp)
return temp - 1;
/* If no temp is found, allocate one */
temp = ctx->temp_count++;
ctx->max_hash = MAX2(ctx->max_hash, hash);
_mesa_hash_table_u64_insert(ctx->hash_to_temp,
hash + 1, (void *) ((uintptr_t) temp + 1));
return temp;
}
/* Helper to return the default phys_reg for a given register */
static struct phys_reg
@@ -242,21 +217,7 @@ allocate_registers(compiler_context *ctx, bool *spilled)
/* We're done setting up */
ra_set_finalize(regs, NULL);
/* Transform the MIR into squeezed index form */
mir_foreach_block(ctx, block) {
mir_foreach_instr_in_block(block, ins) {
if (ins->compact_branch) continue;
ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
if (!ins->ssa_args.inline_constant)
ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
}
}
/* No register allocation to do with no SSA */
if (!ctx->temp_count)
return NULL;
@@ -381,9 +342,13 @@ allocate_registers(compiler_context *ctx, bool *spilled)
if (!ra_allocate(g)) {
*spilled = true;
return NULL;
} else {
*spilled = false;
}
/* Whether we were successful or not, report the graph so we can
* compute spill nodes */
return g;
}

View File

@@ -24,6 +24,7 @@
#include "compiler.h"
#include "midgard_ops.h"
#include "util/u_memory.h"
#include "util/register_allocate.h"
/* Create a mask of accessed components from a swizzle to figure out vector
* dependencies */
@@ -575,15 +576,66 @@ midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
}
}
midgard_instruction
v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store)
/* When we're 'squeezing down' the values in the IR, we maintain a hash
* as such */
static unsigned
find_or_allocate_temp(compiler_context *ctx, unsigned hash)
{
if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
return hash;
unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
ctx->hash_to_temp, hash + 1);
if (temp)
return temp - 1;
/* If no temp is found, allocate one */
temp = ctx->temp_count++;
ctx->max_hash = MAX2(ctx->max_hash, hash);
_mesa_hash_table_u64_insert(ctx->hash_to_temp,
hash + 1, (void *) ((uintptr_t) temp + 1));
return temp;
}
/* Reassigns numbering to get rid of gaps in the indices */
static void
mir_squeeze_index(compiler_context *ctx)
{
/* Reset */
ctx->temp_count = 0;
/* TODO don't leak old hash_to_temp */
ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
mir_foreach_instr_global(ctx, ins) {
if (ins->compact_branch) continue;
ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
if (!ins->ssa_args.inline_constant)
ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
}
}
static midgard_instruction
v_load_store_scratch(
unsigned srcdest,
unsigned index,
bool is_store,
unsigned mask)
{
/* We index by 32-bit vec4s */
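/* (so spill slot N lives at byte offset N * 16 in TLS) */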
unsigned byte = (index * 4 * 4);
midgard_instruction ins = {
.type = TAG_LOAD_STORE_4,
.mask = 0xF,
.mask = mask,
.ssa_args = {
.dest = -1,
.src0 = -1,
@@ -602,10 +654,10 @@ v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store)
}
};
if (is_store) {
/* r0 = r26, r1 = r27 */
assert(srcdest == 26 || srcdest == 27);
ins.ssa_args.src0 = SSA_FIXED_REGISTER(srcdest - 26);
assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
ins.ssa_args.src0 = (srcdest == SSA_FIXED_REGISTER(27)) ? SSA_FIXED_REGISTER(1) : SSA_FIXED_REGISTER(0);
} else {
ins.ssa_args.dest = srcdest;
}
@@ -618,7 +670,10 @@ schedule_program(compiler_context *ctx)
{
struct ra_graph *g = NULL;
bool spilled = false;
int iter_count = 10; /* max iterations */
int iter_count = 1000; /* max iterations */
/* Number of 128-bit slots in memory we've spilled into */
unsigned spill_count = 0;
midgard_promote_uniforms(ctx, 8);
@@ -627,18 +682,104 @@ schedule_program(compiler_context *ctx)
}
do {
/* If we spill, find the best spill node and spill it */
unsigned spill_index = ctx->temp_count;
if (g && spilled) {
/* All nodes are equal in spill cost, but we can't
* spill nodes written to from an unspill */
for (unsigned i = 0; i < ctx->temp_count; ++i) {
ra_set_node_spill_cost(g, i, 1.0);
}
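/* A fill is an ld_int4 from TLS (unknown == 0x1EEA); give its
 * destination a negative cost so ra_get_best_spill_node skips it */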
mir_foreach_instr_global(ctx, ins) {
if (ins->type != TAG_LOAD_STORE_4) continue;
if (ins->load_store.op != midgard_op_ld_int4) continue;
if (ins->load_store.unknown != 0x1EEA) continue;
ra_set_node_spill_cost(g, ins->ssa_args.dest, -1.0);
}
int spill_node = ra_get_best_spill_node(g);
if (spill_node < 0)
assert(0);
/* Allocate TLS slot */
unsigned spill_slot = spill_count++;
/* Replace all stores to the spilled node with stores
* to TLS */
mir_foreach_instr_global_safe(ctx, ins) {
if (ins->compact_branch) continue;
if (ins->ssa_args.dest != spill_node) continue;
ins->ssa_args.dest = SSA_FIXED_REGISTER(26);
midgard_instruction st = v_load_store_scratch(ins->ssa_args.dest, spill_slot, true, ins->mask);
mir_insert_instruction_before(mir_next_op(ins), st);
}
/* Insert a load from TLS before the first consecutive
* use of the node, rewriting to use spilled indices to
* break up the live range */
mir_foreach_block(ctx, block) {
bool consecutive_skip = false;
unsigned consecutive_index = 0;
mir_foreach_instr_in_block(block, ins) {
if (ins->compact_branch) continue;
if (!mir_has_arg(ins, spill_node)) {
consecutive_skip = false;
continue;
}
if (consecutive_skip) {
/* Rewrite */
mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
continue;
}
consecutive_index = ++spill_index;
midgard_instruction st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
midgard_instruction *before = ins;
/* For a csel, go back one more not to break up the bundle */
if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
before = mir_prev_op(before);
mir_insert_instruction_before(before, st);
// consecutive_skip = true;
/* Rewrite to use */
mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
}
}
}
mir_squeeze_index(ctx);
g = NULL;
g = allocate_registers(ctx, &spilled);
} while(spilled && ((iter_count--) > 0));
/* We would like to run RA after scheduling, but spilling can
* complicate this */
mir_foreach_block(ctx, block) {
schedule_block(ctx, block);
}
#if 0
/* Pipeline registers creation is a prepass before RA */
mir_create_pipeline_registers(ctx);
#endif
g = allocate_registers(ctx, &spilled);
} while(spilled && ((iter_count--) > 0));
if (iter_count <= 0) {
fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");