From 21510c253ca8f381fb39d365eb0770b47a44add0 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Fri, 19 Jul 2019 13:21:11 -0700
Subject: [PATCH] panfrost/midgard: Implement register spilling

Now that we run RA in a loop, after each failed allocation we choose a
spill node and spill it to Thread Local Storage before the next
iteration, using st_int4/ld_int4 instructions for the spills and fills
respectively.

This allows us to compile complex shaders that would otherwise not fit
within the 16-work-register limit, although it comes at a fairly steep
performance penalty.

Signed-off-by: Alyssa Rosenzweig
---
 src/gallium/drivers/panfrost/pan_context.c |   3 +-
 src/panfrost/midgard/compiler.h            |   3 -
 src/panfrost/midgard/midgard_ra.c          |  47 +-----
 src/panfrost/midgard/midgard_schedule.c    | 159 +++++++++++++++++++--
 4 files changed, 158 insertions(+), 54 deletions(-)

diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c
index 26e7fca1d2f..d20f0185a42 100644
--- a/src/gallium/drivers/panfrost/pan_context.c
+++ b/src/gallium/drivers/panfrost/pan_context.c
@@ -158,6 +158,7 @@ panfrost_emit_mfbd(struct panfrost_context *ctx, unsigned vertex_count)
         unsigned height = ctx->pipe_framebuffer.height;
 
         struct bifrost_framebuffer framebuffer = {
+                .unk0 = 0x1e5, /* 1e4 if no spill */
                 .width1 = MALI_POSITIVE(width),
                 .height1 = MALI_POSITIVE(height),
                 .width2 = MALI_POSITIVE(width),
@@ -2663,7 +2664,7 @@ panfrost_setup_hardware(struct panfrost_context *ctx)
         struct pipe_context *gallium = (struct pipe_context *) ctx;
         struct panfrost_screen *screen = pan_screen(gallium->screen);
 
-        panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64, false, 0, 0, 0);
+        panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64*4, false, 0, 0, 0);
         panfrost_drm_allocate_slab(screen, &ctx->shaders, 4096, true, PAN_ALLOCATE_EXECUTE, 0, 0);
         panfrost_drm_allocate_slab(screen, &ctx->tiler_heap, 4096, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
         panfrost_drm_allocate_slab(screen, &ctx->tiler_polygon_list, 128*128, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
diff --git a/src/panfrost/midgard/compiler.h b/src/panfrost/midgard/compiler.h
index 3002a079dea..91ca185d628 100644
--- a/src/panfrost/midgard/compiler.h
+++ b/src/panfrost/midgard/compiler.h
@@ -429,9 +429,6 @@ mir_has_arg(midgard_instruction *ins, unsigned arg)
         return false;
 }
 
-midgard_instruction
-v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store);
-
 /* Scheduling */
 
 void schedule_program(compiler_context *ctx);
diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c
index dcae8183513..fdd222b88a1 100644
--- a/src/panfrost/midgard/midgard_ra.c
+++ b/src/panfrost/midgard/midgard_ra.c
@@ -113,31 +113,6 @@ compose_swizzle(unsigned swizzle, unsigned mask,
         return shifted;
 }
 
-/* When we're 'squeezing down' the values in the IR, we maintain a hash
- * as such */
-
-static unsigned
-find_or_allocate_temp(compiler_context *ctx, unsigned hash)
-{
-        if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
-                return hash;
-
-        unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
-                        ctx->hash_to_temp, hash + 1);
-
-        if (temp)
-                return temp - 1;
-
-        /* If no temp is find, allocate one */
-        temp = ctx->temp_count++;
-        ctx->max_hash = MAX2(ctx->max_hash, hash);
-
-        _mesa_hash_table_u64_insert(ctx->hash_to_temp,
-                                    hash + 1, (void *) ((uintptr_t) temp + 1));
-
-        return temp;
-}
-
 /* Helper to return the default phys_reg for a given register */
 
 static struct phys_reg
@@ -242,21 +217,7 @@ allocate_registers(compiler_context *ctx, bool *spilled)
         /* We're done setting up */
         ra_set_finalize(regs, NULL);
 
-        /* Transform the MIR into squeezed index form */
-        mir_foreach_block(ctx, block) {
-                mir_foreach_instr_in_block(block, ins) {
-                        if (ins->compact_branch) continue;
-
-                        ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
-                        ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
-
-                        if (!ins->ssa_args.inline_constant)
-                                ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
-
-                }
-        }
-
-        /* No register allocation to do with no SSA */
+        /* No register allocation to do with no SSA */
         if (!ctx->temp_count)
                 return NULL;
@@ -381,9 +342,13 @@ allocate_registers(compiler_context *ctx, bool *spilled)
 
         if (!ra_allocate(g)) {
                 *spilled = true;
-                return NULL;
+        } else {
+                *spilled = false;
         }
 
+        /* Whether we were successful or not, report the graph so we can
+         * compute spill nodes */
+
         return g;
 }
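The allocate_registers() change above inverts the old contract: instead of
returning NULL on a failed allocation, the function now always hands back the
interference graph and reports success through *spilled, so the caller can
query the graph for a spill candidate and retry. A minimal sketch of the
intended caller-side pattern, using the util/register_allocate calls exactly
as the scheduler below does:

        bool spilled = false;
        struct ra_graph *g = allocate_registers(ctx, &spilled);

        if (spilled) {
                /* Allocation failed, but g is still valid: mark every node
                 * as equally spillable, then ask the allocator to pick */
                for (unsigned i = 0; i < ctx->temp_count; ++i)
                        ra_set_node_spill_cost(g, i, 1.0);

                int spill_node = ra_get_best_spill_node(g);
                /* ... spill spill_node to TLS and run RA again ... */
        }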
diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c
index db87ab65f7f..5c03c53023a 100644
--- a/src/panfrost/midgard/midgard_schedule.c
+++ b/src/panfrost/midgard/midgard_schedule.c
@@ -24,6 +24,7 @@
 #include "compiler.h"
 #include "midgard_ops.h"
 #include "util/u_memory.h"
+#include "util/register_allocate.h"
 
 /* Create a mask of accessed components from a swizzle to figure out vector
  * dependencies */
@@ -575,15 +576,66 @@ midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
         }
 }
 
-midgard_instruction
-v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store)
+/* When we're 'squeezing down' the values in the IR, we maintain a hash
+ * as such */
+
+static unsigned
+find_or_allocate_temp(compiler_context *ctx, unsigned hash)
+{
+        if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
+                return hash;
+
+        unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
+                        ctx->hash_to_temp, hash + 1);
+
+        if (temp)
+                return temp - 1;
+
+        /* If no temp is found, allocate one */
+        temp = ctx->temp_count++;
+        ctx->max_hash = MAX2(ctx->max_hash, hash);
+
+        _mesa_hash_table_u64_insert(ctx->hash_to_temp,
+                                    hash + 1, (void *) ((uintptr_t) temp + 1));
+
+        return temp;
+}
+
+/* Reassigns numbering to get rid of gaps in the indices */
+
+static void
+mir_squeeze_index(compiler_context *ctx)
+{
+        /* Reset */
+        ctx->temp_count = 0;
+        /* TODO don't leak old hash_to_temp */
+        ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
+
+        mir_foreach_instr_global(ctx, ins) {
+                if (ins->compact_branch) continue;
+
+                ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
+                ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
+
+                if (!ins->ssa_args.inline_constant)
+                        ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
+
+        }
+}
+
+static midgard_instruction
+v_load_store_scratch(
+                unsigned srcdest,
+                unsigned index,
+                bool is_store,
+                unsigned mask)
 {
         /* We index by 32-bit vec4s */
         unsigned byte = (index * 4 * 4);
 
         midgard_instruction ins = {
                 .type = TAG_LOAD_STORE_4,
-                .mask = 0xF,
+                .mask = mask,
                 .ssa_args = {
                         .dest = -1,
                         .src0 = -1,
@@ -602,10 +654,10 @@ v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store)
                 }
         };
 
-        if (is_store) {
-                assert(srcdest == 26 || srcdest == 27);
-                ins.ssa_args.src0 = SSA_FIXED_REGISTER(srcdest - 26);
+        if (is_store) {
+                /* r0 = r26, r1 = r27 */
+                assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
+                ins.ssa_args.src0 = (srcdest == SSA_FIXED_REGISTER(27)) ?
+                        SSA_FIXED_REGISTER(1) : SSA_FIXED_REGISTER(0);
         } else {
                 ins.ssa_args.dest = srcdest;
         }
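v_load_store_scratch() is the sole constructor for spill/fill instructions,
so its calling convention is worth spelling out. Each spill slot holds one
128-bit vec4, hence the byte offset of index * 4 * 4, and a store cannot
read an arbitrary source: the value must already sit in r26 or r27, which
the load/store unit addresses under their r0/r1 aliases. A sketch of both
directions, mirroring the calls made in schedule_program() below (spill_slot
and new_index are hypothetical locals):

        /* Spill: the value was forced into r26 by rewriting its def,
         * and st_int4 sees that register as r0 */
        midgard_instruction st =
                v_load_store_scratch(SSA_FIXED_REGISTER(26), spill_slot, true, 0xF);

        /* Fill: read the slot back into a fresh index, creating a short
         * live range in place of the spilled node's long one */
        midgard_instruction ld =
                v_load_store_scratch(new_index, spill_slot, false, 0xF);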
@@ -618,7 +670,10 @@ schedule_program(compiler_context *ctx)
 {
         struct ra_graph *g = NULL;
         bool spilled = false;
-        int iter_count = 10; /* max iterations */
+        int iter_count = 1000; /* max iterations */
+
+        /* Number of 128-bit slots in memory we've spilled into */
+        unsigned spill_count = 0;
 
         midgard_promote_uniforms(ctx, 8);
 
@@ -627,18 +682,104 @@ schedule_program(compiler_context *ctx)
         }
 
         do {
+                /* If we spill, find the best spill node and spill it */
+
+                unsigned spill_index = ctx->temp_count;
+
+                if (g && spilled) {
+                        /* All nodes are equal in spill cost, but we can't
+                         * spill nodes written to from an unspill */
+
+                        for (unsigned i = 0; i < ctx->temp_count; ++i) {
+                                ra_set_node_spill_cost(g, i, 1.0);
+                        }
+
+                        mir_foreach_instr_global(ctx, ins) {
+                                if (ins->type != TAG_LOAD_STORE_4) continue;
+                                if (ins->load_store.op != midgard_op_ld_int4) continue;
+                                if (ins->load_store.unknown != 0x1EEA) continue;
+                                ra_set_node_spill_cost(g, ins->ssa_args.dest, -1.0);
+                        }
+
+                        int spill_node = ra_get_best_spill_node(g);
+
+                        if (spill_node < 0)
+                                assert(0);
+
+                        /* Allocate TLS slot */
+                        unsigned spill_slot = spill_count++;
+
+                        /* Replace all stores to the spilled node with stores
+                         * to TLS */
+
+                        mir_foreach_instr_global_safe(ctx, ins) {
+                                if (ins->compact_branch) continue;
+                                if (ins->ssa_args.dest != spill_node) continue;
+                                ins->ssa_args.dest = SSA_FIXED_REGISTER(26);
+
+                                midgard_instruction st = v_load_store_scratch(ins->ssa_args.dest, spill_slot, true, ins->mask);
+                                mir_insert_instruction_before(mir_next_op(ins), st);
+                        }
+
+                        /* Insert a load from TLS before the first consecutive
+                         * use of the node, rewriting to use spilled indices to
+                         * break up the live range */
+
+                        mir_foreach_block(ctx, block) {
+                                bool consecutive_skip = false;
+                                unsigned consecutive_index = 0;
+
+                                mir_foreach_instr_in_block(block, ins) {
+                                        if (ins->compact_branch) continue;
+
+                                        if (!mir_has_arg(ins, spill_node)) {
+                                                consecutive_skip = false;
+                                                continue;
+                                        }
+
+                                        if (consecutive_skip) {
+                                                /* Rewrite */
+                                                mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
+                                                continue;
+                                        }
+
+                                        consecutive_index = ++spill_index;
+
+                                        midgard_instruction st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
+                                        midgard_instruction *before = ins;
+
+                                        /* For a csel, go back one more so as not to break up the bundle */
+                                        if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
+                                                before = mir_prev_op(before);
+
+                                        mir_insert_instruction_before(before, st);
+                                        // consecutive_skip = true;
+
+                                        /* Rewrite to use */
+                                        mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
+                                }
+                        }
+                }
+
+                mir_squeeze_index(ctx);
+
+                g = NULL;
+                g = allocate_registers(ctx, &spilled);
+        } while(spilled && ((iter_count--) > 0));
+
         /* We would like to run RA after scheduling, but spilling can
          * complicate this */
 
         mir_foreach_block(ctx, block) {
                 schedule_block(ctx, block);
         }
 
+#if 0
         /* Pipeline registers creation is a prepass before RA */
         mir_create_pipeline_registers(ctx);
+#endif
 
-        g = allocate_registers(ctx, &spilled);
-        } while(spilled && ((iter_count--) > 0));
-
         if (iter_count <= 0) {
                 fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");
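Concretely, the spill rewrite in schedule_program() splits one long live
range into several short ones. A schematic MIR fragment with hypothetical
indices (node 5 is the spill node, TLS slot 0 the slot picked for it):

        /* Before: node 5 is live from its def to its last use */
        add   5, a, b
        mul   c, 5, d
        sub   e, 5, f

        /* After: the def is redirected to r26 and stored with st_int4;
         * each use is preceded by an ld_int4 fill into a fresh index.
         * Since consecutive_skip = true is commented out above, even
         * back-to-back uses each get their own fill */
        add      r26, a, b
        st_int4  r26, slot 0
        ld_int4  6, slot 0
        mul      c, 6, d
        ld_int4  7, slot 0
        sub      e, 7, f

The fills are exactly the ld_int4 instructions with .unknown == 0x1EEA that
the spill-cost pass marks with a negative cost on the next iteration, which
is what keeps the allocator from choosing a node that exists only to carry a
fill.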