panfrost/midgard: Implement register spilling
Now that we run RA in a loop, before each iteration that follows a failed allocation we choose a spill node and spill it to Thread Local Storage, using st_int4 and ld_int4 instructions for the spills and fills respectively. This lets us compile complex shaders that would not otherwise fit within the 16-work-register limit, although it comes at a fairly steep performance penalty.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
parent 533d65786f
commit 21510c253c
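For orientation before the diff itself, the allocate-spill-retry loop this patch builds into schedule_program() looks roughly like the sketch below. This is an illustrative outline only, not the patch: pick_spill_node() and spill_node_to_tls() are hypothetical helpers standing in for the inline logic shown in the hunks that follow, and error handling is omitted.

        /* Sketch of the RA loop with spilling (illustrative, not the literal patch).
         * pick_spill_node() and spill_node_to_tls() are hypothetical stand-ins for
         * the inline code added to schedule_program() in the diff below. */
        struct ra_graph *g = NULL;
        bool spilled = false;
        int iter_count = 1000;     /* safety cap on retries */
        unsigned spill_count = 0;  /* 128-bit TLS slots allocated so far */

        do {
                if (g && spilled) {
                        /* Pick a victim that was not itself written by a fill
                         * (ld_int4 from TLS), then rewrite its defs into st_int4
                         * stores and its uses into ld_int4 loads from a TLS slot. */
                        int spill_node = pick_spill_node(ctx, g);
                        spill_node_to_tls(ctx, spill_node, spill_count++);
                }

                mir_squeeze_index(ctx);   /* renumber temps after rewriting */
                g = allocate_registers(ctx, &spilled);
        } while (spilled && (iter_count--) > 0);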
@@ -158,6 +158,7 @@ panfrost_emit_mfbd(struct panfrost_context *ctx, unsigned vertex_count)
        unsigned height = ctx->pipe_framebuffer.height;

        struct bifrost_framebuffer framebuffer = {
                .unk0 = 0x1e5, /* 1e4 if no spill */
                .width1 = MALI_POSITIVE(width),
                .height1 = MALI_POSITIVE(height),
                .width2 = MALI_POSITIVE(width),
@@ -2663,7 +2664,7 @@ panfrost_setup_hardware(struct panfrost_context *ctx)
        struct pipe_context *gallium = (struct pipe_context *) ctx;
        struct panfrost_screen *screen = pan_screen(gallium->screen);

        panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64, false, 0, 0, 0);
        panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64*4, false, 0, 0, 0);
        panfrost_drm_allocate_slab(screen, &ctx->shaders, 4096, true, PAN_ALLOCATE_EXECUTE, 0, 0);
        panfrost_drm_allocate_slab(screen, &ctx->tiler_heap, 4096, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
        panfrost_drm_allocate_slab(screen, &ctx->tiler_polygon_list, 128*128, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
@@ -429,9 +429,6 @@ mir_has_arg(midgard_instruction *ins, unsigned arg)
        return false;
}

midgard_instruction
v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store);

/* Scheduling */

void schedule_program(compiler_context *ctx);
@@ -113,31 +113,6 @@ compose_swizzle(unsigned swizzle, unsigned mask,
        return shifted;
}

/* When we're 'squeezing down' the values in the IR, we maintain a hash
 * as such */

static unsigned
find_or_allocate_temp(compiler_context *ctx, unsigned hash)
{
        if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
                return hash;

        unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
                                ctx->hash_to_temp, hash + 1);

        if (temp)
                return temp - 1;

        /* If no temp is found, allocate one */
        temp = ctx->temp_count++;
        ctx->max_hash = MAX2(ctx->max_hash, hash);

        _mesa_hash_table_u64_insert(ctx->hash_to_temp,
                                    hash + 1, (void *) ((uintptr_t) temp + 1));

        return temp;
}

/* Helper to return the default phys_reg for a given register */

static struct phys_reg
@@ -242,21 +217,7 @@ allocate_registers(compiler_context *ctx, bool *spilled)
        /* We're done setting up */
        ra_set_finalize(regs, NULL);

        /* Transform the MIR into squeezed index form */
        mir_foreach_block(ctx, block) {
                mir_foreach_instr_in_block(block, ins) {
                        if (ins->compact_branch) continue;

                        ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
                        ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);

                        if (!ins->ssa_args.inline_constant)
                                ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);

                }
        }

        /* No register allocation to do with no SSA */

        if (!ctx->temp_count)
                return NULL;
@@ -381,9 +342,13 @@ allocate_registers(compiler_context *ctx, bool *spilled)

        if (!ra_allocate(g)) {
                *spilled = true;
                return NULL;
        } else {
                *spilled = false;
        }

        /* Whether we were successful or not, report the graph so we can
         * compute spill nodes */

        return g;
}
@@ -24,6 +24,7 @@
#include "compiler.h"
#include "midgard_ops.h"
#include "util/u_memory.h"
#include "util/register_allocate.h"

/* Create a mask of accessed components from a swizzle to figure out vector
 * dependencies */
@@ -575,15 +576,66 @@ midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
        }
}

midgard_instruction
v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store)
/* When we're 'squeezing down' the values in the IR, we maintain a hash
 * as such */

static unsigned
find_or_allocate_temp(compiler_context *ctx, unsigned hash)
{
        if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
                return hash;

        unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
                                ctx->hash_to_temp, hash + 1);

        if (temp)
                return temp - 1;

        /* If no temp is found, allocate one */
        temp = ctx->temp_count++;
        ctx->max_hash = MAX2(ctx->max_hash, hash);

        _mesa_hash_table_u64_insert(ctx->hash_to_temp,
                                    hash + 1, (void *) ((uintptr_t) temp + 1));

        return temp;
}

/* Reassigns numbering to get rid of gaps in the indices */

static void
mir_squeeze_index(compiler_context *ctx)
{
        /* Reset */
        ctx->temp_count = 0;
        /* TODO don't leak old hash_to_temp */
        ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);

        mir_foreach_instr_global(ctx, ins) {
                if (ins->compact_branch) continue;

                ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
                ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);

                if (!ins->ssa_args.inline_constant)
                        ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);

        }
}

static midgard_instruction
v_load_store_scratch(
                unsigned srcdest,
                unsigned index,
                bool is_store,
                unsigned mask)
{
        /* We index by 32-bit vec4s */
        unsigned byte = (index * 4 * 4);

        midgard_instruction ins = {
                .type = TAG_LOAD_STORE_4,
                .mask = 0xF,
                .mask = mask,
                .ssa_args = {
                        .dest = -1,
                        .src0 = -1,
@@ -602,10 +654,10 @@ v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store)
                }
        };

        if (is_store) {
                /* r0 = r26, r1 = r27 */
                assert(srcdest == 26 || srcdest == 27);
                ins.ssa_args.src0 = SSA_FIXED_REGISTER(srcdest - 26);
                assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
                ins.ssa_args.src0 = (srcdest == SSA_FIXED_REGISTER(27)) ? SSA_FIXED_REGISTER(1) : SSA_FIXED_REGISTER(0);
        } else {
                ins.ssa_args.dest = srcdest;
        }
@@ -618,7 +670,10 @@ schedule_program(compiler_context *ctx)
{
        struct ra_graph *g = NULL;
        bool spilled = false;
        int iter_count = 10; /* max iterations */
        int iter_count = 1000; /* max iterations */

        /* Number of 128-bit slots in memory we've spilled into */
        unsigned spill_count = 0;

        midgard_promote_uniforms(ctx, 8);
@@ -627,18 +682,104 @@ schedule_program(compiler_context *ctx)
        }

        do {
                /* If we spill, find the best spill node and spill it */

                unsigned spill_index = ctx->temp_count;
                if (g && spilled) {
                        /* All nodes are equal in spill cost, but we can't
                         * spill nodes written to from an unspill */

                        for (unsigned i = 0; i < ctx->temp_count; ++i) {
                                ra_set_node_spill_cost(g, i, 1.0);
                        }

                        mir_foreach_instr_global(ctx, ins) {
                                if (ins->type != TAG_LOAD_STORE_4) continue;
                                if (ins->load_store.op != midgard_op_ld_int4) continue;
                                if (ins->load_store.unknown != 0x1EEA) continue;
                                ra_set_node_spill_cost(g, ins->ssa_args.dest, -1.0);
                        }

                        int spill_node = ra_get_best_spill_node(g);

                        if (spill_node < 0)
                                assert(0);

                        /* Allocate TLS slot */
                        unsigned spill_slot = spill_count++;

                        /* Replace all stores to the spilled node with stores
                         * to TLS */

                        mir_foreach_instr_global_safe(ctx, ins) {
                                if (ins->compact_branch) continue;
                                if (ins->ssa_args.dest != spill_node) continue;
                                ins->ssa_args.dest = SSA_FIXED_REGISTER(26);

                                midgard_instruction st = v_load_store_scratch(ins->ssa_args.dest, spill_slot, true, ins->mask);
                                mir_insert_instruction_before(mir_next_op(ins), st);
                        }

                        /* Insert a load from TLS before the first consecutive
                         * use of the node, rewriting to use spilled indices to
                         * break up the live range */

                        mir_foreach_block(ctx, block) {

                                bool consecutive_skip = false;
                                unsigned consecutive_index = 0;

                                mir_foreach_instr_in_block(block, ins) {
                                        if (ins->compact_branch) continue;

                                        if (!mir_has_arg(ins, spill_node)) {
                                                consecutive_skip = false;
                                                continue;
                                        }

                                        if (consecutive_skip) {
                                                /* Rewrite */
                                                mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
                                                continue;
                                        }

                                        consecutive_index = ++spill_index;
                                        midgard_instruction st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
                                        midgard_instruction *before = ins;

                                        /* For a csel, go back one more not to break up the bundle */
                                        if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
                                                before = mir_prev_op(before);

                                        mir_insert_instruction_before(before, st);
                                        // consecutive_skip = true;

                                        /* Rewrite to use */
                                        mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
                                }
                        }
                }

                mir_squeeze_index(ctx);

                g = NULL;
                g = allocate_registers(ctx, &spilled);
        } while(spilled && ((iter_count--) > 0));

        /* We would like to run RA after scheduling, but spilling can
         * complicate this */

        mir_foreach_block(ctx, block) {
                schedule_block(ctx, block);
        }
#if 0

        /* Pipeline registers creation is a prepass before RA */
        mir_create_pipeline_registers(ctx);
#endif

                g = allocate_registers(ctx, &spilled);
        } while(spilled && ((iter_count--) > 0));

        if (iter_count <= 0) {
                fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");