From 21510c253ca8f381fb39d365eb0770b47a44add0 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Fri, 19 Jul 2019 13:21:11 -0700
Subject: [PATCH] panfrost/midgard: Implement register spilling

Now that we run RA in a loop, after each failed allocation we choose a
spill node and spill it to Thread Local Storage before the next
iteration, using st_int4/ld_int4 instructions for the spills and fills
respectively.

This allows us to compile complex shaders that would otherwise not fit
within the 16-work-register limit, although it comes at a fairly steep
performance penalty.

Signed-off-by: Alyssa Rosenzweig
---
 src/gallium/drivers/panfrost/pan_context.c |   3 +-
 src/panfrost/midgard/compiler.h            |   3 -
 src/panfrost/midgard/midgard_ra.c          |  47 +-----
 src/panfrost/midgard/midgard_schedule.c    | 159 +++++++++++++++++++--
 4 files changed, 158 insertions(+), 54 deletions(-)

diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c
index 26e7fca1d2f..d20f0185a42 100644
--- a/src/gallium/drivers/panfrost/pan_context.c
+++ b/src/gallium/drivers/panfrost/pan_context.c
@@ -158,6 +158,7 @@ panfrost_emit_mfbd(struct panfrost_context *ctx, unsigned vertex_count)
         unsigned height = ctx->pipe_framebuffer.height;
 
         struct bifrost_framebuffer framebuffer = {
+                .unk0 = 0x1e5, /* 1e4 if no spill */
                 .width1 = MALI_POSITIVE(width),
                 .height1 = MALI_POSITIVE(height),
                 .width2 = MALI_POSITIVE(width),
@@ -2663,7 +2664,7 @@ panfrost_setup_hardware(struct panfrost_context *ctx)
         struct pipe_context *gallium = (struct pipe_context *) ctx;
         struct panfrost_screen *screen = pan_screen(gallium->screen);
 
-        panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64, false, 0, 0, 0);
+        panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64*4, false, 0, 0, 0);
         panfrost_drm_allocate_slab(screen, &ctx->shaders, 4096, true, PAN_ALLOCATE_EXECUTE, 0, 0);
         panfrost_drm_allocate_slab(screen, &ctx->tiler_heap, 4096, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
         panfrost_drm_allocate_slab(screen, &ctx->tiler_polygon_list, 128*128, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
diff --git a/src/panfrost/midgard/compiler.h b/src/panfrost/midgard/compiler.h
index 3002a079dea..91ca185d628 100644
--- a/src/panfrost/midgard/compiler.h
+++ b/src/panfrost/midgard/compiler.h
@@ -429,9 +429,6 @@ mir_has_arg(midgard_instruction *ins, unsigned arg)
         return false;
 }
 
-midgard_instruction
-v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store);
-
 /* Scheduling */
 
 void schedule_program(compiler_context *ctx);
diff --git a/src/panfrost/midgard/midgard_ra.c b/src/panfrost/midgard/midgard_ra.c
index dcae8183513..fdd222b88a1 100644
--- a/src/panfrost/midgard/midgard_ra.c
+++ b/src/panfrost/midgard/midgard_ra.c
@@ -113,31 +113,6 @@ compose_swizzle(unsigned swizzle, unsigned mask,
         return shifted;
 }
 
-/* When we're 'squeezing down' the values in the IR, we maintain a hash
- * as such */
-
-static unsigned
-find_or_allocate_temp(compiler_context *ctx, unsigned hash)
-{
-        if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
-                return hash;
-
-        unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
-                        ctx->hash_to_temp, hash + 1);
-
-        if (temp)
-                return temp - 1;
-
-        /* If no temp is find, allocate one */
-        temp = ctx->temp_count++;
-        ctx->max_hash = MAX2(ctx->max_hash, hash);
-
-        _mesa_hash_table_u64_insert(ctx->hash_to_temp,
-                                    hash + 1, (void *) ((uintptr_t) temp + 1));
-
-        return temp;
-}
-
 /* Helper to return the default phys_reg for a given register */
 
 static struct phys_reg
@@ -242,21 +217,7 @@ allocate_registers(compiler_context *ctx, bool *spilled)
         /* We're done setting up */
         ra_set_finalize(regs, NULL);
 
-        /* Transform the MIR into squeezed index form */
-        mir_foreach_block(ctx, block) {
-                mir_foreach_instr_in_block(block, ins) {
-                        if (ins->compact_branch) continue;
-
-                        ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
-                        ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
-
-                        if (!ins->ssa_args.inline_constant)
-                                ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
-
-                }
-        }
-
-        /* No register allocation to do with no SSA */
+        /* No register allocation to do with no SSA */
         if (!ctx->temp_count)
                 return NULL;
@@ -381,9 +342,13 @@ allocate_registers(compiler_context *ctx, bool *spilled)
 
         if (!ra_allocate(g)) {
                 *spilled = true;
-                return NULL;
+        } else {
+                *spilled = false;
         }
 
+        /* Whether we were successful or not, report the graph so we can
+         * compute spill nodes */
+
         return g;
 }
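The allocate_registers() change above inverts the old contract: instead of
returning NULL on a failed allocation, the function now always hands back the
interference graph and reports success through *spilled, so the caller can
query the graph for a spill candidate and retry. A minimal sketch of the
intended caller-side pattern, using the util/register_allocate calls exactly
as the scheduler below does:

        bool spilled = false;
        struct ra_graph *g = allocate_registers(ctx, &spilled);

        if (spilled) {
                /* Allocation failed, but g is still valid: mark every node
                 * as equally spillable, then ask the allocator to pick */
                for (unsigned i = 0; i < ctx->temp_count; ++i)
                        ra_set_node_spill_cost(g, i, 1.0);

                int spill_node = ra_get_best_spill_node(g);
                /* ... spill spill_node to TLS and run RA again ... */
        }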
diff --git a/src/panfrost/midgard/midgard_schedule.c b/src/panfrost/midgard/midgard_schedule.c
index db87ab65f7f..5c03c53023a 100644
--- a/src/panfrost/midgard/midgard_schedule.c
+++ b/src/panfrost/midgard/midgard_schedule.c
@@ -24,6 +24,7 @@
 #include "compiler.h"
 #include "midgard_ops.h"
 #include "util/u_memory.h"
+#include "util/register_allocate.h"
 
 /* Create a mask of accessed components from a swizzle to figure out vector
  * dependencies */
@@ -575,15 +576,66 @@ midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
         }
 }
 
-midgard_instruction
-v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store)
+/* When we're 'squeezing down' the values in the IR, we maintain a hash
+ * as such */
+
+static unsigned
+find_or_allocate_temp(compiler_context *ctx, unsigned hash)
+{
+        if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
+                return hash;
+
+        unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(
+                        ctx->hash_to_temp, hash + 1);
+
+        if (temp)
+                return temp - 1;
+
+        /* If no temp is found, allocate one */
+        temp = ctx->temp_count++;
+        ctx->max_hash = MAX2(ctx->max_hash, hash);
+
+        _mesa_hash_table_u64_insert(ctx->hash_to_temp,
+                                    hash + 1, (void *) ((uintptr_t) temp + 1));
+
+        return temp;
+}
+
+/* Reassigns numbering to get rid of gaps in the indices */
+
+static void
+mir_squeeze_index(compiler_context *ctx)
+{
+        /* Reset */
+        ctx->temp_count = 0;
+        /* TODO don't leak old hash_to_temp */
+        ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
+
+        mir_foreach_instr_global(ctx, ins) {
+                if (ins->compact_branch) continue;
+
+                ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
+                ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
+
+                if (!ins->ssa_args.inline_constant)
+                        ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
+
+        }
+}
+
+static midgard_instruction
+v_load_store_scratch(
+                unsigned srcdest,
+                unsigned index,
+                bool is_store,
+                unsigned mask)
 {
         /* We index by 32-bit vec4s */
         unsigned byte = (index * 4 * 4);
 
         midgard_instruction ins = {
                 .type = TAG_LOAD_STORE_4,
-                .mask = 0xF,
+                .mask = mask,
                 .ssa_args = {
                         .dest = -1,
                         .src0 = -1,
@@ -602,10 +654,10 @@ v_load_store_scratch(unsigned srcdest, unsigned index, bool is_store)
                 }
         };
 
-        if (is_store) {
-                assert(srcdest == 26 || srcdest == 27);
-                ins.ssa_args.src0 = SSA_FIXED_REGISTER(srcdest - 26);
+        if (is_store) {
+                /* r0 = r26, r1 = r27 */
+                assert(srcdest == SSA_FIXED_REGISTER(26) || srcdest == SSA_FIXED_REGISTER(27));
+                ins.ssa_args.src0 = (srcdest == SSA_FIXED_REGISTER(27)) ?
+                        SSA_FIXED_REGISTER(1) : SSA_FIXED_REGISTER(0);
         } else {
                 ins.ssa_args.dest = srcdest;
         }
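v_load_store_scratch() is the sole constructor for spill/fill instructions,
so its calling convention is worth spelling out. Each spill slot holds one
128-bit vec4, hence the byte offset of index * 4 * 4, and a store cannot
read an arbitrary source: the value must already sit in r26 or r27, which
the load/store unit addresses under their r0/r1 aliases. A sketch of both
directions, mirroring the calls made in schedule_program() below (spill_slot
and new_index are hypothetical locals):

        /* Spill: the value was forced into r26 by rewriting its def,
         * and st_int4 sees that register as r0 */
        midgard_instruction st =
                v_load_store_scratch(SSA_FIXED_REGISTER(26), spill_slot, true, 0xF);

        /* Fill: read the slot back into a fresh index, creating a short
         * live range in place of the spilled node's long one */
        midgard_instruction ld =
                v_load_store_scratch(new_index, spill_slot, false, 0xF);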
@@ -618,7 +670,10 @@ schedule_program(compiler_context *ctx)
 {
         struct ra_graph *g = NULL;
         bool spilled = false;
-        int iter_count = 10; /* max iterations */
+        int iter_count = 1000; /* max iterations */
+
+        /* Number of 128-bit slots in memory we've spilled into */
+        unsigned spill_count = 0;
 
         midgard_promote_uniforms(ctx, 8);
 
@@ -627,18 +682,104 @@ schedule_program(compiler_context *ctx)
         }
 
         do {
+                /* If we spill, find the best spill node and spill it */
+
+                unsigned spill_index = ctx->temp_count;
+
+                if (g && spilled) {
+                        /* All nodes are equal in spill cost, but we can't
+                         * spill nodes written to from an unspill */
+
+                        for (unsigned i = 0; i < ctx->temp_count; ++i) {
+                                ra_set_node_spill_cost(g, i, 1.0);
+                        }
+
+                        mir_foreach_instr_global(ctx, ins) {
+                                if (ins->type != TAG_LOAD_STORE_4) continue;
+                                if (ins->load_store.op != midgard_op_ld_int4) continue;
+                                if (ins->load_store.unknown != 0x1EEA) continue;
+                                ra_set_node_spill_cost(g, ins->ssa_args.dest, -1.0);
+                        }
+
+                        int spill_node = ra_get_best_spill_node(g);
+
+                        if (spill_node < 0)
+                                assert(0);
+
+                        /* Allocate TLS slot */
+                        unsigned spill_slot = spill_count++;
+
+                        /* Replace all stores to the spilled node with stores
+                         * to TLS */
+
+                        mir_foreach_instr_global_safe(ctx, ins) {
+                                if (ins->compact_branch) continue;
+                                if (ins->ssa_args.dest != spill_node) continue;
+                                ins->ssa_args.dest = SSA_FIXED_REGISTER(26);
+
+                                midgard_instruction st = v_load_store_scratch(ins->ssa_args.dest, spill_slot, true, ins->mask);
+                                mir_insert_instruction_before(mir_next_op(ins), st);
+                        }
+
+                        /* Insert a load from TLS before the first consecutive
+                         * use of the node, rewriting to use spilled indices to
+                         * break up the live range */
+
+                        mir_foreach_block(ctx, block) {
+                                bool consecutive_skip = false;
+                                unsigned consecutive_index = 0;
+
+                                mir_foreach_instr_in_block(block, ins) {
+                                        if (ins->compact_branch) continue;
+
+                                        if (!mir_has_arg(ins, spill_node)) {
+                                                consecutive_skip = false;
+                                                continue;
+                                        }
+
+                                        if (consecutive_skip) {
+                                                /* Rewrite */
+                                                mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
+                                                continue;
+                                        }
+
+                                        consecutive_index = ++spill_index;
+
+                                        midgard_instruction st = v_load_store_scratch(consecutive_index, spill_slot, false, 0xF);
+                                        midgard_instruction *before = ins;
+
+                                        /* For a csel, go back one more so as not to break up the bundle */
+                                        if (ins->type == TAG_ALU_4 && OP_IS_CSEL(ins->alu.op))
+                                                before = mir_prev_op(before);
+
+                                        mir_insert_instruction_before(before, st);
+                                        // consecutive_skip = true;
+
+                                        /* Rewrite to use */
+                                        mir_rewrite_index_src_single(ins, spill_node, consecutive_index);
+                                }
+                        }
+                }
+
+                mir_squeeze_index(ctx);
+
+                g = NULL;
+                g = allocate_registers(ctx, &spilled);
+        } while(spilled && ((iter_count--) > 0));
+
         /* We would like to run RA after scheduling, but spilling can
          * complicate this */
 
         mir_foreach_block(ctx, block) {
                 schedule_block(ctx, block);
         }
 
+#if 0
         /* Pipeline registers creation is a prepass before RA */
         mir_create_pipeline_registers(ctx);
+#endif
 
-        g = allocate_registers(ctx, &spilled);
-        } while(spilled && ((iter_count--) > 0));
-
         if (iter_count <= 0) {
                 fprintf(stderr, "panfrost: Gave up allocating registers, rendering will be incomplete\n");
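Concretely, the spill rewrite in schedule_program() splits one long live
range into several short ones. A schematic MIR fragment with hypothetical
indices (node 5 is the spill node, TLS slot 0 the slot picked for it):

        /* Before: node 5 is live from its def to its last use */
        add   5, a, b
        mul   c, 5, d
        sub   e, 5, f

        /* After: the def is redirected to r26 and stored with st_int4;
         * each use is preceded by an ld_int4 fill into a fresh index.
         * Since consecutive_skip = true is commented out above, even
         * back-to-back uses each get their own fill */
        add      r26, a, b
        st_int4  r26, slot 0
        ld_int4  6, slot 0
        mul      c, 6, d
        ld_int4  7, slot 0
        sub      e, 7, f

The fills are exactly the ld_int4 instructions with .unknown == 0x1EEA that
the spill-cost pass marks with a negative cost on the next iteration, which
is what keeps the allocator from choosing a node that exists only to carry a
fill.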