From 40e091267dd02d729cc6d12d190309f103217111 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga
Date: Fri, 4 Feb 2022 13:40:50 +0100
Subject: [PATCH] broadcom/compiler: define max number of tmu spills for
 compile strategies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of whether they are allowed to spill or not. This is more
flexible.

Also, while we are not currently enabling spilling on any 4-thread
strategies, should we do that in the future, always prefer a 4-thread
compile.

Reviewed-by: Alejandro Piñeiro
Part-of:
---
 src/broadcom/compiler/nir_to_vir.c            |  2 +-
 src/broadcom/compiler/v3d_compiler.h          |  2 +-
 src/broadcom/compiler/vir.c                   | 41 +++++++++++--------
 src/broadcom/compiler/vir_register_allocate.c | 11 ++---
 4 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index abfbecc502f..b645ef5790a 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -4473,7 +4473,7 @@ v3d_nir_to_vir(struct v3d_compile *c)
         while (true) {
                 bool spilled;
                 temp_registers = v3d_register_allocate(c, &spilled);
-                if (spilled)
+                if (spilled && c->spills + c->fills <= c->max_tmu_spills)
                         continue;
 
                 if (temp_registers)
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index e42ea184c45..844a9603606 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -710,7 +710,7 @@ struct v3d_compile {
          * strategies that can reduce register pressure and hopefully reduce or
          * eliminate TMU spills in the shader.
          */
-        bool tmu_spilling_allowed;
+        uint32_t max_tmu_spills;
 
         /* The UBO index and block used with the last unifa load, as well as the
          * current unifa offset *after* emitting that load. This is used to skip
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index a08206fdf30..4753b2e274e 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -541,7 +541,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
                  int program_id, int variant_id,
                  uint32_t max_threads,
                  uint32_t min_threads_for_reg_alloc,
-                 bool tmu_spilling_allowed,
+                 uint32_t max_tmu_spills,
                  bool disable_loop_unrolling,
                  bool disable_constant_ubo_load_sorting,
                  bool disable_tmu_pipelining,
@@ -559,7 +559,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
         c->debug_output_data = debug_output_data;
         c->compilation_result = V3D_COMPILATION_SUCCEEDED;
         c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
-        c->tmu_spilling_allowed = tmu_spilling_allowed;
+        c->max_tmu_spills = max_tmu_spills;
         c->fallback_scheduler = fallback_scheduler;
         c->disable_tmu_pipelining = disable_tmu_pipelining;
         c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
@@ -1624,17 +1624,17 @@ struct v3d_compiler_strategy {
         bool disable_loop_unrolling;
         bool disable_ubo_load_sorting;
         bool disable_tmu_pipelining;
-        bool tmu_spilling_allowed;
+        uint32_t max_tmu_spills;
 } static const strategies[] = {
-        /*0*/ { "default",                        4, 4, false, false, false, false },
-        /*1*/ { "disable loop unrolling",         4, 4, true,  false, false, false },
-        /*2*/ { "disable UBO load sorting",       4, 4, true,  true,  false, false },
-        /*3*/ { "disable TMU pipelining",         4, 4, true,  true,  true,  false },
-        /*4*/ { "lower thread count",             2, 1, false, false, false, true },
-        /*5*/ { "disable loop unrolling (ltc)",   2, 1, true,  false, false, true },
-        /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true,  true,  false, true },
-        /*7*/ { "disable TMU pipelining (ltc)",   2, 1, true,  true,  true,  true },
-        /*8*/ { "fallback scheduler",             2, 1, true,  true,  true,  true }
+        /*0*/ { "default",                        4, 4, false, false, false,  0 },
+        /*1*/ { "disable loop unrolling",         4, 4, true,  false, false,  0 },
+        /*2*/ { "disable UBO load sorting",       4, 4, true,  true,  false,  0 },
+        /*3*/ { "disable TMU pipelining",         4, 4, true,  true,  true,   0 },
+        /*4*/ { "lower thread count",             2, 1, false, false, false, -1 },
+        /*5*/ { "disable loop unrolling (ltc)",   2, 1, true,  false, false, -1 },
+        /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true,  true,  false, -1 },
+        /*7*/ { "disable TMU pipelining (ltc)",   2, 1, true,  true,  true,  -1 },
+        /*8*/ { "fallback scheduler",             2, 1, true,  true,  true,  -1 }
 };
 
 /**
@@ -1655,8 +1655,8 @@ skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
         assert(idx > 0);
 
         /* Don't skip a strategy that changes spilling behavior */
-        if (strategies[idx].tmu_spilling_allowed !=
-            strategies[idx - 1].tmu_spilling_allowed) {
+        if (strategies[idx].max_tmu_spills !=
+            strategies[idx - 1].max_tmu_spills) {
                 return false;
         }
 
@@ -1726,7 +1726,7 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                                          program_id, variant_id,
                                          strategies[strat].max_threads,
                                          strategies[strat].min_threads,
-                                         strategies[strat].tmu_spilling_allowed,
+                                         strategies[strat].max_tmu_spills,
                                          strategies[strat].disable_loop_unrolling,
                                          strategies[strat].disable_ubo_load_sorting,
                                          strategies[strat].disable_tmu_pipelining,
@@ -1738,11 +1738,16 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                 if (c->compilation_result == V3D_COMPILATION_FAILED)
                         break;
 
-                /* If we compiled without spills, choose this. Otherwise keep
-                 * going and track strategy with less spilling.
+                /* If we compiled without spills, choose this.
+                 * Otherwise if this is a 4-thread compile, choose this (these
+                 * have a very low cap on the allowed TMU spills so we assume
+                 * it will be better than a 2-thread compile without spills).
+                 * Otherwise, keep going while tracking the strategy with the
+                 * lowest spill count.
                  */
                 if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) {
-                        if (c->spills == 0) {
+                        if (c->spills == 0 ||
+                            strategies[strat].min_threads == 4) {
                                 best_c = c;
                                 break;
                         } else if (c->spills + c->fills <
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index e26b790c946..05b71e3369a 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -561,15 +561,10 @@ get_spill_batch_size(struct v3d_compile *c)
         return 20;
 }
 
-/* Don't emit spills using the TMU until we've dropped thread count first. We,
- * may also disable spilling when certain optimizations that are known to
- * increase register pressure are active so we favor recompiling with
- * optimizations disabled instead of spilling.
- */
 static inline bool
-tmu_spilling_allowed(struct v3d_compile *c, int thread_index)
+tmu_spilling_allowed(struct v3d_compile *c)
 {
-        return thread_index == 0 && c->tmu_spilling_allowed;
+        return c->spills + c->fills < c->max_tmu_spills;
 }
 
 #define CLASS_BIT_PHYS (1 << 0)
@@ -818,7 +813,7 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
                 if (i > 0 && !is_uniform)
                         break;
 
-                if (is_uniform || tmu_spilling_allowed(c, thread_index)) {
+                if (is_uniform || tmu_spilling_allowed(c)) {
                         v3d_spill_reg(c, map[node].temp);
 
                         /* Ask the outer loop to call back in. */