broadcom/compiler: refactor compile strategies

Until now, if we couldn't compile at 4 threads we would lower the thread
count with optimizations disabled. However, lowering the thread count
doubles the number of registers available per thread, which alone is
already a big relief for register pressure, so it makes sense to keep
optimizations enabled when we do that and only disable them progressively
afterwards, enabling spilling as a last resort.
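
The idea can be sketched with a small stand-alone toy model (everything
below is made up for illustration: the type names, register counts and
pressure numbers are not the driver's; the real strategy table and
fallback loop are in the v3d_compile() changes in the diff below):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy stand-in for the real strategy table: ordered from "all
 * optimizations on at 4 threads" down to "spill as a last resort". */
struct toy_strategy {
   const char *name;
   unsigned min_threads_for_reg_alloc;
   bool disable_optimizations;
   bool tmu_spilling_allowed;
};

static const struct toy_strategy toy_strategies[] = {
   { "default",                     4, false, false },
   { "disable optimizations",       4, true,  false },
   { "lower thread count",          1, false, false },
   { "disable optimizations (ltc)", 1, true,  false },
   { "spilling",                    1, true,  true  },
};

/* Pretend compile attempt: succeeds when the shader's register pressure
 * fits the registers the strategy grants, or when spilling is allowed. */
static bool
toy_compile(const struct toy_strategy *s, unsigned pressure)
{
   unsigned regs = 64 / s->min_threads_for_reg_alloc; /* fewer threads, more regs */
   if (s->disable_optimizations)
      pressure -= 8; /* disabling optimizations relieves some pressure */
   return s->tmu_spilling_allowed || pressure <= regs;
}

int main(void)
{
   const unsigned pressure = 40; /* made-up register pressure */
   for (size_t i = 0; i < sizeof(toy_strategies) / sizeof(toy_strategies[0]); i++) {
      if (toy_compile(&toy_strategies[i], pressure)) {
         printf("compiled with strategy '%s'\n", toy_strategies[i].name);
         return 0;
      }
      printf("'%s' failed register allocation, falling back\n",
             toy_strategies[i].name);
   }
   return 1;
}

With these made-up numbers the sketch settles on "lower thread count"
with optimizations still enabled, which is the case the previous fallback
path (optimizations disabled as soon as the thread count dropped) could
not reach.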

This can slightly improve performance for some applications. Sponza,
for example, gets a ~1.5% boost. I see several UE4 shaders that also get
compiled to better code at 2 threads with this, but it is more difficult
to assess how much this improves performance in practice due to the large
variance in frame times that we observe with UE4 demos.

Also, if a compile strategy disables an optimization that did not make
any progress in the previous compile attempt, we would end up re-compiling
the exact same shader code and failing again. This patch keeps track of
which strategies won't make progress and skips them in that case to save
some CPU time during shader compiles.

Care should be taken, though, to ensure that we try to compile with the
default NIR scheduler at the minimum thread count at least once, so a
specific strategy is added for this. Otherwise, a shader that uses no
optimizations and fails with the default strategy at 4 threads would skip
directly to the fallback scheduler.

Similarly, we now also explicitly specify which strategies are allowed to do
TMU spills and make sure we take this into account when deciding to skip
strategies. This prevents the case where no optimizations are used in a
shader and we skip directly to the fallback scheduler after failing
compilation at 2 threads with the default NIR scheduler but without trying
to spill first.
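
Put differently, the skip decision boils down to two questions. The
stand-alone sketch below is purely illustrative (the names are made up and
none of this code is in the driver; the real check is the
skip_compile_strategy() helper added in the diff below):

#include <assert.h>
#include <stdbool.h>

/* A strategy may only be skipped when it keeps the previous attempt's
 * spilling behaviour and the optimization it disables made no progress
 * there; otherwise we would either recompile identical code or reach the
 * fallback scheduler without ever trying to spill. */
static bool
may_skip_strategy(bool prev_spilling_allowed, bool cur_spilling_allowed,
                  bool optimization_made_progress)
{
   if (cur_spilling_allowed != prev_spilling_allowed)
      return false; /* spilling behaviour changed: always try it */
   return !optimization_made_progress;
}

int main(void)
{
   /* No progress, same spilling behaviour: skip the recompile. */
   assert(may_skip_strategy(false, false, false));
   /* No progress, but this strategy enables spilling: still try it. */
   assert(!may_skip_strategy(false, true, false));
   return 0;
}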

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10647>
Author: Iago Toral Quiroga
Date:   2021-05-03 10:30:31 +02:00
parent 296fe4daa6
commit d19ce36ff2
4 changed files with 133 additions and 60 deletions

@ -282,6 +282,8 @@ ntq_add_pending_tmu_flush(struct v3d_compile *c,
if (c->disable_tmu_pipelining)
ntq_flush_tmu(c);
else if (c->tmu.flush_count > 1)
c->pipelined_any_tmu = true;
}
enum emit_mode {
@ -1828,10 +1830,13 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
if (c && !c->disable_loop_unrolling &&
s->options->max_unroll_iterations > 0) {
NIR_PASS(progress, s, nir_opt_loop_unroll,
nir_var_shader_in |
nir_var_shader_out |
nir_var_function_temp);
bool local_progress = false;
NIR_PASS(local_progress, s, nir_opt_loop_unroll,
nir_var_shader_in |
nir_var_shader_out |
nir_var_function_temp);
c->unrolled_any_loops |= local_progress;
progress |= local_progress;
}
} while (progress);

@ -646,12 +646,14 @@ struct v3d_compile {
* TMU spills.
*/
bool disable_tmu_pipelining;
bool pipelined_any_tmu;
/* Disable sorting of UBO loads with constant offset. This may
* increase the chances of being able to compile shaders with high
* register pressure.
*/
bool disable_constant_ubo_load_sorting;
bool sorted_any_ubo_loads;
/* Emits ldunif for each new uniform, even if the uniform was already
* emitted in the same block. Useful to compile shaders with high
@ -662,6 +664,7 @@ struct v3d_compile {
/* Disables loop unrolling to reduce register pressure. */
bool disable_loop_unrolling;
bool unrolled_any_loops;
/* Minimum number of threads we are willing to use to register allocate
* a shader with the current compilation strategy. This only prevents
@ -671,6 +674,13 @@ struct v3d_compile {
*/
uint32_t min_threads_for_reg_alloc;
/* Whether TMU spills are allowed. If this is disabled it may cause
* register allocation to fail. We set this to favor other compilation
* strategies that can reduce register pressure and hopefully reduce or
* eliminate TMU spills in the shader.
*/
bool tmu_spilling_allowed;
/* The UBO index and block used with the last unifa load, as well as the
* current unifa offset *after* emitting that load. This is used to skip
* unifa writes (and their 3 delay slot) when the next UBO load reads

@ -526,6 +526,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
void *debug_output_data,
int program_id, int variant_id,
uint32_t min_threads_for_reg_alloc,
bool tmu_spilling_allowed,
bool disable_loop_unrolling,
bool disable_constant_ubo_load_sorting,
bool disable_tmu_pipelining,
@ -543,6 +544,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
c->debug_output_data = debug_output_data;
c->compilation_result = V3D_COMPILATION_SUCCEEDED;
c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
c->tmu_spilling_allowed = tmu_spilling_allowed;
c->fallback_scheduler = fallback_scheduler;
c->disable_tmu_pipelining = disable_tmu_pipelining;
c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
@ -1333,11 +1335,10 @@ v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
static bool
v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
{
bool progress = false;
nir_foreach_function(function, s) {
if (function->impl) {
nir_foreach_block(block, function->impl) {
progress |=
c->sorted_any_ubo_loads |=
v3d_nir_sort_constant_ubo_loads_block(c, block);
}
nir_metadata_preserve(function->impl,
@ -1345,7 +1346,7 @@ v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
nir_metadata_dominance);
}
}
return progress;
return c->sorted_any_ubo_loads;
}
static void
@ -1508,6 +1509,82 @@ int v3d_shaderdb_dump(struct v3d_compile *c,
c->nop_count);
}
/* This is a list of incremental changes to the compilation strategy
* that will be used to try to compile the shader successfully. The
* default strategy is to enable all optimizations which will have
* the highest register pressure but is expected to produce most
* optimal code. Following strategies incrementally disable specific
* optimizations that are known to contribute to register pressure
* in order to be able to compile the shader successfully while meeting
* thread count requirements.
*
* V3D 4.1+ has a min thread count of 2, but we can use 1 here to also
* cover previous hardware as well (meaning that we are not limiting
* register allocation to any particular thread count). This is fine
* because v3d_nir_to_vir will cap this to the actual minimum.
*/
struct v3d_compiler_strategy {
const char *name;
uint32_t min_threads_for_reg_alloc;
bool disable_loop_unrolling;
bool disable_ubo_load_sorting;
bool disable_tmu_pipelining;
bool tmu_spilling_allowed;
} static const strategies[] = {
/*0*/ { "default", 4, false, false, false, false },
/*1*/ { "disable loop unrolling", 4, true, false, false, false },
/*2*/ { "disable UBO load sorting", 4, true, true, false, false },
/*3*/ { "disable TMU pipelining", 4, true, true, true, false },
/*4*/ { "lower thread count", 1, false, false, false, false },
/*5*/ { "disable loop unrolling (ltc)", 1, true, false, false, false },
/*6*/ { "disable UBO load sorting (ltc)", 1, true, true, false, false },
/*7*/ { "disable TMU pipelining (ltc)", 1, true, true, true, true },
/*8*/ { "fallback scheduler", 1, true, true, true, true }
};
/**
* If a particular optimization didn't make any progress during a compile
* attempt, disabling it alone won't allow us to compile the shader successfully,
* since we'll end up with the same code. Detect these scenarios so we can
* avoid wasting time with useless compiles. We should also consider if the
* strategy changes other aspects of the compilation process though, like
* spilling, and not skip it in that case.
*/
static bool
skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
{
/* We decide if we can skip a strategy based on the optimizations that
* were active in the previous strategy, so we should only be calling this
* for strategies after the first.
*/
assert(idx > 0);
/* Don't skip a strategy that changes spilling behavior */
if (strategies[idx].tmu_spilling_allowed !=
strategies[idx - 1].tmu_spilling_allowed) {
return false;
}
switch (idx) {
/* Loop unrolling: skip if we didn't unroll any loops */
case 1:
case 5:
return !c->unrolled_any_loops;
/* UBO load sorting: skip if we didn't sort any loads */
case 2:
case 6:
return !c->sorted_any_ubo_loads;
/* TMU pipelining: skip if we didn't pipeline any TMU ops */
case 3:
case 7:
return !c->pipelined_any_tmu;
/* Lower thread count: skip if we already tried fewer than 4 threads */
case 4:
return c->threads < 4;
default:
return false;
};
}
uint64_t *v3d_compile(const struct v3d_compiler *compiler,
struct v3d_key *key,
struct v3d_prog_data **out_prog_data,
@ -1518,42 +1595,40 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
int program_id, int variant_id,
uint32_t *final_assembly_size)
{
struct v3d_compile *c;
/* This is a list of incremental changes to the compilation strategy
* that will be used to try to compile the shader successfully. The
* default strategy is to enable all optimizations which will have
* the highest register pressure but is expected to produce most
* optimal code. Following strategies incrementally disable specific
* optimizations that are known to contribute to register pressure
* in order to be able to compile the shader successfully while meeting
* thread count requirements.
*
* V3D 4.1+ has a min thread count of 2, but we can use 1 here to also
* cover previous hardware as well (meaning that we are not limiting
* register allocation to any particular thread count). This is fine
* because v3d_nir_to_vir will cap this to the actual minimum.
*/
struct v3d_compiler_strategy {
const char *name;
uint32_t min_threads_for_reg_alloc;
} static const strategies[] = {
{ "default", 4 },
{ "disable loop unrolling", 4 },
{ "disable UBO load sorting", 1 },
{ "disable TMU pipelining", 1 },
{ "fallback scheduler", 1 }
};
struct v3d_compile *c = NULL;
for (int i = 0; i < ARRAY_SIZE(strategies); i++) {
/* Fallback strategy */
if (i > 0) {
assert(c);
if (skip_compile_strategy(c, i))
continue;
char *debug_msg;
int ret = asprintf(&debug_msg,
"Falling back to strategy '%s' for %s",
strategies[i].name,
vir_get_stage_name(c));
if (ret >= 0) {
if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF))
fprintf(stderr, "%s\n", debug_msg);
c->debug_output(debug_msg, c->debug_output_data);
free(debug_msg);
}
vir_compile_destroy(c);
}
c = vir_compile_init(compiler, key, s,
debug_output, debug_output_data,
program_id, variant_id,
strategies[i].min_threads_for_reg_alloc,
i > 0, /* Disable loop unrolling */
i > 1, /* Disable UBO load sorting */
i > 2, /* Disable TMU pipelining */
i > 3 /* Fallback_scheduler */);
strategies[i].tmu_spilling_allowed,
strategies[i].disable_loop_unrolling,
strategies[i].disable_ubo_load_sorting,
strategies[i].disable_tmu_pipelining,
i == ARRAY_SIZE(strategies) - 1);
v3d_attempt_compile(c);
@ -1562,23 +1637,6 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
V3D_COMPILATION_FAILED_REGISTER_ALLOCATION) {
break;
}
/* Fallback strategy */
char *debug_msg;
int ret = asprintf(&debug_msg,
"Falling back to strategy '%s' for %s",
strategies[i + 1].name,
vir_get_stage_name(c));
if (ret >= 0) {
if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF))
fprintf(stderr, "%s\n", debug_msg);
c->debug_output(debug_msg, c->debug_output_data);
free(debug_msg);
}
vir_compile_destroy(c);
}
if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) &&

@ -506,15 +506,15 @@ get_spill_batch_size(struct v3d_compile *c)
return 20;
}
/* Don't emit spills using the TMU until we've dropped thread count first. Also,
* don't spill if we have enabled any other optimization that can lead to
* higher register pressure, such as TMU pipelining, we rather recompile without
* the optimization in that case.
/* Don't emit spills using the TMU until we've dropped thread count first. We
* may also disable spilling when certain optimizations that are known to
* increase register pressure are active, so we favor recompiling with
* optimizations disabled instead of spilling.
*/
static inline bool
tmu_spilling_allowed(struct v3d_compile *c, int thread_index)
{
return thread_index == 0 && c->disable_tmu_pipelining;
return thread_index == 0 && c->tmu_spilling_allowed;
}
#define CLASS_BIT_PHYS (1 << 0)