broadcom/compiler: change register allocation policy for accumulators

The current policy is to always favor accumulators if possible; however, this is not always optimal. Particularly, accumulators play a crucial role in enabling QPU instruction merges, since these are limited to both the ADD and the MUL instructions addressing at most 2 physical registers. For 2-src instructions, this means that to be able to merge we need them to address at least 2 accumulators. While favoring accumulators does help the case for instruction merges in general, it is risky to assign accumulators to variables that have long life spans. Doing so will make the accumulator unavailable for any other instructions during that life span, and since we only have a few accumulators, we can quickly run out and lose our capacity to merge instructions for large parts of the QPU program. On the other hand, we also want to avoid the extreme case where we keep allocating physical registers to the point we run out, even if we have accumulators available, since accumulators have additional restrictions and may not be suitable for everything. This change continues the policy of favoring accumulators, but it only does so if the life span of the temps is short, to ensure that we can recycle accumulators often across instructions and avoid running out for sections of the QPU code, unless we are already running out of physical registers. total instructions in shared programs: 13654647 -> 13336921 (-2.33%) instructions in affected programs: 11015919 -> 10698193 (-2.88%) helped: 39758 HURT: 17325 Instructions are helped. total threads in shared programs: 412046 -> 412038 (<.01%) threads in affected programs: 16 -> 8 (-50.00%) helped: 0 HURT: 4 Threads are HURT. total uniforms in shared programs: 3745726 -> 3746003 (<.01%) uniforms in affected programs: 17296 -> 17573 (1.60%) helped: 76 HURT: 99 Uniforms are HURT. 
total max-temps in shared programs: 2364430 -> 2359942 (-0.19%) max-temps in affected programs: 109117 -> 104629 (-4.11%) helped: 2893 HURT: 772 Max-temps are helped. total spills in shared programs: 5727 -> 5746 (0.33%) spills in affected programs: 221 -> 240 (8.60%) helped: 1 HURT: 2 total fills in shared programs: 13121 -> 13139 (0.14%) fills in affected programs: 466 -> 484 (3.86%) helped: 1 HURT: 2 total sfu-stalls in shared programs: 33432 -> 34491 (3.17%) sfu-stalls in affected programs: 18219 -> 19278 (5.81%) helped: 4459 HURT: 5087 Inconclusive result total inst-and-stalls in shared programs: 13688079 -> 13371412 (-2.31%) inst-and-stalls in affected programs: 11030017 -> 10713350 (-2.87%) helped: 39630 HURT: 17429 Inst-and-stalls are helped. total nops in shared programs: 335753 -> 333708 (-0.61%) nops in affected programs: 112659 -> 110614 (-1.82%) helped: 8726 HURT: 7383 Inconclusive result Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10686>
2021-05-06 11:32:46 +02:00 · 2021-05-06 11:32:46 +02:00 · d81a6e5f1d
parent 24043d215f
commit d81a6e5f1d
1 changed files with 100 additions and 27 deletions
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@ -372,11 +372,97 @@ v3d_spill_reg(struct v3d_compile *c, int spill_temp)
        c->disable_ldunif_opt = had_disable_ldunif_opt;
 }

+struct node_to_temp_map {
+        uint32_t temp;
+        uint32_t priority;
+};
+
 struct v3d_ra_select_callback_data {
        uint32_t next_acc;
        uint32_t next_phys;
+        struct node_to_temp_map *map;
 };

+/* Choosing accumulators improves chances of merging QPU instructions
+ * due to these merges requiring that at most 2 rf registers are used
+ * by the add and mul instructions.
+ */
+static bool
+v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
+                   BITSET_WORD *regs,
+                   int priority)
+{
+        /* Favor accumulators if we have less than this number of physical
+         * registers. Accumulators have more restrictions (like being
+         * invalidated through thrsw), so running out of physical registers
+         * even if we have accumulators available can lead to register
+         * allocation failures.
+         */
+        static const int available_rf_threshold = 5;
+        int available_rf = 0 ;
+        for (int i = 0; i < PHYS_COUNT; i++) {
+                if (BITSET_TEST(regs, PHYS_INDEX + i))
+                        available_rf++;
+                if (available_rf >= available_rf_threshold)
+                        break;
+        }
+        if (available_rf < available_rf_threshold)
+                return true;
+
+        /* Favor accumulators for short-lived temps (our priority represents
+         * liveness), to prevent long-lived temps from grabbing accumulators
+         * and preventing follow-up instructions from using them, potentially
+         * leading to large portions of the shader being unable to use
+         * accumulators and therefore merge instructions successfully.
+         */
+        static const int priority_threshold = 20;
+        if (priority <= priority_threshold)
+                return true;
+
+        return false;
+}
+
+static bool
+v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
+                    BITSET_WORD *regs,
+                    unsigned int *out)
+{
+        /* Round-robin through our accumulators to give post-RA instruction
+         * selection more options.
+         */
+        for (int i = 0; i < ACC_COUNT; i++) {
+                int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
+                int acc = ACC_INDEX + acc_off;
+
+                if (BITSET_TEST(regs, acc)) {
+                        v3d_ra->next_acc = acc_off + 1;
+                        *out = acc;
+                        return true;
+                }
+        }
+
+        return false;
+}
+
+static bool
+v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
+                 BITSET_WORD *regs,
+                 unsigned int *out)
+{
+        for (int i = 0; i < PHYS_COUNT; i++) {
+                int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
+                int phys = PHYS_INDEX + phys_off;
+
+                if (BITSET_TEST(regs, phys)) {
+                        v3d_ra->next_phys = phys_off + 1;
+                        *out = phys;
+                        return true;
+                }
+        }
+
+        return false;
+}
+
 static unsigned int
 v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
 {
@ -390,29 +476,20 @@ v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
        if (BITSET_TEST(regs, r5))
                return r5;

-        /* Choose an accumulator if possible (I think it's lower power than
-         * phys regs), but round-robin through them to give post-RA
-         * instruction selection more options.
+        unsigned int reg;
+        if (v3d_ra_favor_accum(v3d_ra, regs, v3d_ra->map[n].priority) &&
+            v3d_ra_select_accum(v3d_ra, regs, &reg)) {
+                return reg;
+        }
+
+        if (v3d_ra_select_rf(v3d_ra, regs, &reg))
+                return reg;
+
+        /* If we ran out of physical registers try to assign an accumulator
+         * if we didn't favor that option earlier.
         */
-        for (int i = 0; i < ACC_COUNT; i++) {
-                int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
-                int acc = ACC_INDEX + acc_off;
-
-                if (BITSET_TEST(regs, acc)) {
-                        v3d_ra->next_acc = acc_off + 1;
-                        return acc;
-                }
-        }
-
-        for (int i = 0; i < PHYS_COUNT; i++) {
-                int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
-                int phys = PHYS_INDEX + phys_off;
-
-                if (BITSET_TEST(regs, phys)) {
-                        v3d_ra->next_phys = phys_off + 1;
-                        return phys;
-                }
-        }
+        if (v3d_ra_select_accum(v3d_ra, regs, &reg))
+                return reg;

        unreachable("RA must pass us at least one possible reg.");
 }
@ -472,11 +549,6 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
        return true;
 }

-struct node_to_temp_map {
-        uint32_t temp;
-        uint32_t priority;
-};
-
 static int
 node_to_temp_priority(const void *in_a, const void *in_b)
 {
@ -542,6 +614,7 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
                 * RF0-2.
                 */
                .next_phys = 3,
+                .map = map,
        };

        *spilled = false;