broadcom/compiler: use nir_lower_wrmasks to simplify TMU general stores
This pass splits writemasks with non-consecutive bits into multiple store operations, ensuring that each store only has consecutive writemask bits set. We can use this to simplify writemask handling in our backend, removing a loop that existed solely to handle this case.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9619>
This commit is contained in:
parent 51a263530f
commit aefac60741
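For illustration, here is a minimal standalone sketch (not the pass's implementation) of the decomposition nir_lower_wrmasks performs: it walks the writemask as runs of consecutive set bits, and each run becomes its own store. A mask of 0b1011 yields one store for components 0..1 and another for component 3.

/* Illustrative sketch of the split nir_lower_wrmasks guarantees; not the
 * Mesa implementation. NIR writemasks cover at most 16 components, so the
 * shifts below cannot overflow.
 */
#include <stdint.h>
#include <stdio.h>

static void
split_writemask(uint32_t writemask)
{
        while (writemask) {
                int first = __builtin_ctz(writemask);  /* run starts at first set bit */
                int last = first;
                while (writemask & (1u << (last + 1))) /* extend while bits stay set */
                        last++;
                uint32_t run = ((1u << (last - first + 1)) - 1) << first;
                printf("store components %d..%d (mask 0x%x)\n",
                       first, last, (unsigned)run);
                writemask &= ~run;                     /* consume this run */
        }
}

int
main(void)
{
        split_writemask(0xb); /* 0b1011 -> components 0..1, then component 3 */
        return 0;
}

After the lowering, the backend below only ever sees masks made of a single consecutive run, so one pass over the writemask consumes it entirely.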
@@ -549,87 +549,88 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
         const uint32_t dest_components = nir_intrinsic_dest_components(instr);
         uint32_t base_const_offset = const_offset;
         uint32_t writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
-        do {
-                uint32_t tmu_writes = 0;
-                for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
-                        assert(mode == MODE_COUNT || tmu_writes > 0);
+        uint32_t tmu_writes = 0;
+        for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
+                assert(mode == MODE_COUNT || tmu_writes > 0);
 
-                        if (is_store) {
-                                emit_tmu_general_store_writes(c, mode, instr,
-                                                              base_const_offset,
-                                                              &writemask,
-                                                              &const_offset,
-                                                              &tmu_writes);
-                        } else if (!is_load && !atomic_add_replaced) {
-                                emit_tmu_general_atomic_writes(c, mode, instr,
-                                                               tmu_op,
-                                                               has_index,
-                                                               &tmu_writes);
-                        }
+                if (is_store) {
+                        emit_tmu_general_store_writes(c, mode, instr,
+                                                      base_const_offset,
+                                                      &writemask,
+                                                      &const_offset,
+                                                      &tmu_writes);
+                } else if (!is_load && !atomic_add_replaced) {
+                        emit_tmu_general_atomic_writes(c, mode, instr,
+                                                       tmu_op, has_index,
+                                                       &tmu_writes);
+                }
 
-                        /* The spec says that for atomics, the TYPE field is
-                         * ignored, but that doesn't seem to be the case for
-                         * CMPXCHG. Just use the number of tmud writes we did
-                         * to decide the type (or choose "32bit" for atomic
-                         * reads, which has been fine).
-                         */
-                        uint32_t config = 0;
-                        if (mode == MODE_EMIT) {
-                                uint32_t num_components;
-                                if (is_load || atomic_add_replaced)
-                                        num_components = instr->num_components;
-                                else {
-                                        assert(tmu_writes > 0);
-                                        num_components = tmu_writes - 1;
-                                }
+                /* The spec says that for atomics, the TYPE field is
+                 * ignored, but that doesn't seem to be the case for
+                 * CMPXCHG. Just use the number of tmud writes we did
+                 * to decide the type (or choose "32bit" for atomic
+                 * reads, which has been fine).
+                 */
+                uint32_t config = 0;
+                if (mode == MODE_EMIT) {
+                        uint32_t num_components;
+                        if (is_load || atomic_add_replaced) {
+                                num_components = instr->num_components;
+                        } else {
+                                assert(tmu_writes > 0);
+                                num_components = tmu_writes - 1;
+                        }
 
-                                uint32_t perquad =
-                                        is_load && !vir_in_nonuniform_control_flow(c)
-                                        ? GENERAL_TMU_LOOKUP_PER_QUAD
-                                        : GENERAL_TMU_LOOKUP_PER_PIXEL;
-                                config = 0xffffff00 | tmu_op << 3 | perquad;
+                        uint32_t perquad =
+                                is_load && !vir_in_nonuniform_control_flow(c)
+                                ? GENERAL_TMU_LOOKUP_PER_QUAD
+                                : GENERAL_TMU_LOOKUP_PER_PIXEL;
+                        config = 0xffffff00 | tmu_op << 3 | perquad;
 
-                                if (num_components == 1) {
-                                        config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
-                                } else {
-                                        config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
-                                                  num_components - 2;
-                                }
-                        }
+                        if (num_components == 1) {
+                                config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
+                        } else {
+                                config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
+                                          num_components - 2;
+                        }
+                }
 
-                        emit_tmu_general_address_write(c, mode, instr, config,
-                                                       dynamic_src,
-                                                       offset_src,
-                                                       base_offset,
-                                                       const_offset,
-                                                       &tmu_writes);
+                emit_tmu_general_address_write(c, mode, instr, config,
+                                               dynamic_src, offset_src,
+                                               base_offset, const_offset,
+                                               &tmu_writes);
 
-                        assert(tmu_writes > 0);
-                        if (mode == MODE_COUNT) {
-                                /* Make sure we won't exceed the 16-entry TMU
-                                 * fifo if each thread is storing at the same
-                                 * time.
-                                 */
-                                while (tmu_writes > 16 / c->threads)
-                                        c->threads /= 2;
+                assert(tmu_writes > 0);
+                if (mode == MODE_COUNT) {
+                        /* Make sure we won't exceed the 16-entry TMU
+                         * fifo if each thread is storing at the same
+                         * time.
+                         */
+                        while (tmu_writes > 16 / c->threads)
+                                c->threads /= 2;
 
-                                /* If pipelining this TMU operation would
-                                 * overflow TMU fifos, we need to flush.
-                                 */
-                                if (ntq_tmu_fifo_overflow(c, dest_components))
-                                        ntq_flush_tmu(c);
-                        } else {
-                                /* Delay emission of the thread switch and
-                                 * LDTMU/TMUWT until we really need to do it to
-                                 * improve pipelining.
-                                 */
-                                const uint32_t component_mask =
-                                        (1 << dest_components) - 1;
-                                ntq_add_pending_tmu_flush(c, &instr->dest,
-                                                          component_mask);
-                        }
-                }
-        } while (is_store && writemask != 0);
+                        /* If pipelining this TMU operation would
+                         * overflow TMU fifos, we need to flush.
+                         */
+                        if (ntq_tmu_fifo_overflow(c, dest_components))
+                                ntq_flush_tmu(c);
+                } else {
+                        /* Delay emission of the thread switch and
+                         * LDTMU/TMUWT until we really need to do it to
+                         * improve pipelining.
+                         */
+                        const uint32_t component_mask =
+                                (1 << dest_components) - 1;
+                        ntq_add_pending_tmu_flush(c, &instr->dest,
+                                                  component_mask);
+                }
+        }
+
+        /* nir_lower_wrmasks should've ensured that any writemask on a store
+         * operation only has consecutive bits set, in which case we should've
+         * processed the full writemask above.
+         */
+        assert(writemask == 0);
 }
 
 static struct qreg *
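As a worked example of the thread-count clamp kept in the hunk above: the 16-entry TMU fifo is shared by all threads, so at 4 threads each thread may queue at most 16 / 4 = 4 writes, and a 5-write operation forces the shader down to 2 threads (budget 8). A standalone sketch follows; clamp_threads is a hypothetical helper name, the real code operates on c->threads in place.

#include <assert.h>
#include <stdint.h>

/* Halve the thread count until each thread's share of the 16-entry TMU
 * fifo covers tmu_writes. Illustration only.
 */
static uint32_t
clamp_threads(uint32_t threads, uint32_t tmu_writes)
{
        assert(tmu_writes <= 16); /* assumed: one op never exceeds the fifo */
        while (tmu_writes > 16 / threads)
                threads /= 2;
        return threads;
}
/* clamp_threads(4, 5) == 2: five queued writes exceed the 4-entry
 * per-thread budget at 4 threads, so we drop to 2 threads (budget 8).
 */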
@@ -1072,6 +1072,21 @@ v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
         return false;
 }
 
+static bool
+should_split_wrmask(const nir_instr *instr, const void *data)
+{
+        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+        switch (intr->intrinsic) {
+        case nir_intrinsic_store_ssbo:
+        case nir_intrinsic_store_shared:
+        case nir_intrinsic_store_global:
+        case nir_intrinsic_store_scratch:
+                return true;
+        default:
+                return false;
+        }
+}
+
 static void
 v3d_attempt_compile(struct v3d_compile *c)
 {
@@ -1137,6 +1152,8 @@ v3d_attempt_compile(struct v3d_compile *c)
                 NIR_PASS_V(c->s, v3d_nir_lower_robust_buffer_access, c);
         }
 
+        NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
+
         v3d_optimize_nir(c->s);
 
         /* Do late algebraic optimization to turn add(a, neg(b)) back into
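To picture what the new pass invocation buys the backend, here is a hypothetical before/after for an SSBO store with a gap in its writemask; the NIR is rendered as rough pseudocode in comments, and the offset adjustment assumes 32-bit components.

/* Hypothetical example of the lowering (pseudocode, not exact NIR syntax):
 *
 *   before: store_ssbo %v (vec4), %block, %offset      wrmask = xz (0b0101)
 *   after:  store_ssbo %v.x, %block, %offset           wrmask = x  (0b0001)
 *           store_ssbo %v.z, %block, %offset + 8       wrmask = x  (0b0001)
 *
 * Each resulting store writes one consecutive run of components, which is
 * exactly what the new assert(writemask == 0) in ntq_emit_tmu_general
 * relies on.
 */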