broadcom/compiler: support pipelining of tex instructions

This follows the same idea as for the general TMU instructions: reuse the
existing infrastructure to first count the required register writes and
flush any outstanding TMU dependencies, and only then emit the actual
writes. This requires splitting the code that decides which register
writes to emit into a helper.

We also need to start using a component mask instead of the number
of components that we need to read with a particular TMU operation.
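
The mask matters because NIR can tell us that only some channels of the
texture result are actually read, which a plain component count cannot
express. A minimal sketch of the idea (not the exact driver code, assuming
the usual compiler context variables c, instr and dest):

    /* The mask of channels to read back can be sparse, e.g. 0x9 when only
     * .x and .w of the result are used. */
    uint32_t component_mask = nir_ssa_def_components_read(&instr->dest.ssa);

    /* When flushing, drain one LDTMU per set bit and store it to the
     * matching destination channel. */
    for (int i = 0; i < 4; i++) {
            if (component_mask & (1 << i))
                    ntq_store_dest(c, dest, i, vir_MOV(c, vir_LDTMU(c)));
    }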

v2: update tmu_writes for V3D_QPU_WADDR_TMUOFF

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8825>
Authored by Iago Toral Quiroga on 2021-01-27 09:45:52 +01:00, committed by Marge Bot
parent 197090a3fc
commit be45960d3e
4 changed files with 242 additions and 147 deletions


@@ -206,8 +206,10 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr)
* Checks if pipelining a new TMU operation requiring 'components' LDTMUs and
* 'writes' TMU register writes would overflow any of the TMU fifos.
*/
static bool
tmu_fifo_overflow(struct v3d_compile *c, uint32_t components, uint32_t writes)
bool
ntq_tmu_fifo_overflow(struct v3d_compile *c,
uint32_t components,
uint32_t writes)
{
if (c->tmu.input_fifo_size + writes > 16 / c->threads)
return true;
@@ -236,13 +238,15 @@ ntq_flush_tmu(struct v3d_compile *c)
bool emitted_tmuwt = false;
for (int i = 0; i < c->tmu.flush_count; i++) {
if (c->tmu.flush[i].num_components > 0) {
if (c->tmu.flush[i].component_mask > 0) {
nir_dest *dest = c->tmu.flush[i].dest;
assert(dest);
for (int j = 0; j < c->tmu.flush[i].num_components; j++) {
ntq_store_dest(c, dest, j,
vir_MOV(c, vir_LDTMU(c)));
for (int j = 0; j < 4; j++) {
if (c->tmu.flush[i].component_mask & (1 << j)) {
ntq_store_dest(c, dest, j,
vir_MOV(c, vir_LDTMU(c)));
}
}
} else if (!emitted_tmuwt) {
vir_TMUWT(c);
@@ -262,13 +266,14 @@ ntq_flush_tmu(struct v3d_compile *c)
* is responsible for ensuring that doing this doesn't overflow the TMU fifos,
* and more specifically, the output fifo, since that can't stall.
*/
static void
void
ntq_add_pending_tmu_flush(struct v3d_compile *c,
nir_dest *dest,
uint32_t num_components,
uint32_t component_mask,
uint32_t tmu_writes)
{
assert(!tmu_fifo_overflow(c, num_components, tmu_writes));
const uint32_t num_components = util_bitcount(component_mask);
assert(!ntq_tmu_fifo_overflow(c, num_components, tmu_writes));
c->tmu.input_fifo_size += tmu_writes;
if (num_components > 0) {
@@ -279,7 +284,7 @@ ntq_add_pending_tmu_flush(struct v3d_compile *c,
}
c->tmu.flush[c->tmu.flush_count].dest = dest;
c->tmu.flush[c->tmu.flush_count].num_components = num_components;
c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
c->tmu.flush_count++;
}
@@ -615,15 +620,17 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
/* If pipelining this TMU operation would
* overflow TMU fifos, we need to flush.
*/
if (tmu_fifo_overflow(c, dest_components, tmu_writes))
if (ntq_tmu_fifo_overflow(c, dest_components, tmu_writes))
ntq_flush_tmu(c);
} else {
/* Delay emission of the thread switch and
* LDTMU/TMUWT until we really need to do it to
* improve pipelining.
*/
const uint32_t component_mask =
(1 << dest_components) - 1;
ntq_add_pending_tmu_flush(c, &instr->dest,
dest_components,
component_mask,
tmu_writes);
}
}
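
For reference, the budgeting both paths rely on comes down to the 16-entry
input FIFO being shared by all threads. A condensed, caller-side sketch of
the decision (names follow the diff; the thread counts are just an example):

    /* With c->threads == 4 each thread owns 16 / 4 = 4 input-FIFO slots,
     * so a TMU operation needing 5 register writes first drops to 2
     * threads (8 slots per thread). */
    while (tmu_writes > 16 / c->threads)
            c->threads /= 2;

    /* If pipelining the new operation on top of the pending ones would
     * still overflow the FIFOs, flush the pending work first. */
    if (ntq_tmu_fifo_overflow(c, dest_components, tmu_writes))
            ntq_flush_tmu(c);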


@@ -33,7 +33,11 @@
void
v3d33_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
{
/* FIXME: allow tex pipelining */
/* FIXME: We don't bother implementing pipelining for texture reads
* for any pre-4.x hardware. It should be straightforward to do, but
* we are not really testing or even targeting this hardware at
* present.
*/
ntq_flush_tmu(c);
unsigned unit = instr->texture_index;


@@ -39,7 +39,8 @@ vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val,
*/
vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
(*tmu_writes)++;
if (tmu_writes)
(*tmu_writes)++;
}
static void
@@ -58,125 +59,185 @@ static const struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = {
.op = V3D_TMU_OP_REGULAR,
};
/**
* If 'tmu_writes' is not NULL, then it just counts required register writes,
* otherwise, it emits the actual register writes.
*
* Note that emitting the register writes for the current TMU operation may
* itself trigger a TMU flush, since any of the inputs required for those
* writes could be the result of a pending TMU operation. We must make sure
* that such a flush can't happen in the middle of the register writes for
* the current TMU operation, which is why we always call ntq_get_src() even
* when we are only counting register writes.
*/
static void
handle_tex_src(struct v3d_compile *c,
nir_tex_instr *instr,
unsigned src_idx,
unsigned non_array_components,
struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
struct qreg *s_out,
unsigned *tmu_writes)
{
/* Either we are calling this just to count required TMU writes, or we
* are calling this to emit the actual TMU writes.
*/
assert(tmu_writes || (s_out && p2_unpacked));
struct qreg s;
switch (instr->src[src_idx].src_type) {
case nir_tex_src_coord:
/* S triggers the lookup, so save it for the end. */
s = ntq_get_src(c, instr->src[src_idx].src, 0);
if (tmu_writes)
(*tmu_writes)++;
else
*s_out = s;
if (non_array_components > 1) {
struct qreg src =
ntq_get_src(c, instr->src[src_idx].src, 1);
if (tmu_writes)
(*tmu_writes)++;
else
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUT, src, NULL);
}
if (non_array_components > 2) {
struct qreg src =
ntq_get_src(c, instr->src[src_idx].src, 2);
if (tmu_writes)
(*tmu_writes)++;
else
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUR, src, NULL);
}
if (instr->is_array) {
struct qreg src =
ntq_get_src(c, instr->src[src_idx].src,
instr->coord_components - 1);
if (tmu_writes)
(*tmu_writes)++;
else
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUI, src, NULL);
}
break;
case nir_tex_src_bias: {
struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
if (tmu_writes)
(*tmu_writes)++;
else
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB, src, NULL);
break;
}
case nir_tex_src_lod: {
struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
if (tmu_writes) {
(*tmu_writes)++;
} else {
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB, src, NULL);
/* With texel fetch automatic LOD is already disabled,
* and disable_autolod must not be enabled. For
* non-cubes we can use the register TMUSLOD, that
* implicitly sets disable_autolod.
*/
if (instr->op != nir_texop_txf &&
instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
p2_unpacked->disable_autolod = true;
}
}
break;
}
case nir_tex_src_comparator: {
struct qreg src = ntq_get_src(c, instr->src[src_idx].src, 0);
if (tmu_writes)
(*tmu_writes)++;
else
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUDREF, src, NULL);
break;
}
case nir_tex_src_offset: {
bool is_const_offset = nir_src_is_const(instr->src[src_idx].src);
if (is_const_offset) {
if (!tmu_writes) {
p2_unpacked->offset_s =
nir_src_comp_as_int(instr->src[src_idx].src, 0);
if (non_array_components >= 2)
p2_unpacked->offset_t =
nir_src_comp_as_int(instr->src[src_idx].src, 1);
if (non_array_components >= 3)
p2_unpacked->offset_r =
nir_src_comp_as_int(instr->src[src_idx].src, 2);
}
} else {
struct qreg src_0 =
ntq_get_src(c, instr->src[src_idx].src, 0);
struct qreg src_1 =
ntq_get_src(c, instr->src[src_idx].src, 1);
if (!tmu_writes) {
struct qreg mask = vir_uniform_ui(c, 0xf);
struct qreg x, y, offset;
x = vir_AND(c, src_0, mask);
y = vir_AND(c, src_1, mask);
offset = vir_OR(c, x,
vir_SHL(c, y, vir_uniform_ui(c, 4)));
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUOFF, offset, NULL);
} else {
(*tmu_writes)++;
}
}
break;
}
default:
unreachable("unknown texture source");
}
}
static void
vir_tex_handle_srcs(struct v3d_compile *c,
nir_tex_instr *instr,
struct V3D41_TMU_CONFIG_PARAMETER_2 *p2_unpacked,
struct qreg *s,
unsigned *tmu_writes)
{
unsigned non_array_components = instr->op != nir_texop_lod ?
instr->coord_components - instr->is_array :
instr->coord_components;
for (unsigned i = 0; i < instr->num_srcs; i++) {
handle_tex_src(c, instr, i, non_array_components,
p2_unpacked, s, tmu_writes);
}
}
static unsigned
get_required_tmu_writes(struct v3d_compile *c, nir_tex_instr *instr)
{
unsigned tmu_writes = 0;
vir_tex_handle_srcs(c, instr, NULL, NULL, &tmu_writes);
return tmu_writes;
}
void
v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
{
/* FIXME: allow tex pipelining */
ntq_flush_tmu(c);
assert(instr->op != nir_texop_lod || c->devinfo->ver >= 42);
unsigned texture_idx = instr->texture_index;
unsigned sampler_idx = instr->sampler_index;
int tmu_writes = 0;
struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
};
assert(instr->op != nir_texop_lod || c->devinfo->ver >= 42);
struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = {
.op = V3D_TMU_OP_REGULAR,
.gather_mode = instr->op == nir_texop_tg4,
.gather_component = instr->component,
.coefficient_mode = instr->op == nir_texop_txd,
.disable_autolod = instr->op == nir_texop_tg4
};
int non_array_components =
instr->op != nir_texop_lod ?
instr->coord_components - instr->is_array :
instr->coord_components;
struct qreg s;
for (unsigned i = 0; i < instr->num_srcs; i++) {
switch (instr->src[i].src_type) {
case nir_tex_src_coord:
/* S triggers the lookup, so save it for the end. */
s = ntq_get_src(c, instr->src[i].src, 0);
if (non_array_components > 1) {
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUT,
ntq_get_src(c, instr->src[i].src,
1), &tmu_writes);
}
if (non_array_components > 2) {
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUR,
ntq_get_src(c, instr->src[i].src,
2), &tmu_writes);
}
if (instr->is_array) {
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUI,
ntq_get_src(c, instr->src[i].src,
instr->coord_components - 1),
&tmu_writes);
}
break;
case nir_tex_src_bias:
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB,
ntq_get_src(c, instr->src[i].src, 0),
&tmu_writes);
break;
case nir_tex_src_lod:
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB,
ntq_get_src(c, instr->src[i].src, 0),
&tmu_writes);
/* With texel fetch automatic LOD is already disabled,
* and disable_autolod must not be enabled. For
* non-cubes we can use the register TMUSLOD, that
* implicitly sets disable_autolod.
*/
if (instr->op != nir_texop_txf &&
instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
p2_unpacked.disable_autolod = true;
}
break;
case nir_tex_src_comparator:
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUDREF,
ntq_get_src(c, instr->src[i].src, 0),
&tmu_writes);
break;
case nir_tex_src_offset: {
if (nir_src_is_const(instr->src[i].src)) {
p2_unpacked.offset_s = nir_src_comp_as_int(instr->src[i].src, 0);
if (non_array_components >= 2)
p2_unpacked.offset_t =
nir_src_comp_as_int(instr->src[i].src, 1);
if (non_array_components >= 3)
p2_unpacked.offset_r =
nir_src_comp_as_int(instr->src[i].src, 2);
} else {
struct qreg mask = vir_uniform_ui(c, 0xf);
struct qreg x, y, offset;
x = vir_AND(c, ntq_get_src(c, instr->src[i].src,
0), mask);
y = vir_AND(c, ntq_get_src(c, instr->src[i].src,
1), mask);
offset = vir_OR(c, x,
vir_SHL(c, y,
vir_uniform_ui(c, 4)));
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUOFF,
offset, &tmu_writes);
}
break;
}
default:
unreachable("unknown texture source");
}
}
/* Limit the number of channels returned to both how many the NIR
* instruction writes and how many the instruction could produce.
*/
@@ -184,9 +245,36 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
instr->dest.is_ssa ?
nir_ssa_def_components_read(&instr->dest.ssa) :
(1 << instr->dest.reg.reg->num_components) - 1;
assert(p0_unpacked.return_words_of_texture_data != 0);
struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked = {
.op = V3D_TMU_OP_REGULAR,
.gather_mode = instr->op == nir_texop_tg4,
.gather_component = instr->component,
.coefficient_mode = instr->op == nir_texop_txd,
.disable_autolod = instr->op == nir_texop_tg4
};
const unsigned tmu_writes = get_required_tmu_writes(c, instr);
/* The input FIFO has 16 slots across all threads, so if we require
* more than our per-thread share we need to lower the thread count.
*/
while (tmu_writes > 16 / c->threads)
c->threads /= 2;
/* If pipelining this TMU operation would overflow TMU fifos, we need
* to flush any outstanding TMU operations.
*/
const unsigned dest_components =
util_bitcount(p0_unpacked.return_words_of_texture_data);
if (ntq_tmu_fifo_overflow(c, dest_components, tmu_writes))
ntq_flush_tmu(c);
/* Process tex sources emitting corresponding TMU writes */
struct qreg s = { };
vir_tex_handle_srcs(c, instr, &p2_unpacked, &s, NULL);
uint32_t p0_packed;
V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL,
(uint8_t *)&p0_packed,
@@ -216,15 +304,15 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
* itself, we still need to add the sampler configuration
* parameter if the output is 32 bit
*/
bool output_type_32_bit = (c->key->sampler[sampler_idx].return_size == 32 &&
!instr->is_shadow);
bool output_type_32_bit =
c->key->sampler[sampler_idx].return_size == 32 &&
!instr->is_shadow;
/*
* p1 is optional, but we can skip it only if p2 can be skipped too
*/
/* p1 is optional, but we can skip it only if p2 can be skipped too */
bool needs_p2_config =
(instr->op == nir_texop_lod ||
memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)) != 0);
memcmp(&p2_unpacked, &p2_unpacked_default,
sizeof(p2_unpacked)) != 0);
/* To handle the cases where we can't just use p1_unpacked_default */
bool non_default_p1_config = nir_tex_instr_need_sampler(instr) ||
@@ -285,29 +373,21 @@ v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
if (needs_p2_config)
vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
/* Emit retiring TMU write */
if (instr->op == nir_texop_txf) {
assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s, &tmu_writes);
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s, NULL);
} else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s, &tmu_writes);
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s, NULL);
} else if (instr->op == nir_texop_txl) {
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s, &tmu_writes);
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSLOD, s, NULL);
} else {
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s, &tmu_writes);
vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s, NULL);
}
vir_emit_thrsw(c);
/* The input FIFO has 16 slots across all threads, so make sure we
* don't overfill our allocation.
*/
while (tmu_writes > 16 / c->threads)
c->threads /= 2;
for (int i = 0; i < 4; i++) {
if (p0_unpacked.return_words_of_texture_data & (1 << i))
ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
}
ntq_add_pending_tmu_flush(c, &instr->dest,
p0_unpacked.return_words_of_texture_data,
tmu_writes);
}
static uint32_t

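Putting the tex changes together, the same source-handling code now runs
twice, roughly as follows (a simplified sketch of the flow in
v3d40_vir_emit_tex, not a verbatim copy of the function above):

    /* Pass 1: only count the TMU register writes; the packing structs and
     * the retiring coordinate are not needed yet. */
    unsigned tmu_writes = 0;
    vir_tex_handle_srcs(c, instr, NULL, NULL, &tmu_writes);

    /* ...lower c->threads and/or flush pending TMU work as needed... */

    /* Pass 2: emit the register writes for real, filling p2_unpacked and
     * capturing the retiring S coordinate for the final
     * TMUS/TMUSLOD/TMUSCM/TMUSF write. */
    struct qreg s = { };
    vir_tex_handle_srcs(c, instr, &p2_unpacked, &s, NULL);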

@@ -579,7 +579,8 @@ struct v3d_compile {
struct {
nir_dest *dest;
uint32_t num_components;
uint8_t num_components;
uint8_t component_mask;
} flush[8]; /* 16 entries / 2 threads for input/output fifos */
uint32_t flush_count;
} tmu;
@@ -936,6 +937,9 @@ uint8_t vir_channels_written(struct qinst *inst);
struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
void ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
struct qreg result);
bool ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components, uint32_t writes);
void ntq_add_pending_tmu_flush(struct v3d_compile *c, nir_dest *dest,
uint32_t component_mask, uint32_t tmu_writes);
void ntq_flush_tmu(struct v3d_compile *c);
void vir_emit_thrsw(struct v3d_compile *c);