pan/bi: Rework varying linking on Valhall

Valhall introduces hardware-allocated varyings. Instead of allocating varying
descriptors on the CPU with a slot based interface, the driver just tells the
hardware how many bytes to allocate per vertex and loads/stores with byte
offsets. This is much nicer!
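
A sketch of the model (illustrative pseudocode, not the actual descriptor
packing; load_varying is a made-up helper):

   /* Driver side: report bytes per vertex, one highp vec4 per slot */
   cfg.vertex_attribute_stride = num_slots * 16;

   /* Shader side: load/store varyings at plain byte offsets */
   value = load_varying(varying_buffer, slot_index * 16);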

However, this requires us to rework our linking code to account for separable
shaders. With separable shaders, we can't rely on driver_location matching
between stages, and unlike on Midgard, we can't resolve the differences with
curated command stream descriptors. However, we *can* rely on slots matching. So
we should "just" determine the byte offsets based on the slot, and then
separable shaders work.
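
A minimal sketch of that idea, assuming the worst-case 16 bytes (one highp
vec4) per slot used throughout this patch; the helper name is illustrative:

   /* Both stages derive the same byte offset from the gl_varying_slot,
    * with no cross-stage descriptor fixup required. */
   static unsigned
   varying_offset_bytes(gl_varying_slot loc)
   {
           return 16 * (loc - VARYING_SLOT_VAR0);
   }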

For GLES, it really is that easy.

For desktop GL, it's not -- desktop GL brings unpredictable extra varyings like
COL1 and TEX2. Allocating space for all of these unconditionally would hamper
performance. To cope, we key fragment shaders to the set of non-GLES varyings
written by the linked vertex shader. Then we may define an efficient ABI, where
apps pay only for what they use.
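
As a worked example with an illustrative mask: if the linked vertex shader
additionally writes COL0 and TEX0, the key has two bits set and the layout
derived by bi_varying_base_bytes below becomes:

   byte  0: COL0   /* special slots first, packed in bit order */
   byte 16: TEX0
   byte 32: VAR0   /* general: 16 * (2 special + 0) */
   byte 48: VAR1   /* general: 16 * (2 special + 1) */

A pure GLES workload has an empty mask and pays nothing extra.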

Fixes various tests in dEQP-GLES31.functional.separate_shader.random.* on
Valhall.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16310>
Alyssa Rosenzweig, 2022-05-03 12:31:01 -04:00, committed by Marge Bot
commit 0fcddd4d2c (parent 635d8d6bd7)
7 changed files with 123 additions and 49 deletions


@@ -69,6 +69,7 @@ panfrost_shader_compile(struct pipe_screen *pscreen,
         struct panfrost_compile_inputs inputs = {
                 .gpu_id = dev->gpu_id,
                 .shaderdb = !!(dev->debug & PAN_DBG_PRECOMPILE),
+                .fixed_varying_mask = state->key.fixed_varying_mask
         };
 
         memcpy(inputs.rt_formats, state->key.fs.rt_formats, sizeof(inputs.rt_formats));


@@ -3360,9 +3360,15 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch,
         pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
                 if (secondary_shader) {
+                        unsigned v = vs->info.varyings.output_count;
+                        unsigned f = fs->info.varyings.input_count;
+                        unsigned slots = MAX2(v, f);
+                        slots += util_bitcount(fs->key.fixed_varying_mask);
+                        unsigned size = slots * 16;
+
                         /* Assumes 16 byte slots. We could do better. */
-                        cfg.vertex_packet_stride = vs->info.varyings.output_count * 16;
-                        cfg.vertex_attribute_stride = fs->info.varyings.input_count * 16;
+                        cfg.vertex_packet_stride = size + 16;
+                        cfg.vertex_attribute_stride = size;
                 } else {
                         /* Hardware requirement for "no varyings" */
                         cfg.vertex_packet_stride = 16;

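To make the stride arithmetic above concrete, with made-up counts: a vertex
shader writing 2 generic varyings, a fragment shader reading 3, and one
special varying in the key give

   unsigned slots = MAX2(2, 3) + 1;   /* = 4 slots */
   unsigned size  = slots * 16;       /* = 64 bytes per vertex */
   /* vertex_packet_stride = 80, vertex_attribute_stride = 64 */

where the packet stride carries the extra 16 bytes required above.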

@@ -307,6 +307,13 @@ panfrost_create_shader_state(
         else
                 so->nir = cso->ir.nir;
 
+        /* Fix linkage early */
+        if (so->nir->info.stage == MESA_SHADER_VERTEX) {
+                so->fixed_varying_mask =
+                        (so->nir->info.outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0)) &
+                        ~VARYING_BIT_POS & ~VARYING_BIT_PSIZ;
+        }
+
         /* Precompile for shader-db if we need to */
         if (unlikely(dev->debug & PAN_DBG_PRECOMPILE)) {
                 struct panfrost_context *ctx = pan_context(pctx);
@@ -372,6 +379,7 @@ panfrost_build_key(struct panfrost_context *ctx,
         struct panfrost_device *dev = pan_device(ctx->base.screen);
         struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
         struct pipe_rasterizer_state *rast = (void *) ctx->rasterizer;
+        struct panfrost_shader_variants *vs = ctx->shader[MESA_SHADER_VERTEX];
 
         key->fs.nr_cbufs = fb->nr_cbufs;
@@ -398,6 +406,12 @@
                         key->fs.rt_formats[i] = fmt;
                 }
         }
+
+        /* Funny desktop GL varying lowering on Valhall */
+        if (dev->arch >= 9) {
+                assert(vs != NULL && "too early");
+                key->fixed_varying_mask = vs->fixed_varying_mask;
+        }
 }
 
 /**
@@ -508,13 +522,20 @@ panfrost_update_shader_variant(struct panfrost_context *ctx,
         if (type == PIPE_SHADER_COMPUTE)
                 return;
 
+        /* We need linking information, defer this */
+        if (type == PIPE_SHADER_FRAGMENT && !ctx->shader[PIPE_SHADER_VERTEX])
+                return;
+
         /* Match the appropriate variant */
         signed variant = -1;
         struct panfrost_shader_variants *variants = ctx->shader[type];
 
         simple_mtx_lock(&variants->lock);
 
-        struct panfrost_shader_key key = { 0 };
+        struct panfrost_shader_key key = {
+                .fixed_varying_mask = variants->fixed_varying_mask
+        };
+
         panfrost_build_key(ctx, &key, variants->nir);
 
         for (unsigned i = 0; i < variants->variant_count; ++i) {
@@ -539,6 +560,10 @@ static void
 panfrost_bind_vs_state(struct pipe_context *pctx, void *hwcso)
 {
         panfrost_bind_shader_state(pctx, hwcso, PIPE_SHADER_VERTEX);
+
+        /* Fragment shaders are linked with vertex shaders */
+        struct panfrost_context *ctx = pan_context(pctx);
+        panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT);
 }
 
 static void

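As an illustrative example of the mask computed in
panfrost_create_shader_state: a vertex shader writing POS, PSIZ, COL0, TEX1
and VAR0 yields

   (outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0))
           & ~VARYING_BIT_POS & ~VARYING_BIT_PSIZ
   == VARYING_BIT_COL0 | VARYING_BIT_TEX1

so only the two legacy varyings enter the fragment shader key.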

@@ -272,6 +272,9 @@ struct panfrost_fs_key {
 };
 
 struct panfrost_shader_key {
+        /* Valhall needs special handling for desktop GL varyings */
+        uint32_t fixed_varying_mask;
+
         /* If we need vertex shader keys, union it in */
         struct panfrost_fs_key fs;
 };
@@ -315,6 +318,12 @@ struct panfrost_shader_variants {
         unsigned variant_count;
 
+        /* On vertex shaders, bit mask of special desktop-only varyings to link
+         * with the fragment shader. Used on Valhall to implement separable
+         * shaders for desktop GL.
+         */
+        uint32_t fixed_varying_mask;
+
         /* The current active variant */
         unsigned active_variant;
 };


@@ -284,6 +284,41 @@ bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
         bi_copy_component(b, instr, dest);
 }
 
+/*
+ * ABI: Special (desktop GL) slots come first, tightly packed. General varyings
+ * come later, sparsely packed. This handles both linked and separable shaders
+ * with a common code path, with minimal keying only for desktop GL. Each slot
+ * consumes 16 bytes (TODO: fp16, partial vectors).
+ */
+static unsigned
+bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
+{
+        nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+        uint32_t mask = ctx->inputs->fixed_varying_mask;
+
+        if (sem.location >= VARYING_SLOT_VAR0) {
+                unsigned nr_special = util_bitcount(mask);
+                unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
+
+                return 16 * (nr_special + general_index);
+        } else {
+                return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
+        }
+}
+
+/*
+ * Compute the offset in bytes of a varying with an immediate offset, adding the
+ * offset to the base computed above. Convenience method.
+ */
+static unsigned
+bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
+{
+        nir_src *src = nir_get_io_offset_src(intr);
+        assert(nir_src_is_const(*src) && "assumes immediate offset");
+
+        return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
+}
+
 static void
 bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
 {
@@ -328,7 +363,8 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
         if (b->shader->malloc_idvs && immediate) {
                 /* Immediate index given in bytes. */
                 bi_ld_var_buf_imm_f32_to(b, dest, src0, regfmt, sample, update,
-                                         vecsize, imm_index * 16);
+                                         vecsize,
+                                         bi_varying_offset(b->shader, instr));
         } else if (immediate && smooth) {
                 I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
                                      vecsize, imm_index);
@@ -339,24 +375,31 @@
                 bi_index idx = bi_src_index(offset);
                 unsigned base = nir_intrinsic_base(instr);
 
-                if (base != 0)
-                        idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
-
                 if (b->shader->malloc_idvs) {
                         /* Index needs to be in bytes, but NIR gives the index
-                         * in slots. For now assume 16 bytes per slots.
-                         *
-                         * TODO: more complex linking?
+                         * in slots. For now assume 16 bytes per element.
                          */
-                        idx = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
-                        bi_ld_var_buf_f32_to(b, dest, src0, idx, regfmt, sample,
-                                             update, vecsize);
+                        bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
+                        unsigned vbase = bi_varying_base_bytes(b->shader, instr);
+
+                        if (vbase != 0)
+                                idx_bytes = bi_iadd_u32(b, idx_bytes, bi_imm_u32(vbase), false);
+
+                        bi_ld_var_buf_f32_to(b, dest, src0, idx_bytes, regfmt,
+                                             sample, update, vecsize);
                 } else if (smooth) {
+                        if (base != 0)
+                                idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
+
                         I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
                                          update, vecsize);
                 } else {
-                        I = bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE,
-                                              regfmt, vecsize);
+                        if (base != 0)
+                                idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
+
+                        I = bi_ld_var_flat_to(b, dest, idx,
+                                              BI_FUNCTION_NONE, regfmt,
+                                              vecsize);
                 }
         }
@@ -794,39 +837,6 @@ bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
         return false;
 }
 
-/**
- * Computes the offset in bytes of a varying. This assumes VARYING_SLOT_POS is
- * mapped to location=0 and always present. This also assumes each slot
- * consumes 16 bytes, which is a worst-case (highp vec4). In the future, this
- * should be optimized to support fp16 and partial vectors. There are
- * nontrivial interactions with separable shaders, however.
- */
-static unsigned
-bi_varying_offset(nir_shader *nir, nir_intrinsic_instr *intr)
-{
-        nir_src *offset = nir_get_io_offset_src(intr);
-        assert(nir_src_is_const(*offset) && "no indirect varyings on Valhall");
-
-        unsigned loc = 0;
-        unsigned slot = nir_intrinsic_base(intr) + nir_src_as_uint(*offset);
-
-        nir_foreach_shader_out_variable(var, nir) {
-                if ((var->data.location == VARYING_SLOT_POS) ||
-                    (var->data.location == VARYING_SLOT_PSIZ))
-                        continue;
-
-                if (var->data.driver_location > slot)
-                        continue;
-
-                if (var->data.driver_location == slot)
-                        return loc;
-
-                loc += 16; // todo size
-        }
-
-        unreachable("Unlinked variable");
-}
-
 static void
 bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
 {
@@ -880,7 +890,7 @@
                                     bi_src_index(&instr->src[0]),
                                     address, bi_word(address, 1),
                                     varying ? BI_SEG_VARY : BI_SEG_POS,
-                                    varying ? bi_varying_offset(b->shader->nir, instr) : 0);
+                                    varying ? bi_varying_offset(b->shader, instr) : 0);
         } else if (immediate) {
                 bi_index address = bi_lea_attr_imm(b,
                                                    bi_vertex_id(b), bi_instance_id(b),

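To illustrate the reworked indirect path with made-up values: a dynamic index
of 3 slots, loading general varyings with two special slots keyed in, gives

   /* slots -> bytes, then skip the packed special slots */
   idx_bytes = 3 << 4;            /* 48 */
   vbase     = 16 * 2;            /* 32, from bi_varying_base_bytes */
   address   = idx_bytes + vbase; /* 80 bytes into the varying buffer */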

@@ -42,6 +42,7 @@ GENX(pan_shader_get_compiler_options)(void)
 #endif
 }
 
+#if PAN_ARCH <= 7
 static enum pipe_format
 varying_format(nir_alu_type t, unsigned ncomps)
 {
@@ -157,6 +158,7 @@ collect_varyings(nir_shader *s, nir_variable_mode varying_mode,
                 *varying_count = MAX2(*varying_count, loc + sz);
         }
 }
+#endif
 
 #if PAN_ARCH >= 6
 static enum mali_register_file_format
@@ -230,8 +232,14 @@ GENX(pan_shader_compile)(nir_shader *s,
                 info->vs.writes_point_size =
                         s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
 
+#if PAN_ARCH >= 9
+                info->varyings.output_count =
+                        util_last_bit(s->info.outputs_written >> VARYING_SLOT_VAR0);
+#else
                 collect_varyings(s, nir_var_shader_out, info->varyings.output,
                                  &info->varyings.output_count);
+#endif
                 break;
         case MESA_SHADER_FRAGMENT:
                 if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
@@ -286,8 +294,13 @@
                 info->fs.reads_face =
                         (s->info.inputs_read & (1 << VARYING_SLOT_FACE)) ||
                         BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRONT_FACE);
+
+#if PAN_ARCH >= 9
+                info->varyings.input_count =
+                        util_last_bit(s->info.inputs_read >> VARYING_SLOT_VAR0);
+#else
                 collect_varyings(s, nir_var_shader_in, info->varyings.input,
                                  &info->varyings.input_count);
+#endif
                 break;
         case MESA_SHADER_COMPUTE:
                 info->wls_size = s->info.shared_size;

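The util_last_bit counting above is sparse by design. With an illustrative
mask, a fragment shader reading only VAR0 and VAR2 gives

   util_last_bit(inputs_read >> VARYING_SLOT_VAR0)   /* 0b101 -> 3 */

so three 16-byte slots are reserved and the unread VAR1 slot is simply left
as a hole, matching the ABI comment in the compiler.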

@@ -190,6 +190,16 @@ struct panfrost_compile_inputs {
         uint8_t raw_fmt_mask;
         unsigned nr_cbufs;
 
+        /* Used on Valhall.
+         *
+         * Bit mask of special desktop-only varyings (e.g. VARYING_SLOT_TEX0)
+         * written by the previous stage (for a fragment shader) or by this
+         * stage (for a vertex shader). Bits are slots from gl_varying_slot.
+         *
+         * For modern APIs (GLES or VK), this should be 0.
+         */
+        uint32_t fixed_varying_mask;
+
         union {
                 struct {
                         bool static_rt_conv;