From 0fcddd4d2c401a7678139456702cbe15288eebf1 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Tue, 3 May 2022 12:31:01 -0400
Subject: [PATCH] pan/bi: Rework varying linking on Valhall

Valhall introduces hardware-allocated varyings. Instead of allocating varying
descriptors on the CPU with a slot-based interface, the driver just tells the
hardware how many bytes to allocate per vertex and loads/stores with byte
offsets. This is much nicer!

However, this requires us to rework our linking code to account for separable
shaders. With separable shaders, we can't rely on driver_location matching
between stages, and unlike on Midgard, we can't resolve the differences with
curated command stream descriptors. However, we *can* rely on slots matching.
So we should "just" determine the byte offsets based on the slot, and then
separable shaders work.

For GLES, it really is that easy. For desktop GL, it's not -- desktop GL
brings unpredictable extra varyings like COL1 and TEX2. Allocating space for
all of these unconditionally would hamper performance. To cope, we key
fragment shaders to the set of non-GLES varyings written by the linked vertex
shader. Then we may define an efficient ABI, where apps only pay for what they
use.

Fixes various tests in dEQP-GLES31.functional.separate_shader.random.* on
Valhall.

Signed-off-by: Alyssa Rosenzweig
Part-of:
---
 src/gallium/drivers/panfrost/pan_assemble.c  |   1 +
 src/gallium/drivers/panfrost/pan_cmdstream.c |  10 +-
 src/gallium/drivers/panfrost/pan_context.c   |  27 ++++-
 src/gallium/drivers/panfrost/pan_context.h   |   9 ++
 src/panfrost/bifrost/bifrost_compile.c       | 102 ++++++++++---------
 src/panfrost/lib/pan_shader.c                |  13 +++
 src/panfrost/util/pan_ir.h                   |  10 ++
 7 files changed, 123 insertions(+), 49 deletions(-)

diff --git a/src/gallium/drivers/panfrost/pan_assemble.c b/src/gallium/drivers/panfrost/pan_assemble.c
index 43b28465193..6d55125c4ac 100644
--- a/src/gallium/drivers/panfrost/pan_assemble.c
+++ b/src/gallium/drivers/panfrost/pan_assemble.c
@@ -69,6 +69,7 @@ panfrost_shader_compile(struct pipe_screen *pscreen,
         struct panfrost_compile_inputs inputs = {
                 .gpu_id = dev->gpu_id,
                 .shaderdb = !!(dev->debug & PAN_DBG_PRECOMPILE),
+                .fixed_varying_mask = state->key.fixed_varying_mask
         };
 
         memcpy(inputs.rt_formats, state->key.fs.rt_formats, sizeof(inputs.rt_formats));
diff --git a/src/gallium/drivers/panfrost/pan_cmdstream.c b/src/gallium/drivers/panfrost/pan_cmdstream.c
index 358523c2189..1ef8d09bd2b 100644
--- a/src/gallium/drivers/panfrost/pan_cmdstream.c
+++ b/src/gallium/drivers/panfrost/pan_cmdstream.c
@@ -3360,9 +3360,15 @@ panfrost_emit_malloc_vertex(struct panfrost_batch *batch,
         pan_section_pack(job, MALLOC_VERTEX_JOB, ALLOCATION, cfg) {
                 if (secondary_shader) {
+                        unsigned v = vs->info.varyings.output_count;
+                        unsigned f = fs->info.varyings.input_count;
+                        unsigned slots = MAX2(v, f);
+                        slots += util_bitcount(fs->key.fixed_varying_mask);
+                        unsigned size = slots * 16;
+
+                        /* Assumes 16 byte slots. We could do better. */
-                        cfg.vertex_packet_stride = vs->info.varyings.output_count * 16;
-                        cfg.vertex_attribute_stride = fs->info.varyings.input_count * 16;
+                        cfg.vertex_packet_stride = size + 16;
+                        cfg.vertex_attribute_stride = size;
                 } else {
                         /* Hardware requirement for "no varyings" */
                         cfg.vertex_packet_stride = 16;
diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c
index b1044c0206d..e9567d9cbb3 100644
--- a/src/gallium/drivers/panfrost/pan_context.c
+++ b/src/gallium/drivers/panfrost/pan_context.c
@@ -307,6 +307,13 @@ panfrost_create_shader_state(
         else
                 so->nir = cso->ir.nir;
 
+        /* Fix linkage early */
+        if (so->nir->info.stage == MESA_SHADER_VERTEX) {
+                so->fixed_varying_mask =
+                        (so->nir->info.outputs_written & BITFIELD_MASK(VARYING_SLOT_VAR0)) &
+                        ~VARYING_BIT_POS & ~VARYING_BIT_PSIZ;
+        }
+
         /* Precompile for shader-db if we need to */
         if (unlikely(dev->debug & PAN_DBG_PRECOMPILE)) {
                 struct panfrost_context *ctx = pan_context(pctx);
@@ -372,6 +379,7 @@ panfrost_build_key(struct panfrost_context *ctx,
         struct panfrost_device *dev = pan_device(ctx->base.screen);
         struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
         struct pipe_rasterizer_state *rast = (void *) ctx->rasterizer;
+        struct panfrost_shader_variants *vs = ctx->shader[MESA_SHADER_VERTEX];
 
         key->fs.nr_cbufs = fb->nr_cbufs;
 
@@ -398,6 +406,12 @@ panfrost_build_key(struct panfrost_context *ctx,
                         key->fs.rt_formats[i] = fmt;
                 }
         }
+
+        /* Funny desktop GL varying lowering on Valhall */
+        if (dev->arch >= 9) {
+                assert(vs != NULL && "too early");
+                key->fixed_varying_mask = vs->fixed_varying_mask;
+        }
 }
 
 /**
@@ -508,13 +522,20 @@ panfrost_update_shader_variant(struct panfrost_context *ctx,
         if (type == PIPE_SHADER_COMPUTE)
                 return;
 
+        /* We need linking information, defer this */
+        if (type == PIPE_SHADER_FRAGMENT && !ctx->shader[PIPE_SHADER_VERTEX])
+                return;
+
         /* Match the appropriate variant */
         signed variant = -1;
         struct panfrost_shader_variants *variants = ctx->shader[type];
 
         simple_mtx_lock(&variants->lock);
 
-        struct panfrost_shader_key key = { 0 };
+        struct panfrost_shader_key key = {
+                .fixed_varying_mask = variants->fixed_varying_mask
+        };
+
         panfrost_build_key(ctx, &key, variants->nir);
 
         for (unsigned i = 0; i < variants->variant_count; ++i) {
@@ -539,6 +560,10 @@ static void
 panfrost_bind_vs_state(struct pipe_context *pctx, void *hwcso)
 {
         panfrost_bind_shader_state(pctx, hwcso, PIPE_SHADER_VERTEX);
+
+        /* Fragment shaders are linked with vertex shaders */
+        struct panfrost_context *ctx = pan_context(pctx);
+        panfrost_update_shader_variant(ctx, PIPE_SHADER_FRAGMENT);
 }
 
 static void
diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h
index 737ed78c802..6d482911e10 100644
--- a/src/gallium/drivers/panfrost/pan_context.h
+++ b/src/gallium/drivers/panfrost/pan_context.h
@@ -272,6 +272,9 @@ struct panfrost_fs_key {
 };
 
 struct panfrost_shader_key {
+        /* Valhall needs special handling for desktop GL varyings */
+        uint32_t fixed_varying_mask;
+
         /* If we need vertex shader keys, union it in */
         struct panfrost_fs_key fs;
 };
@@ -315,6 +318,12 @@ struct panfrost_shader_variants {
 
         unsigned variant_count;
 
+        /* On vertex shaders, bit mask of special desktop-only varyings to link
+         * with the fragment shader. Used on Valhall to implement separable
+         * shaders for desktop GL.
+         */
+        uint32_t fixed_varying_mask;
+
         /* The current active variant */
         unsigned active_variant;
 };
diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c
index 12c09a68282..d0d9d1dfc6f 100644
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -284,6 +284,41 @@ bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
         bi_copy_component(b, instr, dest);
 }
 
+/*
+ * ABI: Special (desktop GL) slots come first, tightly packed. General varyings
+ * come later, sparsely packed. This handles both linked and separable shaders
+ * with a common code path, with minimal keying only for desktop GL. Each slot
+ * consumes 16 bytes (TODO: fp16, partial vectors).
+ */
+static unsigned
+bi_varying_base_bytes(bi_context *ctx, nir_intrinsic_instr *intr)
+{
+        nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+        uint32_t mask = ctx->inputs->fixed_varying_mask;
+
+        if (sem.location >= VARYING_SLOT_VAR0) {
+                unsigned nr_special = util_bitcount(mask);
+                unsigned general_index = (sem.location - VARYING_SLOT_VAR0);
+
+                return 16 * (nr_special + general_index);
+        } else {
+                return 16 * (util_bitcount(mask & BITFIELD_MASK(sem.location)));
+        }
+}
+
+/*
+ * Compute the offset in bytes of a varying with an immediate offset, adding the
+ * offset to the base computed above. Convenience method.
+ */
+static unsigned
+bi_varying_offset(bi_context *ctx, nir_intrinsic_instr *intr)
+{
+        nir_src *src = nir_get_io_offset_src(intr);
+        assert(nir_src_is_const(*src) && "assumes immediate offset");
+
+        return bi_varying_base_bytes(ctx, intr) + (nir_src_as_uint(*src) * 16);
+}
+
 static void
 bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
 {
@@ -328,7 +363,8 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
         if (b->shader->malloc_idvs && immediate) {
                 /* Immediate index given in bytes. */
                 bi_ld_var_buf_imm_f32_to(b, dest, src0, regfmt, sample, update,
-                                         vecsize, imm_index * 16);
+                                         vecsize,
+                                         bi_varying_offset(b->shader, instr));
         } else if (immediate && smooth) {
                 I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
                                      vecsize, imm_index);
@@ -339,24 +375,31 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
                 bi_index idx = bi_src_index(offset);
                 unsigned base = nir_intrinsic_base(instr);
 
-                if (base != 0)
-                        idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
-
                 if (b->shader->malloc_idvs) {
                         /* Index needs to be in bytes, but NIR gives the index
-                         * in slots. For now assume 16 bytes per slots.
-                         *
-                         * TODO: more complex linking?
+                         * in slots. For now assume 16 bytes per element.
                          */
-                        idx = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
-
-                        bi_ld_var_buf_f32_to(b, dest, src0, idx, regfmt, sample,
-                                             update, vecsize);
+                        bi_index idx_bytes = bi_lshift_or_i32(b, idx, bi_zero(), bi_imm_u8(4));
+                        unsigned vbase = bi_varying_base_bytes(b->shader, instr);
+
+                        if (vbase != 0)
+                                idx_bytes = bi_iadd_u32(b, idx_bytes, bi_imm_u32(vbase), false);
+
+                        bi_ld_var_buf_f32_to(b, dest, src0, idx_bytes, regfmt,
+                                             sample, update, vecsize);
                 } else if (smooth) {
+                        if (base != 0)
+                                idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
+
                         I = bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
                                          update, vecsize);
                 } else {
+                        if (base != 0)
+                                idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
+
                         I = bi_ld_var_flat_to(b, dest, idx,
                                               BI_FUNCTION_NONE, regfmt,
                                               vecsize);
                 }
         }
@@ -794,39 +837,6 @@ bifrost_nir_specialize_idvs(nir_builder *b, nir_instr *instr, void *data)
         return false;
 }
 
-/**
- * Computes the offset in bytes of a varying. This assumes VARYING_SLOT_POS is
- * mapped to location=0 and always present. This also assumes each slot
- * consumes 16 bytes, which is a worst-case (highp vec4). In the future, this
- * should be optimized to support fp16 and partial vectors. There are
- * nontrivial interactions with separable shaders, however.
- */
-static unsigned
-bi_varying_offset(nir_shader *nir, nir_intrinsic_instr *intr)
-{
-        nir_src *offset = nir_get_io_offset_src(intr);
-        assert(nir_src_is_const(*offset) && "no indirect varyings on Valhall");
-
-        unsigned loc = 0;
-        unsigned slot = nir_intrinsic_base(intr) + nir_src_as_uint(*offset);
-
-        nir_foreach_shader_out_variable(var, nir) {
-                if ((var->data.location == VARYING_SLOT_POS) ||
-                    (var->data.location == VARYING_SLOT_PSIZ))
-                        continue;
-
-                if (var->data.driver_location > slot)
-                        continue;
-
-                if (var->data.driver_location == slot)
-                        return loc;
-
-                loc += 16; // todo size
-        }
-
-        unreachable("Unlinked variable");
-}
-
 static void
 bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
 {
@@ -880,7 +890,7 @@ bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
                                  bi_src_index(&instr->src[0]), address,
                                  bi_word(address, 1),
                                  varying ? BI_SEG_VARY : BI_SEG_POS,
-                                 varying ? bi_varying_offset(b->shader->nir, instr) : 0);
+                                 varying ? bi_varying_offset(b->shader, instr) : 0);
         } else if (immediate) {
                 bi_index address = bi_lea_attr_imm(b, bi_vertex_id(b),
                                                    bi_instance_id(b),
diff --git a/src/panfrost/lib/pan_shader.c b/src/panfrost/lib/pan_shader.c
index d16a9f67d73..59179f4888c 100644
--- a/src/panfrost/lib/pan_shader.c
+++ b/src/panfrost/lib/pan_shader.c
@@ -42,6 +42,7 @@ GENX(pan_shader_get_compiler_options)(void)
 #endif
 }
 
+#if PAN_ARCH <= 7
 static enum pipe_format
 varying_format(nir_alu_type t, unsigned ncomps)
 {
@@ -157,6 +158,7 @@ collect_varyings(nir_shader *s, nir_variable_mode varying_mode,
                 *varying_count = MAX2(*varying_count, loc + sz);
         }
 }
+#endif
 
 #if PAN_ARCH >= 6
 static enum mali_register_file_format
@@ -230,8 +232,14 @@ GENX(pan_shader_compile)(nir_shader *s,
 
                 info->vs.writes_point_size =
                         s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
+
+#if PAN_ARCH >= 9
+                info->varyings.output_count =
+                        util_last_bit(s->info.outputs_written >> VARYING_SLOT_VAR0);
+#else
                 collect_varyings(s, nir_var_shader_out, info->varyings.output,
                                  &info->varyings.output_count);
+#endif
                 break;
         case MESA_SHADER_FRAGMENT:
                 if (s->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
@@ -286,8 +294,13 @@ GENX(pan_shader_compile)(nir_shader *s,
                 info->fs.reads_face =
                         (s->info.inputs_read & (1 << VARYING_SLOT_FACE)) ||
                         BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRONT_FACE);
+#if PAN_ARCH >= 9
+                info->varyings.input_count =
+                        util_last_bit(s->info.inputs_read >> VARYING_SLOT_VAR0);
+#else
                 collect_varyings(s, nir_var_shader_in, info->varyings.input,
                                  &info->varyings.input_count);
+#endif
                 break;
         case MESA_SHADER_COMPUTE:
                 info->wls_size = s->info.shared_size;
diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h
index 7d397d3ca17..3819e814272 100644
--- a/src/panfrost/util/pan_ir.h
+++ b/src/panfrost/util/pan_ir.h
@@ -190,6 +190,16 @@ struct panfrost_compile_inputs {
         uint8_t raw_fmt_mask;
         unsigned nr_cbufs;
 
+        /* Used on Valhall.
+         *
+         * Bit mask of special desktop-only varyings (e.g. VARYING_SLOT_TEX0)
+         * written by the previous stage (fragment shader) or written by this
+         * stage (vertex shader). Bits are slots from gl_varying_slot.
+         *
+         * For modern APIs (GLES or VK), this should be 0.
+         */
+        uint32_t fixed_varying_mask;
+
         union {
                 struct {
                         bool static_rt_conv;
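
For illustration only (not part of the change): a standalone sketch of the
byte-offset ABI described above, mirroring bi_varying_base_bytes under the
16-bytes-per-slot assumption. The slot numbers and helper/test names below are
simplified stand-ins, not symbols or values taken from Mesa.

        #include <assert.h>
        #include <stdint.h>
        #include <stdio.h>

        /* Simplified stand-ins for gl_varying_slot values. */
        enum { SLOT_COL0 = 1, SLOT_TEX0 = 4, SLOT_VAR0 = 32 };

        /* Byte offset of a varying slot: special (pre-VAR0) slots named in
         * fixed_varying_mask are packed first, one 16-byte slot each, then
         * the general VARn slots follow at fixed 16-byte strides. */
        static unsigned
        varying_base_bytes(uint32_t fixed_varying_mask, unsigned location)
        {
                if (location >= SLOT_VAR0) {
                        unsigned nr_special = __builtin_popcount(fixed_varying_mask);
                        return 16 * (nr_special + (location - SLOT_VAR0));
                } else {
                        uint32_t below = fixed_varying_mask & ((1u << location) - 1);
                        return 16 * __builtin_popcount(below);
                }
        }

        int main(void)
        {
                /* A vertex shader writing COL0 and TEX0 besides generic varyings. */
                uint32_t mask = (1u << SLOT_COL0) | (1u << SLOT_TEX0);

                assert(varying_base_bytes(mask, SLOT_COL0) == 0);
                assert(varying_base_bytes(mask, SLOT_TEX0) == 16);
                assert(varying_base_bytes(mask, SLOT_VAR0) == 32);     /* after 2 specials */
                assert(varying_base_bytes(mask, SLOT_VAR0 + 3) == 80); /* 16 * (2 + 3) */

                printf("offsets match the expected layout\n");
                return 0;
        }

Because both stages derive offsets from the slot and the same mask, a
separately compiled fragment shader reading VAR3 lands on the same 80-byte
offset the vertex shader stored to, which is the point of the scheme.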
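
Also for illustration (not part of the change): how the per-vertex allocation
in panfrost_emit_malloc_vertex is sized under the same assumption. The counts
and mask below are made-up inputs, and MAX2/__builtin_popcount stand in for
the util_* helpers used in the driver.

        #include <stdint.h>
        #include <stdio.h>

        #define MAX2(a, b) ((a) > (b) ? (a) : (b))

        /* General slots are the larger of what the VS writes and the FS reads,
         * plus one slot per special varying in the key's fixed_varying_mask.
         * The packet stride carries an extra 16 bytes, matching the patch. */
        static void
        size_varying_allocation(unsigned vs_output_count, unsigned fs_input_count,
                                uint32_t fixed_varying_mask,
                                unsigned *packet_stride, unsigned *attribute_stride)
        {
                unsigned slots = MAX2(vs_output_count, fs_input_count);
                slots += __builtin_popcount(fixed_varying_mask);

                unsigned size = slots * 16;
                *packet_stride = size + 16;
                *attribute_stride = size;
        }

        int main(void)
        {
                unsigned packet, attribute;

                /* VS writes 3 generic varyings, FS reads 2, plus COL0 and TEX0. */
                size_varying_allocation(3, 2, (1u << 1) | (1u << 4),
                                        &packet, &attribute);

                /* slots = max(3, 2) + 2 = 5 -> 80 bytes, packet stride 96 */
                printf("packet stride %u, attribute stride %u\n", packet, attribute);
                return 0;
        }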