pan/bi: Lift split/collect cache from AGX

Design based on ACO (and fruitful discussions with Daniel). Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16585>
2022-05-10 11:54:03 -04:00 · 2022-05-10 11:54:03 -04:00 · 7bfaa119f4
parent 8fdb01b96f
commit 7bfaa119f4
2 changed files with 147 additions and 34 deletions
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@ -1,5 +1,6 @@
 /*
 * Copyright (C) 2020 Collabora Ltd.
+ * Copyright (C) 2022 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@ -116,6 +117,144 @@ bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
        b->shader->current_block->unconditional_jumps = true;
 }

+/* Builds a 64-bit hash table key for an index */
+static uint64_t
+bi_index_to_key(bi_index idx)
+{
+        static_assert(sizeof(idx) <= sizeof(uint64_t), "too much padding");
+
+        uint64_t key = 0;
+        memcpy(&key, &idx, sizeof(idx));
+        return key;
+}
+
+/*
+ * Extract a single channel out of a vector source. We split vectors with SPLIT
+ * so we can use the split components directly, without emitting an extract.
+ * This has advantages of RA, as the split can usually be optimized away.
+ */
+static bi_index
+bi_extract(bi_builder *b, bi_index vec, unsigned channel)
+{
+        /* Extract caching relies on SSA form. It is incorrect for nir_register.
+         * Bypass the cache and emit an explicit split for registers.
+         */
+        if (vec.reg) {
+                bi_instr *I = bi_split_i32_to(b, bi_null(), vec);
+                I->nr_dests = channel + 1;
+                I->dest[channel] = bi_temp(b->shader);
+                return I->dest[channel];
+        }
+
+        bi_index *components =
+                _mesa_hash_table_u64_search(b->shader->allocated_vec,
+                                            bi_index_to_key(vec));
+
+        /* No extract needed for scalars.
+         *
+         * This is a bit imprecise, but actual bugs (missing splits for vectors)
+         * should be caught by the following assertion. It is too difficult to
+         * ensure bi_extract is only called for real vectors.
+         */
+        if (components == NULL && channel == 0)
+                return vec;
+
+        assert(components != NULL && "missing bi_cache_collect()");
+        return components[channel];
+}
+
+static void
+bi_cache_collect(bi_builder *b, bi_index dst, bi_index *s, unsigned n)
+{
+        /* Lifetime of a hash table entry has to be at least as long as the table */
+        bi_index *channels = ralloc_array(b->shader, bi_index, n);
+        memcpy(channels, s, sizeof(bi_index) * n);
+
+        _mesa_hash_table_u64_insert(b->shader->allocated_vec,
+                                    bi_index_to_key(dst), channels);
+}
+
+/*
+ * Splits an n-component vector (vec) into n scalar destinations (dests) using a
+ * split pseudo-instruction.
+ *
+ * Pre-condition: dests is filled with bi_null().
+ */
+static void
+bi_emit_split_i32(bi_builder *b, bi_index dests[4], bi_index vec, unsigned n)
+{
+        /* Setup the destinations */
+        for (unsigned i = 0; i < n; ++i) {
+                dests[i] = bi_temp(b->shader);
+        }
+
+        /* Emit the split */
+        if (n == 1) {
+                bi_mov_i32_to(b, dests[0], vec);
+        } else {
+                bi_instr *I = bi_split_i32_to(b, dests[0], vec);
+                I->nr_dests = n;
+
+                for (unsigned j = 1; j < n; ++j)
+                        I->dest[j] = dests[j];
+        }
+}
+
+static void
+bi_emit_cached_split_i32(bi_builder *b, bi_index vec, unsigned n)
+{
+        bi_index dests[4] = { bi_null(), bi_null(), bi_null(), bi_null() };
+        bi_emit_split_i32(b, dests, vec, n);
+        bi_cache_collect(b, vec, dests, n);
+}
+
+/*
+ * Emit and cache a split for a vector of a given bitsize. The vector may not be
+ * composed of 32-bit words, but it will be split at 32-bit word boundaries.
+ */
+static void
+bi_emit_cached_split(bi_builder *b, bi_index vec, unsigned bits)
+{
+        bi_emit_cached_split(b, vec, DIV_ROUND_UP(bits, 32));
+}
+
+static bi_instr *
+bi_emit_collect_to(bi_builder *b, bi_index dst, bi_index *chan, unsigned n)
+{
+        /* Special case: COLLECT of a single value is a scalar move */
+        if (n == 1)
+                return bi_mov_i32_to(b, dst, chan[0]);
+
+        bi_instr *I = bi_collect_i32_to(b, dst);
+        I->nr_srcs = n;
+
+        for (unsigned i = 0; i < n; ++i)
+                I->src[i] = chan[i];
+
+        bi_cache_collect(b, dst, chan, n);
+        return I;
+}
+
+static bi_instr *
+bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1)
+{
+        return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1 }, 2);
+}
+
+static bi_instr *
+bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1, bi_index s2)
+{
+        return bi_emit_collect_to(b, dst, (bi_index[]) { s0, s1, s2 }, 3);
+}
+
+static bi_index
+bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
+{
+        bi_index dst = bi_temp(b->shader);
+        bi_collect_v2i32_to(b, dst, s0, s1);
+        return dst;
+}
+
 static bi_index
 bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
 {
@ -154,7 +293,7 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
                        bi_index f[2];
                        for (unsigned i = 0; i < 2; ++i) {
                                f[i] = bi_fadd_rscale_f32(b,
-                                                bi_word(offset, i),
+                                                bi_extract(b, offset, i),
                                                bi_imm_f32(0.5), bi_imm_u32(8),
                                                BI_SPECIAL_NONE);
                        }
@ -225,39 +364,6 @@ bi_make_vec_to(bi_builder *b, bi_index final_dst,
                unsigned count,
                unsigned bitsize);

-static bi_instr *
-bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1)
-{
-        bi_instr *I = bi_collect_i32_to(b, dst);
-
-        I->nr_srcs = 2;
-        I->src[0] = s0;
-        I->src[1] = s1;
-
-        return I;
-}
-
-static bi_instr *
-bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1, bi_index s2)
-{
-        bi_instr *I = bi_collect_i32_to(b, dst);
-
-        I->nr_srcs = 3;
-        I->src[0] = s0;
-        I->src[1] = s1;
-        I->src[2] = s2;
-
-        return I;
-}
-
-static bi_index
-bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1)
-{
-        bi_index dst = bi_temp(b->shader);
-        bi_collect_v2i32_to(b, dst, s0, s1);
-        return dst;
-}
-
 /* Bifrost's load instructions lack a component offset despite operating in
 * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
 * but they may be unavoidable with separate shaders in use. To solve this, we
@ -4690,6 +4796,8 @@ bi_compile_variant_nir(nir_shader *nir,
                nir_print_shader(nir, stdout);
        }

+        ctx->allocated_vec = _mesa_hash_table_u64_create(ctx);
+
        nir_foreach_function(func, nir) {
                if (!func->impl)
                        continue;
--- a/src/panfrost/bifrost/compiler.h
+++ b/src/panfrost/bifrost/compiler.h
@ -766,6 +766,11 @@ typedef struct {
       /* Mask of UBOs that need to be uploaded */
       uint32_t ubo_mask;

+       /* During instruction selection, map from vector bi_index to its scalar
+        * components, populated by a split.
+        */
+       struct hash_table_u64 *allocated_vec;
+
       /* Stats for shader-db */
       unsigned instruction_count;
       unsigned loop_count;