From 8b36d107fdd6f6b91556fcdc3498df16803d4181 Mon Sep 17 00:00:00 2001
From: Eric Anholt <eric@anholt.net>
Date: Tue, 18 Aug 2015 21:26:05 -0700
Subject: [PATCH] vc4: Pack the unorm-packing bits into a src MUL instruction
 when possible.

Now that we do non-SSA QIR instructions, we can take a NIR SSA src that's
only used by the unorm packing and just stuff the pack bits into it.

total instructions in shared programs: 98136 -> 97974 (-0.17%)
instructions in affected programs:     4149 -> 3987 (-3.90%)
---
 src/gallium/drivers/vc4/vc4_opt_algebraic.c | 14 ++--
 src/gallium/drivers/vc4/vc4_program.c       | 77 ++++++++++++++++++---
 src/gallium/drivers/vc4/vc4_qir.c           | 23 ++++++
 src/gallium/drivers/vc4/vc4_qir.h           |  1 +
 src/gallium/drivers/vc4/vc4_qpu_emit.c      |  5 ++
 5 files changed, 104 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index e8c93dedfd2..77028bc026f 100644
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -229,18 +229,20 @@ qir_opt_algebraic(struct vc4_compile *c)
                         break;
 
                 case QOP_FMUL:
-                        if (replace_x_0_with_0(c, inst, 0) ||
-                            replace_x_0_with_0(c, inst, 1) ||
-                            fmul_replace_one(c, inst, 0) ||
-                            fmul_replace_one(c, inst, 1)) {
+                        if (!inst->dst.pack &&
+                            (replace_x_0_with_0(c, inst, 0) ||
+                             replace_x_0_with_0(c, inst, 1) ||
+                             fmul_replace_one(c, inst, 0) ||
+                             fmul_replace_one(c, inst, 1))) {
                                 progress = true;
                                 break;
                         }
                         break;
 
                 case QOP_MUL24:
-                        if (replace_x_0_with_0(c, inst, 0) ||
-                            replace_x_0_with_0(c, inst, 1)) {
+                        if (!inst->dst.pack &&
+                            (replace_x_0_with_0(c, inst, 0) ||
+                             replace_x_0_with_0(c, inst, 1))) {
                                 progress = true;
                                 break;
                         }
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index ff41779e6c1..6bf4c9eab9b 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -818,6 +818,72 @@ declare_uniform_range(struct vc4_compile *c, uint32_t start, uint32_t size)
         c->ubo_ranges[array_id].used = false;
 }
 
+static bool
+ntq_src_is_only_ssa_def_user(nir_src *src)
+{
+        if (!src->is_ssa)
+                return false;
+
+        if (!list_empty(&src->ssa->if_uses))
+                return false;
+
+        return (src->ssa->uses.next == &src->use_link &&
+                src->ssa->uses.next->next == &src->ssa->uses);
+}
+
+/**
+ * In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack
+ * bit set.
+ *
+ * However, as an optimization, it tries to find the instructions generating
+ * the sources to be packed and just emit the pack flag there, if possible.
+ */
+static void
+ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
+{
+        struct qreg result = qir_get_temp(c);
+        struct nir_alu_instr *vec4 = NULL;
+
+        /* If packing from a vec4 op (as expected), identify it so that we can
+         * peek back at what generated its sources.
+         */
+        if (instr->src[0].src.is_ssa &&
+            instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu &&
+            nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op ==
+            nir_op_vec4) {
+                vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+        }
+
+        for (int i = 0; i < 4; i++) {
+                int swiz = instr->src[0].swizzle[i];
+                struct qreg src;
+                if (vec4) {
+                        src = ntq_get_src(c, vec4->src[swiz].src,
+                                          vec4->src[swiz].swizzle[0]);
+                } else {
+                        src = ntq_get_src(c, instr->src[0].src, swiz);
+                }
+
+                if (vec4 &&
+                    ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) &&
+                    src.file == QFILE_TEMP &&
+                    c->defs[src.index] &&
+                    qir_is_mul(c->defs[src.index]) &&
+                    !c->defs[src.index]->dst.pack) {
+                        struct qinst *rewrite = c->defs[src.index];
+                        c->defs[src.index] = NULL;
+                        rewrite->dst = result;
+                        rewrite->dst.pack = QPU_PACK_MUL_8A + i;
+                        continue;
+                }
+
+                qir_PACK_8_F(c, result, src, i);
+        }
+
+        struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
+        *dest = result;
+}
+
 static void
 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
 {
@@ -839,16 +905,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         }
 
         if (instr->op == nir_op_pack_unorm_4x8) {
-                struct qreg result = qir_get_temp(c);
-
-                for (int i = 0; i < 4; i++) {
-                        qir_PACK_8_F(c, result,
-                                     ntq_get_src(c, instr->src[0].src,
-                                                 instr->src[0].swizzle[i]),
-                                     i);
-                }
-                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
-                *dest = result;
+                ntq_emit_pack_unorm_4x8(c, instr);
                 return;
         }
 
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index f27b2d2d949..92669a83010 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -286,6 +286,29 @@ qir_dump_inst(struct vc4_compile *c, struct qinst *inst)
                 inst->sf ? ".sf" : "");
 
         qir_print_reg(c, inst->dst, true);
+        if (inst->dst.pack) {
+                if (qir_is_mul(inst)) {
+                        switch (inst->dst.pack) {
+                        case QPU_PACK_MUL_8888:
+                                fprintf(stderr, ".8888");
+                                break;
+                        case QPU_PACK_MUL_8A:
+                                fprintf(stderr, ".8a");
+                                break;
+                        case QPU_PACK_MUL_8B:
+                                fprintf(stderr, ".8b");
+                                break;
+                        case QPU_PACK_MUL_8C:
+                                fprintf(stderr, ".8c");
+                                break;
+                        case QPU_PACK_MUL_8D:
+                                fprintf(stderr, ".8d");
+                                break;
+                        }
+                } else {
+                        unreachable("packs only set up for MULs so far.\n");
+                }
+        }
         for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                 fprintf(stderr, ", ");
                 qir_print_reg(c, inst->src[i], false);
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index c9ca3da203c..65d493dd558 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -58,6 +58,7 @@ enum qfile {
 struct qreg {
         enum qfile file;
         uint32_t index;
+        int pack;
 };
 
 enum qop {
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index ef35f33a0eb..bf614a2c1fd 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -507,6 +507,11 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                     dst,
                                                     src[0], src[1]));
+                                if (qinst->dst.pack) {
+                                        *last_inst(c) |= QPU_PM;
+                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
+                                                                       QPU_PACK);
+                                }
                         } else {
                                 queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                     dst,