vc4: Expose compares at a lower level in QIR.

Before, we had some special opcodes like CMP and SNE that emitted multiple
instructions.  Now, we reduce those operations significantly, giving
optimization more to look at for reducing redundant operations.

The downside is that QOP_SF is pretty special -- we're going to have to
track it separately when we're doing instruction scheduling, and we want
to peephole it into the instruction generating the destination write in
most cases (and not allocate the destination reg, probably.  Unless it's
used for some other purpose, as well).
This commit is contained in:
Eric Anholt 2014-08-24 16:51:32 -07:00
parent 3972a6f057
commit 874dfa8b2e
5 changed files with 148 additions and 79 deletions

View File

@ -45,9 +45,12 @@ qir_opt_algebraic(struct qcompile *c)
struct qinst *inst = (struct qinst *)node;
switch (inst->op) {
case QOP_CMP:
/* Turn "dst = (a < 0) ? b : b)" into "dst = b" */
if (qir_reg_equals(inst->src[1], inst->src[2])) {
case QOP_SEL_X_Y_ZS:
case QOP_SEL_X_Y_ZC:
case QOP_SEL_X_Y_NS:
case QOP_SEL_X_Y_NC:
/* Turn "dst = (sf == x) ? a : a)" into "dst = a" */
if (qir_reg_equals(inst->src[0], inst->src[1])) {
if (debug) {
fprintf(stderr, "optimizing: ");
qir_dump_inst(inst);

View File

@ -248,6 +248,58 @@ tgsi_to_qir_alu(struct tgsi_to_qir *trans,
return dst;
}
static struct qreg
tgsi_to_qir_seq(struct tgsi_to_qir *trans,
struct tgsi_full_instruction *tgsi_inst,
enum qop op, struct qreg *src, int i)
{
struct qcompile *c = trans->c;
qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i]));
return qir_SEL_X_0_ZS(c, qir_uniform_f(trans, 1.0));
}
static struct qreg
tgsi_to_qir_sne(struct tgsi_to_qir *trans,
struct tgsi_full_instruction *tgsi_inst,
enum qop op, struct qreg *src, int i)
{
struct qcompile *c = trans->c;
qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i]));
return qir_SEL_X_0_ZC(c, qir_uniform_f(trans, 1.0));
}
static struct qreg
tgsi_to_qir_slt(struct tgsi_to_qir *trans,
struct tgsi_full_instruction *tgsi_inst,
enum qop op, struct qreg *src, int i)
{
struct qcompile *c = trans->c;
qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i]));
return qir_SEL_X_0_NS(c, qir_uniform_f(trans, 1.0));
}
static struct qreg
tgsi_to_qir_sge(struct tgsi_to_qir *trans,
struct tgsi_full_instruction *tgsi_inst,
enum qop op, struct qreg *src, int i)
{
struct qcompile *c = trans->c;
qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i]));
return qir_SEL_X_0_NC(c, qir_uniform_f(trans, 1.0));
}
static struct qreg
tgsi_to_qir_cmp(struct tgsi_to_qir *trans,
struct tgsi_full_instruction *tgsi_inst,
enum qop op, struct qreg *src, int i)
{
struct qcompile *c = trans->c;
qir_SF(c, src[0 * 4 + i]);
return qir_SEL_X_Y_NS(c,
src[1 * 4 + i],
src[2 * 4 + i]);
}
static struct qreg
tgsi_to_qir_mad(struct tgsi_to_qir *trans,
struct tgsi_full_instruction *tgsi_inst,
@ -280,16 +332,15 @@ tgsi_to_qir_lit(struct tgsi_to_qir *trans,
case 2: {
struct qreg zero = qir_uniform_f(trans, 0.0);
qir_SF(c, x);
/* XXX: Clamp w to -128..128 */
return qir_CMP(c,
x,
zero,
qir_EXP2(c, qir_FMUL(c,
w,
qir_LOG2(c,
qir_FMAX(c,
y,
zero)))));
return qir_SEL_X_0_NC(c,
qir_EXP2(c, qir_FMUL(c,
w,
qir_LOG2(c,
qir_FMAX(c,
y,
zero)))));
}
default:
assert(!"not reached");
@ -415,10 +466,10 @@ tgsi_to_qir_frc(struct tgsi_to_qir *trans,
struct qcompile *c = trans->c;
struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src[0 * 4 + i]));
struct qreg diff = qir_FSUB(c, src[0 * 4 + i], trunc);
return qir_CMP(c,
diff,
qir_FADD(c, diff, qir_uniform_f(trans, 1.0)),
diff);
qir_SF(c, diff);
return qir_SEL_X_Y_NS(c,
qir_FADD(c, diff, qir_uniform_f(trans, 1.0)),
diff);
}
/**
@ -436,12 +487,11 @@ tgsi_to_qir_flr(struct tgsi_to_qir *trans,
/* This will be < 0 if we truncated and the truncation was of a value
* that was < 0 in the first place.
*/
struct qreg diff = qir_FSUB(c, src[0 * 4 + i], trunc);
qir_SF(c, qir_FSUB(c, src[0 * 4 + i], trunc));
return qir_CMP(c,
diff,
qir_FSUB(c, trunc, qir_uniform_f(trans, 1.0)),
trunc);
return qir_SEL_X_Y_NS(c,
qir_FSUB(c, trunc, qir_uniform_f(trans, 1.0)),
trunc);
}
static struct qreg
@ -613,10 +663,10 @@ tgsi_to_qir_kill_if(struct tgsi_to_qir *trans, struct qreg *src, int i)
if (trans->discard.file == QFILE_NULL)
trans->discard = qir_uniform_f(trans, 0.0);
trans->discard = qir_CMP(c,
src[0 * 4 + i],
qir_uniform_f(trans, 1.0),
trans->discard);
qir_SF(c, src[0 * 4 + i]);
trans->discard = qir_SEL_X_Y_NS(c,
qir_uniform_f(trans, 1.0),
trans->discard);
}
static void
@ -705,11 +755,11 @@ emit_tgsi_instruction(struct tgsi_to_qir *trans,
[TGSI_OPCODE_MIN] = { QOP_FMIN, tgsi_to_qir_alu },
[TGSI_OPCODE_MAX] = { QOP_FMAX, tgsi_to_qir_alu },
[TGSI_OPCODE_RSQ] = { QOP_RSQ, tgsi_to_qir_alu },
[TGSI_OPCODE_SEQ] = { QOP_SEQ, tgsi_to_qir_alu },
[TGSI_OPCODE_SNE] = { QOP_SNE, tgsi_to_qir_alu },
[TGSI_OPCODE_SGE] = { QOP_SGE, tgsi_to_qir_alu },
[TGSI_OPCODE_SLT] = { QOP_SLT, tgsi_to_qir_alu },
[TGSI_OPCODE_CMP] = { QOP_CMP, tgsi_to_qir_alu },
[TGSI_OPCODE_SEQ] = { 0, tgsi_to_qir_seq },
[TGSI_OPCODE_SNE] = { 0, tgsi_to_qir_sne },
[TGSI_OPCODE_SGE] = { 0, tgsi_to_qir_sge },
[TGSI_OPCODE_SLT] = { 0, tgsi_to_qir_slt },
[TGSI_OPCODE_CMP] = { 0, tgsi_to_qir_cmp },
[TGSI_OPCODE_MAD] = { 0, tgsi_to_qir_mad },
[TGSI_OPCODE_DP2] = { 0, tgsi_to_qir_dp2 },
[TGSI_OPCODE_DP3] = { 0, tgsi_to_qir_dp3 },

View File

@ -43,11 +43,15 @@ static const struct qir_op_info qir_op_info[] = {
[QOP_FMINABS] = { "fminabs", 1, 2 },
[QOP_FMAXABS] = { "fmaxabs", 1, 2 },
[QOP_SEQ] = { "seq", 1, 2 },
[QOP_SNE] = { "sne", 1, 2 },
[QOP_SGE] = { "sge", 1, 2 },
[QOP_SLT] = { "slt", 1, 2 },
[QOP_CMP] = { "cmp", 1, 3 },
[QOP_SF] = { "sf", 0, 1 },
[QOP_SEL_X_0_NS] = { "fsel_x_0_ns", 1, 1 },
[QOP_SEL_X_0_NC] = { "fsel_x_0_nc", 1, 1 },
[QOP_SEL_X_0_ZS] = { "fsel_x_0_zs", 1, 1 },
[QOP_SEL_X_0_ZC] = { "fsel_x_0_zc", 1, 1 },
[QOP_SEL_X_Y_NS] = { "fsel_x_y_ns", 1, 2 },
[QOP_SEL_X_Y_NC] = { "fsel_x_y_nc", 1, 2 },
[QOP_SEL_X_Y_ZS] = { "fsel_x_y_zs", 1, 2 },
[QOP_SEL_X_Y_ZC] = { "fsel_x_y_zc", 1, 2 },
[QOP_FTOI] = { "ftoi", 1, 1 },
[QOP_ITOF] = { "itof", 1, 1 },

View File

@ -54,11 +54,21 @@ enum qop {
QOP_FMINABS,
QOP_FMAXABS,
QOP_SEQ,
QOP_SNE,
QOP_SGE,
QOP_SLT,
QOP_CMP,
/* Sets the flag register according to src. */
QOP_SF,
/* Note: Orderings of these compares must be the same as in
 * qpu_defines.h.  Selects src[0] if the corresponding flag
 * (zero/negative, set/clear) condition holds, otherwise 0. */
QOP_SEL_X_0_ZS,
QOP_SEL_X_0_ZC,
QOP_SEL_X_0_NS,
QOP_SEL_X_0_NC,
/* Selects src[0] if the corresponding flag condition holds, otherwise src[1]. */
QOP_SEL_X_Y_ZS,
QOP_SEL_X_Y_ZC,
QOP_SEL_X_Y_NS,
QOP_SEL_X_Y_NC,
QOP_FTOI,
QOP_ITOF,
@ -260,6 +270,15 @@ QIR_ALU1(MOV)
QIR_ALU2(FADD)
QIR_ALU2(FSUB)
QIR_ALU2(FMUL)
QIR_NODST_1(SF)
QIR_ALU1(SEL_X_0_ZS)
QIR_ALU1(SEL_X_0_ZC)
QIR_ALU1(SEL_X_0_NS)
QIR_ALU1(SEL_X_0_NC)
QIR_ALU2(SEL_X_Y_ZS)
QIR_ALU2(SEL_X_Y_ZC)
QIR_ALU2(SEL_X_Y_NS)
QIR_ALU2(SEL_X_Y_NC)
QIR_ALU2(FMIN)
QIR_ALU2(FMAX)
QIR_ALU2(FMINABS)
@ -283,14 +302,6 @@ QIR_ALU0(FRAG_Z)
QIR_ALU0(FRAG_RCP_W)
QIR_NODST_1(TLB_DISCARD_SETUP)
static inline struct qreg
qir_CMP(struct qcompile *c, struct qreg cmp, struct qreg a, struct qreg b)
{
struct qreg t = qir_get_temp(c);
qir_emit(c, qir_inst4(QOP_CMP, t, cmp, a, b, c->undef));
return t;
}
static inline struct qreg
qir_R4_UNPACK(struct qcompile *c, int i)
{

View File

@ -60,6 +60,12 @@ last_inst(struct qcompile *c)
return &q->inst;
}
/* Rewrites the most recently queued QPU instruction so that its
 * add-pipeline result write is predicated on the given condition code
 * (one of the QPU_COND_* values).
 */
static void
set_last_cond_add(struct qcompile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}
/**
* This is used to resolve the fact that we might register-allocate two
* different operands of an instruction to the same physical register file
@ -278,13 +284,6 @@ vc4_generate_code(struct qcompile *c)
M(FMUL),
};
static const uint32_t compareflags[] = {
[QOP_SEQ - QOP_SEQ] = QPU_COND_ZS,
[QOP_SNE - QOP_SEQ] = QPU_COND_ZC,
[QOP_SLT - QOP_SEQ] = QPU_COND_NS,
[QOP_SGE - QOP_SEQ] = QPU_COND_NC,
};
struct qpu_reg src[4];
for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
int index = qinst->src[i].index;
@ -365,32 +364,36 @@ vc4_generate_code(struct qcompile *c)
}
break;
case QOP_CMP:
case QOP_SF:
fixup_raddr_conflict(c, src[0], &src[1]);
queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
*last_inst(c) |= QPU_SF;
queue(c, qpu_a_MOV(dst, src[1]));
*last_inst(c) = qpu_set_cond_add(*last_inst(c),
QPU_COND_NS);
queue(c, qpu_a_MOV(dst, src[2]));
*last_inst(c) = qpu_set_cond_add(*last_inst(c),
QPU_COND_NC);
break;
case QOP_SEQ:
case QOP_SNE:
case QOP_SGE:
case QOP_SLT:
fixup_raddr_conflict(c, src[0], &src[1]);
queue(c, qpu_a_FSUB(qpu_ra(QPU_W_NOP), src[0], src[1]));
*last_inst(c) |= QPU_SF;
case QOP_SEL_X_0_ZS:
case QOP_SEL_X_0_ZC:
case QOP_SEL_X_0_NS:
case QOP_SEL_X_0_NC:
queue(c, qpu_a_MOV(dst, src[0]));
set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
QPU_COND_ZS);
queue(c, qpu_load_imm_f(dst, 0.0));
queue(c, qpu_load_imm_f(dst, 1.0));
*last_inst(c) = qpu_set_cond_add(*last_inst(c),
compareflags[qinst->op - QOP_SEQ]);
queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
1) + QPU_COND_ZS);
break;
case QOP_SEL_X_Y_ZS:
case QOP_SEL_X_Y_ZC:
case QOP_SEL_X_Y_NS:
case QOP_SEL_X_Y_NC:
queue(c, qpu_a_MOV(dst, src[0]));
set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
QPU_COND_ZS);
queue(c, qpu_a_MOV(dst, src[1]));
set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
1) + QPU_COND_ZS);
break;
@ -475,8 +478,7 @@ vc4_generate_code(struct qcompile *c)
queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z),
qpu_rb(QPU_R_FRAG_PAYLOAD_ZW)));
if (discard) {
*last_inst(c) = qpu_set_cond_add(*last_inst(c),
QPU_COND_ZS);
set_last_cond_add(c, QPU_COND_ZS);
}
break;
@ -490,8 +492,7 @@ vc4_generate_code(struct qcompile *c)
case QOP_TLB_COLOR_WRITE:
queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
if (discard) {
*last_inst(c) = qpu_set_cond_add(*last_inst(c),
QPU_COND_ZS);
set_last_cond_add(c, QPU_COND_ZS);
}
break;