vc4: Reserve rb31 instead of r3 for raddr conflict spills.

This increases the cost of a raddr b conflict spill (save r3 to rb31,
move src1 to r3, then move rb31 back to r3 when done, instead of just
moving src1 to r3), but on average, thanks to instruction pairing, it's
more worthwhile to have another accumulator available.

total instructions in shared programs: 46428 -> 46171 (-0.55%)
instructions in affected programs:     38030 -> 37773 (-0.68%)
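
For context, a QPU ALU instruction can encode only one read address per
register file (one raddr_a row and one raddr_b row), so two sources that
live in the same file at different addresses can't be issued in a single
instruction. A minimal sketch of the conflict test, using hypothetical
stand-ins for the driver's real qpu_reg type and mux values:

    #include <stdbool.h>

    /* Hypothetical stand-ins for the driver's qpu_reg and mux enums. */
    struct qpu_reg { int mux; int addr; };
    enum { QPU_MUX_A = 6, QPU_MUX_B = 7 };

    /* Two sources conflict when both come from the same physical regfile
     * (A or B) but name different rows: the instruction word has only one
     * raddr field per file, so one source has to be staged elsewhere. */
    static bool
    raddr_conflict(struct qpu_reg s0, struct qpu_reg s1)
    {
            return (s0.mux == QPU_MUX_A || s0.mux == QPU_MUX_B) &&
                   s0.mux == s1.mux &&
                   s0.addr != s1.addr;
    }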
commit 8420a95692
parent ab1b1fa6fb
Author: Eric Anholt
Date:   2014-12-08 16:52:53 -08:00
2 changed files with 45 additions and 11 deletions

src/gallium/drivers/vc4/vc4_qpu_emit.c

@@ -93,21 +93,41 @@ swap_file(struct qpu_reg *src)
  * In that case, we need to move one to a temporary that can be used in the
  * instruction, instead.
  */
-static void
+static bool
 fixup_raddr_conflict(struct vc4_compile *c,
-                     struct qpu_reg *src0, struct qpu_reg *src1)
+                     struct qpu_reg dst,
+                     struct qpu_reg *src0, struct qpu_reg *src1,
+                     bool r3_live)
 {
         if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
             src0->mux != src1->mux ||
             src0->addr == src1->addr) {
-                return;
+                return false;
         }
 
         if (swap_file(src0) || swap_file(src1))
-                return;
+                return false;
 
-        queue(c, qpu_a_MOV(qpu_r3(), *src1));
-        *src1 = qpu_r3();
+        if (src0->mux == QPU_MUX_A) {
+                /* If we're conflicting over the A regfile, then we can just
+                 * use the reserved rb31.
+                 */
+                queue(c, qpu_a_MOV(qpu_rb(31), *src1));
+                *src1 = qpu_rb(31);
+                return false;
+        } else {
+                /* Otherwise, we need a non-B regfile.  So, we spill r3 out to
+                 * rb31, then store our desired value in r3, and tell the
+                 * caller to put rb31 back into r3 when we're done.
+                 */
+                if (r3_live)
+                        queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3()));
+                queue(c, qpu_a_MOV(qpu_r3(), *src1));
+                *src1 = qpu_r3();
+                return r3_live && dst.mux != QPU_MUX_R3;
+        }
 }
 
 void
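
The new return value is the caller's restore obligation: true means "copy
rb31 back into r3 after the instruction". The restore is skipped when the
instruction's own destination is r3, since that write supersedes the
spilled value and a restore would clobber the fresh result. A sketch of
the predicate, reusing the hypothetical types above (QPU_MUX_R3 = 3 is an
assumed value for the r3 accumulator's mux):

    enum { QPU_MUX_R3 = 3 };

    /* Restore r3 from rb31 only when r3 carried a live value that we
     * spilled AND the instruction doesn't itself write r3. */
    static bool
    must_restore_r3(bool r3_live, struct qpu_reg dst)
    {
            return r3_live && dst.mux != QPU_MUX_R3;
    }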
@@ -118,6 +138,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
         uint32_t inputs_remaining = c->num_inputs;
         uint32_t vpm_read_fifo_count = 0;
         uint32_t vpm_read_offset = 0;
+        bool written_r3 = false;
+        bool needs_restore;
 
         make_empty_list(&c->qpu_inst_list);
 
@@ -416,8 +438,12 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         break;
 
                 case QOP_TEX_DIRECT:
-                        fixup_raddr_conflict(c, &src[0], &src[1]);
+                        needs_restore = fixup_raddr_conflict(c, dst,
+                                                             &src[0], &src[1],
+                                                             written_r3);
                         queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
+                        if (needs_restore)
+                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
                         break;
 
                 case QOP_TEX_RESULT:
@@ -477,7 +503,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         if (qir_get_op_nsrc(qinst->op) == 1)
                                 src[1] = src[0];
 
-                        fixup_raddr_conflict(c, &src[0], &src[1]);
+                        needs_restore = fixup_raddr_conflict(c, dst,
+                                                             &src[0], &src[1],
+                                                             written_r3);
 
                         if (translate[qinst->op].is_mul) {
                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
@@ -488,8 +516,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                                     dst,
                                                     src[0], src[1]));
                         }
+
+                        if (needs_restore)
+                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
+
                         break;
                 }
+
+                if (dst.mux == QPU_MUX_R3)
+                        written_r3 = true;
         }
 
         qpu_schedule_instructions(c);
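
To make the trade-off in the commit message concrete: the worst-case
B-file conflict now emits three moves around the real instruction instead
of one. A hedged sketch of that path, using the helpers visible in the
diff (queue(), qpu_a_MOV(), qpu_r3(), qpu_rb()):

    /* Worst case: src0 and src1 both live in regfile B at different rows,
     * r3 is live, and the destination isn't r3. */
    static void
    sketch_b_conflict_spill(struct vc4_compile *c, struct qpu_reg src1)
    {
            queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3())); /* save live r3 to rb31 */
            queue(c, qpu_a_MOV(qpu_r3(), src1));       /* stage src1 in r3 */
            /* ... emit the conflicting instruction, reading r3 as src1 ... */
            queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31))); /* put r3 back */
    }

That is the expensive path; in exchange, r3 joins the allocatable
accumulators for every other instruction, which is what buys the scheduler
more instruction pairing and the net -0.55% above.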

src/gallium/drivers/vc4/vc4_register_allocate.c

@@ -117,10 +117,10 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
         vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
         for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
-                /* Reserve r3 for now, since we're using it for spilling-like
-                 * operations in vc4_qpu_emit.c
+                /* Reserve rb31 for spilling fixup_raddr_conflict() in
+                 * vc4_qpu_emit.c
                  */
-                if (vc4_regs[i].mux == QPU_MUX_R3)
+                if (vc4_regs[i].mux == QPU_MUX_B && vc4_regs[i].addr == 31)
                         continue;
 
                 /* R4 can't be written as a general purpose register. (it's