diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c index f8f1365f658..d20ee5e227d 100644 --- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c +++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c @@ -90,6 +90,14 @@ try_copy_prop(struct vc4_compile *c, struct qinst *inst, struct qinst **movs) continue; } + /* Mul rotation's source needs to be in an r0-r3 accumulator, + * so no uniforms or regfile-a/r4 unpacking allowed. + */ + if (inst->op == QOP_ROT_MUL && + (mov->src[0].file != QFILE_TEMP || + mov->src[0].pack)) + continue; + uint8_t unpack; if (mov->src[0].pack) { /* Make sure that the meaning of the unpack diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index 9b4a28ebab6..446af66affd 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -86,6 +86,8 @@ static const struct qir_op_info qir_op_info[] = { [QOP_LOAD_IMM_U2] = { "load_imm_u2", 0, 1 }, [QOP_LOAD_IMM_I2] = { "load_imm_i2", 0, 1 }, + [QOP_ROT_MUL] = { "rot_mul", 0, 2 }, + [QOP_BRANCH] = { "branch", 0, 0, true }, [QOP_UNIFORMS_RESET] = { "uniforms_reset", 0, 2, true }, }; @@ -164,6 +166,7 @@ qir_is_mul(struct qinst *inst) case QOP_V8MAX: case QOP_V8ADDS: case QOP_V8SUBS: + case QOP_ROT_MUL: return true; default: return false; diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index 90cc1385043..a82c47c0341 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -168,6 +168,8 @@ enum qop { */ QOP_LOAD_IMM_I2, + QOP_ROT_MUL, + /* Jumps to block->successor[0] if the qinst->cond (as a * QPU_COND_BRANCH_*) passes, or block->successor[1] if not. Note * that block->successor[1] may be unset if the condition is ALWAYS. 
@@ -822,6 +824,22 @@ qir_LOAD_IMM_I2(struct vc4_compile *c, uint32_t val)
                                         c->undef));
 }
 
+/**
+ * Emits a QOP_ROT_MUL that shifts the multiply output of val to the right by
+ * rot channels.
+ *
+ * The amount travels as a QFILE_LOAD_IMM source whose value is
+ * QPU_SMALL_IMM_MUL_ROT + rot.
+ */
+static inline struct qreg
+qir_ROT_MUL(struct vc4_compile *c, struct qreg val, uint32_t rot)
+{
+        struct qreg rot_imm = qir_reg(QFILE_LOAD_IMM,
+                                      QPU_SMALL_IMM_MUL_ROT + rot);
+
+        return qir_emit_def(c, qir_inst(QOP_ROT_MUL, c->undef, val, rot_imm));
+}
+
 static inline void
 qir_MOV_cond(struct vc4_compile *c, uint8_t cond,
              struct qreg dest, struct qreg src)
diff --git a/src/gallium/drivers/vc4/vc4_qpu.c b/src/gallium/drivers/vc4/vc4_qpu.c
index d022d107eb3..67850a8114a 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.c
+++ b/src/gallium/drivers/vc4/vc4_qpu.c
@@ -234,6 +234,22 @@ qpu_m_alu2(enum qpu_op_mul op,
         return inst;
 }
 
+/**
+ * Builds a MUL-unit V8MIN of src0 against itself, written to dst, with the
+ * signal field set to QPU_SIG_SMALL_IMM and the small-immediate field set to
+ * QPU_SMALL_IMM_MUL_ROT + rot.
+ */
+uint64_t
+qpu_m_rot(struct qpu_reg dst, struct qpu_reg src0, int rot)
+{
+        uint64_t rotated = qpu_m_alu2(QPU_M_V8MIN, dst, src0, src0);
+
+        rotated = QPU_UPDATE_FIELD(rotated, QPU_SIG_SMALL_IMM, QPU_SIG);
+
+        return QPU_UPDATE_FIELD(rotated, QPU_SMALL_IMM_MUL_ROT + rot,
+                                QPU_SMALL_IMM);
+}
+
 static bool
 merge_fields(uint64_t *merge,
              uint64_t a, uint64_t b,
diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h
index 437e4f5e5a4..5ec80f05375 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.h
+++ b/src/gallium/drivers/vc4/vc4_qpu.h
@@ -150,6 +150,7 @@ uint64_t qpu_set_sig(uint64_t inst, uint32_t sig) ATTRIBUTE_CONST;
 uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST;
 uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond) ATTRIBUTE_CONST;
 uint32_t qpu_encode_small_immediate(uint32_t i) ATTRIBUTE_CONST;
+uint64_t qpu_m_rot(struct qpu_reg dst, struct qpu_reg src, int rot) ATTRIBUTE_CONST;
 
 bool qpu_waddr_is_tlb(uint32_t waddr) ATTRIBUTE_CONST;
 bool qpu_inst_is_tlb(uint64_t inst) ATTRIBUTE_CONST;
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index f5a5b8a862a..79588b3f51c 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ 
b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -434,6 +434,25 @@ vc4_generate_code_block(struct vc4_compile *c,
 
                 case QOP_LOAD_IMM_I2:
                         queue(block, qpu_load_imm_i2(dst, qinst->src[0].index));
+                        break;
+
+                case QOP_ROT_MUL:
+                        /* Rotation at the hardware level occurs on the inputs
+                         * to the MUL unit, and they must be accumulators in
+                         * order to have the time necessary to move things.
+                         */
+                        assert(src[0].mux <= QPU_MUX_R3);
+
+                        /* src[1] carries QPU_SMALL_IMM_MUL_ROT + rot (see
+                         * qir_ROT_MUL()), and qpu_m_rot() adds that base
+                         * back, so hand it just the channel count.
+                         */
+                        queue(block,
+                              qpu_m_rot(dst, src[0], qinst->src[1].index -
+                                        QPU_SMALL_IMM_MUL_ROT) | unpack);
+                        set_last_cond_mul(block, qinst->cond);
+                        handled_qinst_cond = true;
+                        set_last_dst_pack(block, qinst);
                         break;
 
                 case QOP_MS_MASK: