vc4: Expose compares at a lower level in QIR.

Before, we had some special opcodes like CMP and SNE that emitted multiple
instructions.  Now, we reduce those operations significantly, giving
optimization more to look at for reducing redundant operations.

The downside is that QOP_SF is pretty special -- we're going to have to
track it separately when we're doing instruction scheduling, and we want
to peephole it into the instruction generating the destination write in
most cases (and not allocate the destination reg, probably.  Unless it's
used for some other purpose, as well).
This commit is contained in:
Eric Anholt 2014-08-24 16:51:32 -07:00
parent 3972a6f057
commit 874dfa8b2e
5 changed files with 148 additions and 79 deletions

View File

@ -45,9 +45,12 @@ qir_opt_algebraic(struct qcompile *c)
struct qinst *inst = (struct qinst *)node;
switch (inst->op) {
case QOP_CMP:
/* Turn "dst = (a < 0) ? b : b)" into "dst = b" */
if (qir_reg_equals(inst->src[1], inst->src[2])) {
case QOP_SEL_X_Y_ZS:
case QOP_SEL_X_Y_ZC:
case QOP_SEL_X_Y_NS:
case QOP_SEL_X_Y_NC:
/* Turn "dst = (sf == x) ? a : a)" into "dst = a" */
if (qir_reg_equals(inst->src[0], inst->src[1])) {
if (debug) {
fprintf(stderr, "optimizing: ");
qir_dump_inst(inst);

View File

@ -248,6 +248,58 @@ tgsi_to_qir_alu(struct tgsi_to_qir *trans,
return dst;
}
static struct qreg
tgsi_to_qir_seq(struct tgsi_to_qir *trans,
struct tgsi_full_instruction *tgsi_inst,
enum qop op, struct qreg *src, int i)
{
struct qcompile *c = trans->c;
qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i]));
return qir_SEL_X_0_ZS(c, qir_uniform_f(trans, 1.0));
}
static struct qreg
tgsi_to_qir_sne(struct tgsi_to_qir *trans,
struct tgsi_full_instruction *tgsi_inst,
enum qop op, struct qreg *src, int i)
{
struct qcompile *c = trans->c;
qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i]));
return qir_SEL_X_0_ZC(c, qir_uniform_f(trans, 1.0));
}
static struct qreg
tgsi_to_qir_slt(struct tgsi_to_qir *trans,
struct tgsi_full_instruction *tgsi_inst,
enum qop op, struct qreg *src, int i)
{
struct qcompile *c = trans->c;
qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i]));
return qir_SEL_X_0_NS(c, qir_uniform_f(trans, 1.0));
}
static struct qreg
tgsi_to_qir_sge(struct tgsi_to_qir *trans,
struct tgsi_full_instruction *tgsi_inst,
enum qop op, struct qreg *src, int i)
{
struct qcompile *c = trans->c;
qir_SF(c, qir_FSUB(c, src[0 * 4 + i], src[1 * 4 + i]));
return qir_SEL_X_0_NC(c, qir_uniform_f(trans, 1.0));
}
static struct qreg
tgsi_to_qir_cmp(struct tgsi_to_qir *trans,
struct tgsi_full_instruction *tgsi_inst,
enum qop op, struct qreg *src, int i)
{
struct qcompile *c = trans->c;
qir_SF(c, src[0 * 4 + i]);
return qir_SEL_X_Y_NS(c,
src[1 * 4 + i],
src[2 * 4 + i]);
}
static struct qreg
tgsi_to_qir_mad(struct tgsi_to_qir *trans,
struct tgsi_full_instruction *tgsi_inst,
@ -280,16 +332,15 @@ tgsi_to_qir_lit(struct tgsi_to_qir *trans,
case 2: {
struct qreg zero = qir_uniform_f(trans, 0.0);
qir_SF(c, x);
/* XXX: Clamp w to -128..128 */
return qir_CMP(c,
x,
zero,
qir_EXP2(c, qir_FMUL(c,
w,
qir_LOG2(c,
qir_FMAX(c,
y,
zero)))));
return qir_SEL_X_0_NC(c,
qir_EXP2(c, qir_FMUL(c,
w,
qir_LOG2(c,
qir_FMAX(c,
y,
zero)))));
}
default:
assert(!"not reached");
@ -415,10 +466,10 @@ tgsi_to_qir_frc(struct tgsi_to_qir *trans,
struct qcompile *c = trans->c;
struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src[0 * 4 + i]));
struct qreg diff = qir_FSUB(c, src[0 * 4 + i], trunc);
return qir_CMP(c,
diff,
qir_FADD(c, diff, qir_uniform_f(trans, 1.0)),
diff);
qir_SF(c, diff);
return qir_SEL_X_Y_NS(c,
qir_FADD(c, diff, qir_uniform_f(trans, 1.0)),
diff);
}
/**
@ -436,12 +487,11 @@ tgsi_to_qir_flr(struct tgsi_to_qir *trans,
/* This will be < 0 if we truncated and the truncation was of a value
* that was < 0 in the first place.
*/
struct qreg diff = qir_FSUB(c, src[0 * 4 + i], trunc);
qir_SF(c, qir_FSUB(c, src[0 * 4 + i], trunc));
return qir_CMP(c,
diff,
qir_FSUB(c, trunc, qir_uniform_f(trans, 1.0)),
trunc);
return qir_SEL_X_Y_NS(c,
qir_FSUB(c, trunc, qir_uniform_f(trans, 1.0)),
trunc);
}
static struct qreg
@ -613,10 +663,10 @@ tgsi_to_qir_kill_if(struct tgsi_to_qir *trans, struct qreg *src, int i)
if (trans->discard.file == QFILE_NULL)
trans->discard = qir_uniform_f(trans, 0.0);
trans->discard = qir_CMP(c,
src[0 * 4 + i],
qir_uniform_f(trans, 1.0),
trans->discard);
qir_SF(c, src[0 * 4 + i]);
trans->discard = qir_SEL_X_Y_NS(c,
qir_uniform_f(trans, 1.0),
trans->discard);
}
static void
@ -705,11 +755,11 @@ emit_tgsi_instruction(struct tgsi_to_qir *trans,
[TGSI_OPCODE_MIN] = { QOP_FMIN, tgsi_to_qir_alu },
[TGSI_OPCODE_MAX] = { QOP_FMAX, tgsi_to_qir_alu },
[TGSI_OPCODE_RSQ] = { QOP_RSQ, tgsi_to_qir_alu },
[TGSI_OPCODE_SEQ] = { QOP_SEQ, tgsi_to_qir_alu },
[TGSI_OPCODE_SNE] = { QOP_SNE, tgsi_to_qir_alu },
[TGSI_OPCODE_SGE] = { QOP_SGE, tgsi_to_qir_alu },
[TGSI_OPCODE_SLT] = { QOP_SLT, tgsi_to_qir_alu },
[TGSI_OPCODE_CMP] = { QOP_CMP, tgsi_to_qir_alu },
[TGSI_OPCODE_SEQ] = { 0, tgsi_to_qir_seq },
[TGSI_OPCODE_SNE] = { 0, tgsi_to_qir_sne },
[TGSI_OPCODE_SGE] = { 0, tgsi_to_qir_sge },
[TGSI_OPCODE_SLT] = { 0, tgsi_to_qir_slt },
[TGSI_OPCODE_CMP] = { 0, tgsi_to_qir_cmp },
[TGSI_OPCODE_MAD] = { 0, tgsi_to_qir_mad },
[TGSI_OPCODE_DP2] = { 0, tgsi_to_qir_dp2 },
[TGSI_OPCODE_DP3] = { 0, tgsi_to_qir_dp3 },

View File

@ -43,11 +43,15 @@ static const struct qir_op_info qir_op_info[] = {
[QOP_FMINABS] = { "fminabs", 1, 2 },
[QOP_FMAXABS] = { "fmaxabs", 1, 2 },
[QOP_SEQ] = { "seq", 1, 2 },
[QOP_SNE] = { "sne", 1, 2 },
[QOP_SGE] = { "sge", 1, 2 },
[QOP_SLT] = { "slt", 1, 2 },
[QOP_CMP] = { "cmp", 1, 3 },
[QOP_SF] = { "sf", 0, 1 },
[QOP_SEL_X_0_NS] = { "fsel_x_0_ns", 1, 1 },
[QOP_SEL_X_0_NC] = { "fsel_x_0_nc", 1, 1 },
[QOP_SEL_X_0_ZS] = { "fsel_x_0_zs", 1, 1 },
[QOP_SEL_X_0_ZC] = { "fsel_x_0_zc", 1, 1 },
[QOP_SEL_X_Y_NS] = { "fsel_x_y_ns", 1, 2 },
[QOP_SEL_X_Y_NC] = { "fsel_x_y_nc", 1, 2 },
[QOP_SEL_X_Y_ZS] = { "fsel_x_y_zs", 1, 2 },
[QOP_SEL_X_Y_ZC] = { "fsel_x_y_zc", 1, 2 },
[QOP_FTOI] = { "ftoi", 1, 1 },
[QOP_ITOF] = { "itof", 1, 1 },

View File

@ -54,11 +54,21 @@ enum qop {
QOP_FMINABS,
QOP_FMAXABS,
QOP_SEQ,
QOP_SNE,
QOP_SGE,
QOP_SLT,
QOP_CMP,
/* Sets the flag register according to src. */
QOP_SF,
/* Note: Orderings of these compares must be the same as in
 * qpu_defines.h.  Selects src[0] if the corresponding flag
 * (zero/negative, set/clear) condition holds, otherwise 0. */
QOP_SEL_X_0_ZS,
QOP_SEL_X_0_ZC,
QOP_SEL_X_0_NS,
QOP_SEL_X_0_NC,
/* Selects src[0] if the corresponding flag condition holds, otherwise src[1]. */
QOP_SEL_X_Y_ZS,
QOP_SEL_X_Y_ZC,
QOP_SEL_X_Y_NS,
QOP_SEL_X_Y_NC,
QOP_FTOI,
QOP_ITOF,
@ -260,6 +270,15 @@ QIR_ALU1(MOV)
QIR_ALU2(FADD)
QIR_ALU2(FSUB)
QIR_ALU2(FMUL)
QIR_NODST_1(SF)
QIR_ALU1(SEL_X_0_ZS)
QIR_ALU1(SEL_X_0_ZC)
QIR_ALU1(SEL_X_0_NS)
QIR_ALU1(SEL_X_0_NC)
QIR_ALU2(SEL_X_Y_ZS)
QIR_ALU2(SEL_X_Y_ZC)
QIR_ALU2(SEL_X_Y_NS)
QIR_ALU2(SEL_X_Y_NC)
QIR_ALU2(FMIN)
QIR_ALU2(FMAX)
QIR_ALU2(FMINABS)
@ -283,14 +302,6 @@ QIR_ALU0(FRAG_Z)
QIR_ALU0(FRAG_RCP_W)
QIR_NODST_1(TLB_DISCARD_SETUP)
static inline struct qreg
qir_CMP(struct qcompile *c, struct qreg cmp, struct qreg a, struct qreg b)
{
struct qreg t = qir_get_temp(c);
qir_emit(c, qir_inst4(QOP_CMP, t, cmp, a, b, c->undef));
return t;
}
static inline struct qreg
qir_R4_UNPACK(struct qcompile *c, int i)
{

View File

@ -60,6 +60,12 @@ last_inst(struct qcompile *c)
return &q->inst;
}
/* Rewrites the most recently queued QPU instruction so that its
 * add-pipeline result write is predicated on the given condition code
 * (one of the QPU_COND_* values).
 */
static void
set_last_cond_add(struct qcompile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}
/**
* This is used to resolve the fact that we might register-allocate two
* different operands of an instruction to the same physical register file
@ -278,13 +284,6 @@ vc4_generate_code(struct qcompile *c)
M(FMUL),
};
static const uint32_t compareflags[] = {
[QOP_SEQ - QOP_SEQ] = QPU_COND_ZS,
[QOP_SNE - QOP_SEQ] = QPU_COND_ZC,
[QOP_SLT - QOP_SEQ] = QPU_COND_NS,
[QOP_SGE - QOP_SEQ] = QPU_COND_NC,
};
struct qpu_reg src[4];
for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
int index = qinst->src[i].index;
@ -365,32 +364,36 @@ vc4_generate_code(struct qcompile *c)
}
break;
case QOP_CMP:
case QOP_SF:
fixup_raddr_conflict(c, src[0], &src[1]);
queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
*last_inst(c) |= QPU_SF;
queue(c, qpu_a_MOV(dst, src[1]));
*last_inst(c) = qpu_set_cond_add(*last_inst(c),
QPU_COND_NS);
queue(c, qpu_a_MOV(dst, src[2]));
*last_inst(c) = qpu_set_cond_add(*last_inst(c),
QPU_COND_NC);
break;
case QOP_SEQ:
case QOP_SNE:
case QOP_SGE:
case QOP_SLT:
fixup_raddr_conflict(c, src[0], &src[1]);
queue(c, qpu_a_FSUB(qpu_ra(QPU_W_NOP), src[0], src[1]));
*last_inst(c) |= QPU_SF;
case QOP_SEL_X_0_ZS:
case QOP_SEL_X_0_ZC:
case QOP_SEL_X_0_NS:
case QOP_SEL_X_0_NC:
queue(c, qpu_a_MOV(dst, src[0]));
set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
QPU_COND_ZS);
queue(c, qpu_load_imm_f(dst, 0.0));
queue(c, qpu_load_imm_f(dst, 1.0));
*last_inst(c) = qpu_set_cond_add(*last_inst(c),
compareflags[qinst->op - QOP_SEQ]);
queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
1) + QPU_COND_ZS);
break;
case QOP_SEL_X_Y_ZS:
case QOP_SEL_X_Y_ZC:
case QOP_SEL_X_Y_NS:
case QOP_SEL_X_Y_NC:
queue(c, qpu_a_MOV(dst, src[0]));
set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
QPU_COND_ZS);
queue(c, qpu_a_MOV(dst, src[1]));
set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
1) + QPU_COND_ZS);
break;
@ -475,8 +478,7 @@ vc4_generate_code(struct qcompile *c)
queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z),
qpu_rb(QPU_R_FRAG_PAYLOAD_ZW)));
if (discard) {
*last_inst(c) = qpu_set_cond_add(*last_inst(c),
QPU_COND_ZS);
set_last_cond_add(c, QPU_COND_ZS);
}
break;
@ -490,8 +492,7 @@ vc4_generate_code(struct qcompile *c)
case QOP_TLB_COLOR_WRITE:
queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
if (discard) {
*last_inst(c) = qpu_set_cond_add(*last_inst(c),
QPU_COND_ZS);
set_last_cond_add(c, QPU_COND_ZS);
}
break;