From 8b36d107fdd6f6b91556fcdc3498df16803d4181 Mon Sep 17 00:00:00 2001
From: Eric Anholt
Date: Tue, 18 Aug 2015 21:26:05 -0700
Subject: [PATCH] vc4: Pack the unorm-packing bits into a src MUL instruction
 when possible.

Now that we do non-SSA QIR instructions, we can take a NIR SSA src that's
only used by the unorm packing and just stuff the pack bits into it.

total instructions in shared programs: 98136 -> 97974 (-0.17%)
instructions in affected programs:     4149 -> 3987 (-3.90%)
---
 src/gallium/drivers/vc4/vc4_opt_algebraic.c | 14 ++--
 src/gallium/drivers/vc4/vc4_program.c       | 77 ++++++++++++++++++---
 src/gallium/drivers/vc4/vc4_qir.c           | 23 ++++++
 src/gallium/drivers/vc4/vc4_qir.h           |  1 +
 src/gallium/drivers/vc4/vc4_qpu_emit.c      |  5 ++
 5 files changed, 104 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index e8c93dedfd2..77028bc026f 100644
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -229,18 +229,20 @@ qir_opt_algebraic(struct vc4_compile *c)
                         break;
 
                 case QOP_FMUL:
-                        if (replace_x_0_with_0(c, inst, 0) ||
-                            replace_x_0_with_0(c, inst, 1) ||
-                            fmul_replace_one(c, inst, 0) ||
-                            fmul_replace_one(c, inst, 1)) {
+                        if (!inst->dst.pack &&
+                            (replace_x_0_with_0(c, inst, 0) ||
+                             replace_x_0_with_0(c, inst, 1) ||
+                             fmul_replace_one(c, inst, 0) ||
+                             fmul_replace_one(c, inst, 1))) {
                                 progress = true;
                                 break;
                         }
                         break;
 
                 case QOP_MUL24:
-                        if (replace_x_0_with_0(c, inst, 0) ||
-                            replace_x_0_with_0(c, inst, 1)) {
+                        if (!inst->dst.pack &&
+                            (replace_x_0_with_0(c, inst, 0) ||
+                             replace_x_0_with_0(c, inst, 1))) {
                                 progress = true;
                                 break;
                         }
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index ff41779e6c1..6bf4c9eab9b 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -818,6 +818,72 @@ declare_uniform_range(struct vc4_compile *c, uint32_t start, uint32_t size)
         c->ubo_ranges[array_id].used = false;
 }
 
+static bool
+ntq_src_is_only_ssa_def_user(nir_src *src)
+{
+        if (!src->is_ssa)
+                return false;
+
+        if (!list_empty(&src->ssa->if_uses))
+                return false;
+
+        return (src->ssa->uses.next == &src->use_link &&
+                src->ssa->uses.next->next == &src->ssa->uses);
+}
+
+/**
+ * In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack
+ * bit set.
+ *
+ * However, as an optimization, it tries to find the instructions generating
+ * the sources to be packed and just emit the pack flag there, if possible.
+ */
+static void
+ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
+{
+        struct qreg result = qir_get_temp(c);
+        struct nir_alu_instr *vec4 = NULL;
+
+        /* If packing from a vec4 op (as expected), identify it so that we can
+         * peek back at what generated its sources.
+         */
+        if (instr->src[0].src.is_ssa &&
+            instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu &&
+            nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op ==
+            nir_op_vec4) {
+                vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+        }
+
+        for (int i = 0; i < 4; i++) {
+                int swiz = instr->src[0].swizzle[i];
+                struct qreg src;
+                if (vec4) {
+                        src = ntq_get_src(c, vec4->src[swiz].src,
+                                          vec4->src[swiz].swizzle[0]);
+                } else {
+                        src = ntq_get_src(c, instr->src[0].src, swiz);
+                }
+
+                if (vec4 &&
+                    ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) &&
+                    src.file == QFILE_TEMP &&
+                    c->defs[src.index] &&
+                    qir_is_mul(c->defs[src.index]) &&
+                    !c->defs[src.index]->dst.pack) {
+                        struct qinst *rewrite = c->defs[src.index];
+                        c->defs[src.index] = NULL;
+                        rewrite->dst = result;
+                        rewrite->dst.pack = QPU_PACK_MUL_8A + i;
+                        continue;
+                }
+
+                qir_PACK_8_F(c, result, src, i);
+        }
+
+        struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
+        *dest = result;
+}
+
 static void
 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
 {
@@ -839,16 +905,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         }
 
         if (instr->op == nir_op_pack_unorm_4x8) {
-                struct qreg result = qir_get_temp(c);
-
-                for (int i = 0; i < 4; i++) {
-                        qir_PACK_8_F(c, result,
-                                     ntq_get_src(c, instr->src[0].src,
-                                                 instr->src[0].swizzle[i]),
-                                     i);
-                }
-                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
-                *dest = result;
+                ntq_emit_pack_unorm_4x8(c, instr);
                 return;
         }
 
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index f27b2d2d949..92669a83010 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -286,6 +286,29 @@ qir_dump_inst(struct vc4_compile *c, struct qinst *inst)
                 inst->sf ? ".sf" : "");
 
         qir_print_reg(c, inst->dst, true);
+        if (inst->dst.pack) {
+                if (qir_is_mul(inst)) {
+                        switch (inst->dst.pack) {
+                        case QPU_PACK_MUL_8888:
+                                fprintf(stderr, ".8888");
+                                break;
+                        case QPU_PACK_MUL_8A:
+                                fprintf(stderr, ".8a");
+                                break;
+                        case QPU_PACK_MUL_8B:
+                                fprintf(stderr, ".8b");
+                                break;
+                        case QPU_PACK_MUL_8C:
+                                fprintf(stderr, ".8c");
+                                break;
+                        case QPU_PACK_MUL_8D:
+                                fprintf(stderr, ".8d");
+                                break;
+                        }
+                } else {
+                        unreachable("packs only set up for MULs so far.\n");
+                }
+        }
         for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                 fprintf(stderr, ", ");
                 qir_print_reg(c, inst->src[i], false);
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index c9ca3da203c..65d493dd558 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -58,6 +58,7 @@ enum qfile {
 struct qreg {
         enum qfile file;
         uint32_t index;
+        int pack;
 };
 
 enum qop {
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index ef35f33a0eb..bf614a2c1fd 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -507,6 +507,11 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                     dst,
                                                     src[0], src[1]));
+                                if (qinst->dst.pack) {
+                                        *last_inst(c) |= QPU_PM;
+                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
+                                                                       QPU_PACK);
+                                }
                         } else {
                                 queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                     dst,