From f7d44a46cd424e797a38ef732360e546f093f0ae Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Sat, 15 Jan 2022 12:26:42 -0500
Subject: [PATCH] pan/bi: Optimize replication
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bifrost's 16-bit support comes in the form of vectorized instructions,
so when we manipulate scalars, we usually replicate to both bottom and
top halves of 32-bit registers. Add an analysis pass that detects
replication. Then, use that replication pass to optimize out useless
swizzle instructions (by changing them to plain moves, which can be
copypropped).

This optimization is a slight shader-db win on its own, and allows us to
transition to lower_bool_to_bitsize without regressing shader-db.

total instructions in shared programs: 90323 -> 90257 (-0.07%)
instructions in affected programs: 2513 -> 2447 (-2.63%)
helped: 20
HURT: 0
helped stats (abs) min: 1.0 max: 16.0 x̄: 3.30 x̃: 2
helped stats (rel) min: 1.25% max: 11.11% x̄: 4.80% x̃: 4.29%
95% mean confidence interval for instructions value: -5.05 -1.55
95% mean confidence interval for instructions %-change: -6.06% -3.54%
Instructions are helped.

total tuples in shared programs: 73769 -> 73740 (-0.04%)
tuples in affected programs: 1611 -> 1582 (-1.80%)
helped: 17
HURT: 0
helped stats (abs) min: 1.0 max: 9.0 x̄: 1.71 x̃: 1
helped stats (rel) min: 0.58% max: 16.67% x̄: 4.80% x̃: 3.33%
95% mean confidence interval for tuples value: -2.70 -0.71
95% mean confidence interval for tuples %-change: -7.06% -2.54%
Tuples are helped.

total clauses in shared programs: 15997 -> 15993 (-0.03%)
clauses in affected programs: 27 -> 23 (-14.81%)
helped: 4
HURT: 0
helped stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1
helped stats (rel) min: 7.69% max: 25.00% x̄: 18.17% x̃: 20.00%
95% mean confidence interval for clauses value: -1.00 -1.00
95% mean confidence interval for clauses %-change: -29.91% -6.44%
Clauses are helped.
total cycles in shared programs: 7623.13 -> 7622.13 (-0.01%)
cycles in affected programs: 64.83 -> 63.83 (-1.54%)
helped: 13
HURT: 0
helped stats (abs) min: 0.0416660000000002 max: 0.375 x̄: 0.08 x̃: 0
helped stats (rel) min: 1.02% max: 5.56% x̄: 2.82% x̃: 2.50%
95% mean confidence interval for cycles value: -0.13 -0.02
95% mean confidence interval for cycles %-change: -3.79% -1.85%
Cycles are helped.

total arith in shared programs: 2763.75 -> 2762.46 (-0.05%)
arith in affected programs: 67.17 -> 65.88 (-1.92%)
helped: 18
HURT: 0
helped stats (abs) min: 0.0416660000000002 max: 0.375 x̄: 0.07 x̃: 0
helped stats (rel) min: 1.02% max: 22.22% x̄: 5.68% x̃: 3.16%
95% mean confidence interval for arith value: -0.11 -0.03
95% mean confidence interval for arith %-change: -8.56% -2.80%
Arith are helped.

total quadwords in shared programs: 68173 -> 68155 (-0.03%)
quadwords in affected programs: 1258 -> 1240 (-1.43%)
helped: 14
HURT: 0
helped stats (abs) min: 1.0 max: 3.0 x̄: 1.29 x̃: 1
helped stats (rel) min: 0.42% max: 8.70% x̄: 3.88% x̃: 3.67%
95% mean confidence interval for quadwords value: -1.64 -0.93
95% mean confidence interval for quadwords %-change: -5.27% -2.49%
Quadwords are helped.
Signed-off-by: Alyssa Rosenzweig
Part-of:
---
Review note: the hunks below were edited after the original commit (bitset
allocation sized in bits and NULL-checked, comments expanded), so the index
hashes no longer match the post-image.

 src/panfrost/bifrost/bi_lower_swizzle.c | 141 ++++++++++++++++++++++++
 src/panfrost/bifrost/compiler.h         |  23 ++++
 2 files changed, 164 insertions(+)

diff --git a/src/panfrost/bifrost/bi_lower_swizzle.c b/src/panfrost/bifrost/bi_lower_swizzle.c
index 32517ad93de..2d79fcf83ca 100644
--- a/src/panfrost/bifrost/bi_lower_swizzle.c
+++ b/src/panfrost/bifrost/bi_lower_swizzle.c
@@ -137,6 +137,119 @@ bi_lower_swizzle_16(bi_context *ctx, bi_instr *ins, unsigned src)
         ins->src[src].swizzle = BI_SWIZZLE_H01;
 }
 
+/* Returns true for the swizzles that replicate a single 8-bit lane
+ * (B0000, B1111, B2222, B3333).
+ */
+static bool
+bi_swizzle_replicates_8(enum bi_swizzle swz)
+{
+        switch (swz) {
+        case BI_SWIZZLE_B0000:
+        case BI_SWIZZLE_B1111:
+        case BI_SWIZZLE_B2222:
+        case BI_SWIZZLE_B3333:
+                return true;
+        default:
+                return false;
+        }
+}
+
+/* Returns true for the swizzles that replicate a single 16-bit lane (H00,
+ * H11), as well as for every swizzle accepted by bi_swizzle_replicates_8.
+ */
+static bool
+bi_swizzle_replicates_16(enum bi_swizzle swz)
+{
+        switch (swz) {
+        case BI_SWIZZLE_H00:
+        case BI_SWIZZLE_H11:
+                return true;
+        default:
+                /* If a swizzle replicates every 8-bits, it also replicates
+                 * every 16-bits, so allow 8-bit replicating swizzles.
+                 */
+                return bi_swizzle_replicates_8(swz);
+        }
+}
+
+/* Conservatively determines whether the result of I is replicated, i.e.
+ * whether its bottom and top 16-bit halves are known to be equal.
+ *
+ * replicates_16 is a bitset indexed by bi_word_node() that records the SSA
+ * words already found to be replicated. Returns false whenever replication
+ * cannot be established.
+ */
+static bool
+bi_instr_replicates(bi_instr *I, BITSET_WORD *replicates_16)
+{
+        switch (I->op) {
+
+        /* Instructions that construct vectors have replicated output if their
+         * sources are identical. Check this case first.
+         */
+        case BI_OPCODE_MKVEC_V2I16:
+        case BI_OPCODE_V2F16_TO_V2S16:
+        case BI_OPCODE_V2F16_TO_V2U16:
+        case BI_OPCODE_V2F32_TO_V2F16:
+        case BI_OPCODE_V2S16_TO_V2F16:
+        case BI_OPCODE_V2S8_TO_V2F16:
+        case BI_OPCODE_V2S8_TO_V2S16:
+        case BI_OPCODE_V2U16_TO_V2F16:
+        case BI_OPCODE_V2U8_TO_V2F16:
+        case BI_OPCODE_V2U8_TO_V2U16:
+                return bi_is_value_equiv(I->src[0], I->src[1]);
+
+        /* 16-bit transcendentals are defined to output zero in their
+         * upper half, so they do not replicate
+         */
+        case BI_OPCODE_FRCP_F16:
+        case BI_OPCODE_FRSQ_F16:
+                return false;
+
+        /* Not sure, be conservative, we don't use these.. */
+        case BI_OPCODE_VN_ASST1_F16:
+        case BI_OPCODE_FPCLASS_F16:
+        case BI_OPCODE_FPOW_SC_DET_F16:
+                return false;
+
+        default:
+                break;
+        }
+
+        /* Replication analysis only makes sense for ALU instructions */
+        if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_NONE)
+                return false;
+
+        /* We only analyze 16-bit instructions for 16-bit replication. We could
+         * maybe do better.
+         */
+        if (bi_opcode_props[I->op].size != BI_SIZE_16)
+                return false;
+
+        bi_foreach_src(I, s) {
+                if (bi_is_null(I->src[s]))
+                        continue;
+
+                /* Replicated swizzles */
+                if (bi_swizzle_replicates_16(I->src[s].swizzle))
+                        continue;
+
+                /* Replicated values */
+                if (bi_is_ssa(I->src[s]) &&
+                    BITSET_TEST(replicates_16, bi_word_node(I->src[s])))
+                        continue;
+
+                /* Replicated constants */
+                if (I->src[s].type == BI_INDEX_CONSTANT &&
+                    (I->src[s].value & 0xFFFF) == (I->src[s].value >> 16))
+                        continue;
+
+                return false;
+        }
+
+        return true;
+}
+
 void
 bi_lower_swizzle(bi_context *ctx)
 {
@@ -146,4 +259,32 @@ bi_lower_swizzle(bi_context *ctx)
                                 bi_lower_swizzle_16(ctx, ins, s);
                 }
         }
+
+        /* Now that we've lowered swizzles, clean up the mess: track which SSA
+         * words are replicated (bitset indexed by bi_word_node) and turn
+         * swizzles of replicated values into plain moves.
+         */
+        size_t nr_nodes = (ctx->ssa_alloc + 1) << 2;
+        BITSET_WORD *replicates_16 =
+                calloc(BITSET_WORDS(nr_nodes), sizeof(BITSET_WORD));
+
+        /* This cleanup is purely an optimization, so skip it on allocation
+         * failure instead of dereferencing NULL.
+         */
+        if (!replicates_16)
+                return;
+
+        bi_foreach_instr_global(ctx, ins) {
+                if (bi_is_ssa(ins->dest[0]) && bi_instr_replicates(ins, replicates_16))
+                        BITSET_SET(replicates_16, bi_word_node(ins->dest[0]));
+
+                /* Swizzling a replicated value is a no-op */
+                if (ins->op == BI_OPCODE_SWZ_V2I16 && bi_is_ssa(ins->src[0]) &&
+                    BITSET_TEST(replicates_16, bi_word_node(ins->src[0]))) {
+                        ins->op = BI_OPCODE_MOV_I32;
+                        ins->src[0].swizzle = BI_SWIZZLE_H01;
+                }
+        }
+
+        free(replicates_16);
 }
diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h
index 5e767f3d2a7..72d21da8ee4 100644
--- a/src/panfrost/bifrost/compiler.h
+++ b/src/panfrost/bifrost/compiler.h
@@ -355,5 +355,28 @@ bi_is_word_equiv(bi_index left, bi_index right)
         return bi_is_equiv(left, right) && left.offset == right.offset;
 }
 
+/* An even stronger equivalence that checks if two indices evaluate to the
+ * same value. Constants are compared after applying their swizzles; all other
+ * indices must agree on value, abs/neg, swizzle, offset, reg and type.
+ */
+static inline bool
+bi_is_value_equiv(bi_index left, bi_index right)
+{
+        if (left.type == BI_INDEX_CONSTANT && right.type == BI_INDEX_CONSTANT) {
+                return (bi_apply_swizzle(left.value, left.swizzle) ==
+                        bi_apply_swizzle(right.value, right.swizzle)) &&
+                       (left.abs == right.abs) &&
+                       (left.neg == right.neg);
+        } else {
+                return (left.value == right.value) &&
+                       (left.abs == right.abs) &&
+                       (left.neg == right.neg) &&
+                       (left.swizzle == right.swizzle) &&
+                       (left.offset == right.offset) &&
+                       (left.reg == right.reg) &&
+                       (left.type == right.type);
+        }
+}
+
 #define BI_MAX_DESTS 2
 #define BI_MAX_SRCS 5