pan/bi: Optimize replication

Bifrost's 16-bit support comes in the form of vectorized instructions,
so when we manipulate scalars, we usually replicate to both bottom and
top halves of 32-bit registers. Add an analysis pass that detects
replication. Then, use that replication pass to optimize out useless
swizzle instructions (by changing them to plain moves, which copy
propagation can then eliminate).

This optimization is a slight shader-db win on its own, and allows us to
transition to lower_bool_to_bitsize without regressing shader-db.

total instructions in shared programs: 90323 -> 90257 (-0.07%)
instructions in affected programs: 2513 -> 2447 (-2.63%)
helped: 20
HURT: 0
helped stats (abs) min: 1.0 max: 16.0 x̄: 3.30 x̃: 2
helped stats (rel) min: 1.25% max: 11.11% x̄: 4.80% x̃: 4.29%
95% mean confidence interval for instructions value: -5.05 -1.55
95% mean confidence interval for instructions %-change: -6.06% -3.54%
Instructions are helped.

total tuples in shared programs: 73769 -> 73740 (-0.04%)
tuples in affected programs: 1611 -> 1582 (-1.80%)
helped: 17
HURT: 0
helped stats (abs) min: 1.0 max: 9.0 x̄: 1.71 x̃: 1
helped stats (rel) min: 0.58% max: 16.67% x̄: 4.80% x̃: 3.33%
95% mean confidence interval for tuples value: -2.70 -0.71
95% mean confidence interval for tuples %-change: -7.06% -2.54%
Tuples are helped.

total clauses in shared programs: 15997 -> 15993 (-0.03%)
clauses in affected programs: 27 -> 23 (-14.81%)
helped: 4
HURT: 0
helped stats (abs) min: 1.0 max: 1.0 x̄: 1.00 x̃: 1
helped stats (rel) min: 7.69% max: 25.00% x̄: 18.17% x̃: 20.00%
95% mean confidence interval for clauses value: -1.00 -1.00
95% mean confidence interval for clauses %-change: -29.91% -6.44%
Clauses are helped.

total cycles in shared programs: 7623.13 -> 7622.13 (-0.01%)
cycles in affected programs: 64.83 -> 63.83 (-1.54%)
helped: 13
HURT: 0
helped stats (abs) min: 0.0416660000000002 max: 0.375 x̄: 0.08 x̃: 0
helped stats (rel) min: 1.02% max: 5.56% x̄: 2.82% x̃: 2.50%
95% mean confidence interval for cycles value: -0.13 -0.02
95% mean confidence interval for cycles %-change: -3.79% -1.85%
Cycles are helped.

total arith in shared programs: 2763.75 -> 2762.46 (-0.05%)
arith in affected programs: 67.17 -> 65.88 (-1.92%)
helped: 18
HURT: 0
helped stats (abs) min: 0.0416660000000002 max: 0.375 x̄: 0.07 x̃: 0
helped stats (rel) min: 1.02% max: 22.22% x̄: 5.68% x̃: 3.16%
95% mean confidence interval for arith value: -0.11 -0.03
95% mean confidence interval for arith %-change: -8.56% -2.80%
Arith are helped.

total quadwords in shared programs: 68173 -> 68155 (-0.03%)
quadwords in affected programs: 1258 -> 1240 (-1.43%)
helped: 14
HURT: 0
helped stats (abs) min: 1.0 max: 3.0 x̄: 1.29 x̃: 1
helped stats (rel) min: 0.42% max: 8.70% x̄: 3.88% x̃: 3.67%
95% mean confidence interval for quadwords value: -1.64 -0.93
95% mean confidence interval for quadwords %-change: -5.27% -2.49%
Quadwords are helped.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14576>
This commit is contained in:
Alyssa Rosenzweig 2022-01-15 12:26:42 -05:00 committed by Marge Bot
parent 35ff537814
commit f7d44a46cd
2 changed files with 138 additions and 0 deletions

View File

@ -137,6 +137,106 @@ bi_lower_swizzle_16(bi_context *ctx, bi_instr *ins, unsigned src)
ins->src[src].swizzle = BI_SWIZZLE_H01;
}
/* Whether the swizzle is one of the four-byte broadcast forms
 * (B0000, B1111, B2222 or B3333). Every other swizzle yields false.
 */
static bool
bi_swizzle_replicates_8(enum bi_swizzle swz)
{
        return swz == BI_SWIZZLE_B0000 ||
               swz == BI_SWIZZLE_B1111 ||
               swz == BI_SWIZZLE_B2222 ||
               swz == BI_SWIZZLE_B3333;
}
/* Whether the swizzle yields identical 16-bit halves: either one of the
 * half-word broadcasts (H00, H11) or any swizzle accepted by
 * bi_swizzle_replicates_8().
 */
static bool
bi_swizzle_replicates_16(enum bi_swizzle swz)
{
        if (swz == BI_SWIZZLE_H00 || swz == BI_SWIZZLE_H11)
                return true;

        /* A swizzle that repeats a single byte in every lane necessarily
         * repeats the same 16-bit pattern too, so byte broadcasts qualify.
         */
        return bi_swizzle_replicates_8(swz);
}
/* Decide whether instruction I is known to write a value whose 16-bit halves
 * are replicated.
 *
 * replicates_16 is a bitset indexed by bi_word_node(); a set bit marks an SSA
 * word that an earlier instruction was already found to replicate. It is only
 * read here.
 *
 * The answer is conservative: false means "not proven", not "does not
 * replicate".
 */
static bool
bi_instr_replicates(bi_instr *I, BITSET_WORD *replicates_16)
{
        switch (I->op) {
        /* Vector-building instructions: the result is replicated exactly
         * when both inputs evaluate to the same value. Handled before the
         * generic analysis below.
         */
        case BI_OPCODE_MKVEC_V2I16:
        case BI_OPCODE_V2F16_TO_V2S16:
        case BI_OPCODE_V2F16_TO_V2U16:
        case BI_OPCODE_V2F32_TO_V2F16:
        case BI_OPCODE_V2S16_TO_V2F16:
        case BI_OPCODE_V2S8_TO_V2F16:
        case BI_OPCODE_V2S8_TO_V2S16:
        case BI_OPCODE_V2U16_TO_V2F16:
        case BI_OPCODE_V2U8_TO_V2F16:
        case BI_OPCODE_V2U8_TO_V2U16:
                return bi_is_value_equiv(I->src[0], I->src[1]);

        /* 16-bit transcendentals write zero to the upper half by
         * definition, so their result is never replicated.
         */
        case BI_OPCODE_FRCP_F16:
        case BI_OPCODE_FRSQ_F16:
        /* Behaviour unknown and these are unused; stay conservative. */
        case BI_OPCODE_VN_ASST1_F16:
        case BI_OPCODE_FPCLASS_F16:
        case BI_OPCODE_FPOW_SC_DET_F16:
                return false;

        default:
                break;
        }

        /* Only plain ALU instructions (no message) operating at 16-bit size
         * are analyzed; anything else is treated as not replicating. Wider
         * instructions could perhaps be handled too.
         */
        if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_NONE ||
            bi_opcode_props[I->op].size != BI_SIZE_16)
                return false;

        /* Every present source has to be replicated for the result to be. */
        bi_foreach_src(I, s) {
                bi_index idx = I->src[s];

                bool source_ok =
                        /* Absent source: nothing to check */
                        bi_is_null(idx) ||
                        /* The swizzle itself broadcasts */
                        bi_swizzle_replicates_16(idx.swizzle) ||
                        /* SSA value already known to replicate */
                        (bi_is_ssa(idx) &&
                         BITSET_TEST(replicates_16, bi_word_node(idx))) ||
                        /* Constant whose low and high halves match */
                        (idx.type == BI_INDEX_CONSTANT &&
                         (idx.value & 0xFFFF) == (idx.value >> 16));

                if (!source_ok)
                        return false;
        }

        return true;
}
void
bi_lower_swizzle(bi_context *ctx)
{
@ -146,4 +246,20 @@ bi_lower_swizzle(bi_context *ctx)
bi_lower_swizzle_16(ctx, ins, s);
}
}
/* Now that we've lowered swizzles, clean up the mess */
BITSET_WORD *replicates_16 = calloc(sizeof(bi_index), ((ctx->ssa_alloc + 1) << 2));
bi_foreach_instr_global(ctx, ins) {
if (bi_is_ssa(ins->dest[0]) && bi_instr_replicates(ins, replicates_16))
BITSET_SET(replicates_16, bi_word_node(ins->dest[0]));
if (ins->op == BI_OPCODE_SWZ_V2I16 && bi_is_ssa(ins->src[0]) &&
BITSET_TEST(replicates_16, bi_word_node(ins->src[0]))) {
ins->op = BI_OPCODE_MOV_I32;
ins->src[0].swizzle = BI_SWIZZLE_H01;
}
}
free(replicates_16);
}

View File

@ -355,6 +355,28 @@ bi_is_word_equiv(bi_index left, bi_index right)
return bi_is_equiv(left, right) && left.offset == right.offset;
}
/* Strictest equivalence test: do two indices denote the same value once
 * evaluated? Two constants are compared after applying their swizzles (plus
 * the abs/neg modifiers); any other pair must match on every field compared
 * below.
 */
static inline bool
bi_is_value_equiv(bi_index left, bi_index right)
{
        if (left.type == BI_INDEX_CONSTANT && right.type == BI_INDEX_CONSTANT) {
                /* Different raw value/swizzle pairs may still produce the
                 * same bits, so compare the swizzled results.
                 */
                bool same_bits =
                        bi_apply_swizzle(left.value, left.swizzle) ==
                        bi_apply_swizzle(right.value, right.swizzle);

                return same_bits &&
                       left.abs == right.abs &&
                       left.neg == right.neg;
        }

        if (left.value != right.value)
                return false;

        if (left.abs != right.abs || left.neg != right.neg)
                return false;

        if (left.swizzle != right.swizzle || left.offset != right.offset)
                return false;

        return left.reg == right.reg && left.type == right.type;
}
/* NOTE(review): presumably the capacities of the per-instruction dest[] and
 * src[] arrays; the instruction struct is not visible here -- confirm.
 */
#define BI_MAX_DESTS 2
#define BI_MAX_SRCS 5