mesa/src/panfrost/bifrost/bi_opt_mod_props.c

/*
 * Copyright (C) 2021 Collabora, Ltd.
 * Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compiler.h"
#include "bi_builder.h"

/*
 * Due to a Bifrost encoding restriction, some instructions cannot have an abs
 * modifier on both sources. Check if adding a fabs modifier to a given source
 * of a binary instruction would cause this restriction to be hit.
 */
static bool
bi_would_impact_abs(unsigned arch, bi_instr *I, bi_index repl, unsigned s)
{
   return (arch <= 8) && I->src[1 - s].abs &&
          bi_is_word_equiv(I->src[1 - s], repl);
}

static bool
bi_takes_fabs(unsigned arch, bi_instr *I, bi_index repl, unsigned s)
{
   switch (I->op) {
   case BI_OPCODE_FCMP_V2F16:
   case BI_OPCODE_FMAX_V2F16:
   case BI_OPCODE_FMIN_V2F16:
      return !bi_would_impact_abs(arch, I, repl, s);
   case BI_OPCODE_FADD_V2F16:
      /*
       * For FADD.v2f16, the FMA pipe has the abs encoding hazard,
       * while the FADD pipe cannot encode a clamp. Either case in
       * isolation can be worked around in the scheduler, but both
       * together cannot be encoded. Avoid the hazard.
       */
      return !(I->clamp && bi_would_impact_abs(arch, I, repl, s));
   case BI_OPCODE_V2F32_TO_V2F16:
      /* TODO: Needs both match or lower */
      return false;
   case BI_OPCODE_FLOG_TABLE_F32:
      /* TODO: Need to check mode */
      return false;
   default:
      return bi_opcode_props[I->op].abs & BITFIELD_BIT(s);
   }
}

static bool
bi_takes_fneg(unsigned arch, bi_instr *I, unsigned s)
{
   switch (I->op) {
   case BI_OPCODE_CUBE_SSEL:
   case BI_OPCODE_CUBE_TSEL:
   case BI_OPCODE_CUBEFACE:
      /* TODO: Bifrost encoding restriction: need to match or lower */
      return arch >= 9;
   case BI_OPCODE_FREXPE_F32:
   case BI_OPCODE_FREXPE_V2F16:
   case BI_OPCODE_FLOG_TABLE_F32:
      /* TODO: Need to check mode */
      return false;
   default:
      return bi_opcode_props[I->op].neg & BITFIELD_BIT(s);
   }
}

static bool
bi_is_fabsneg(enum bi_opcode op, enum bi_size size)
{
   return (size == BI_SIZE_32 && op == BI_OPCODE_FABSNEG_F32) ||
          (size == BI_SIZE_16 && op == BI_OPCODE_FABSNEG_V2F16);
}
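
/*
 * Compose two 16-bit swizzles: for each output half, the outer swizzle (a)
 * picks a half of the intermediate value, and the inner swizzle (b) says
 * which source half that intermediate half came from, i.e. result[i] =
 * b[a[i]]. For example, composing a = H11 with b = H10 yields H00: the
 * outer swizzle reads only the high half, and the inner swizzle sources
 * that high half from h0.
 */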
static enum bi_swizzle
bi_compose_swizzle_16(enum bi_swizzle a, enum bi_swizzle b)
{
   assert(a <= BI_SWIZZLE_H11);
   assert(b <= BI_SWIZZLE_H11);

   bool al = (a & BI_SWIZZLE_H10);
   bool ar = (a & BI_SWIZZLE_H01);
   bool bl = (b & BI_SWIZZLE_H10);
   bool br = (b & BI_SWIZZLE_H01);

   return ((al ? br : bl) ? BI_SWIZZLE_H10 : 0) |
          ((ar ? br : bl) ? BI_SWIZZLE_H01 : 0);
}

/* Like bi_replace_index, but composes instead of overwrites */
static inline bi_index
bi_compose_float_index(bi_index old, bi_index repl)
{
   /* abs(-x) = abs(+x), so ignore repl.neg if old.abs is set; otherwise
    * -(-x) = x but -(+x) = +(-x), so exclusive-or the negates */
   repl.neg = old.neg ^ (repl.neg && !old.abs);

   /* +/- abs(+/- abs(x)) = +/- abs(x), etc, so just OR the two */
   repl.abs |= old.abs;

   /* Use the old swizzle to select from the replacement swizzle */
   repl.swizzle = bi_compose_swizzle_16(old.swizzle, repl.swizzle);

   return repl;
}

/* DISCARD.b32(FCMP.f(x, y)) --> DISCARD.f(x, y) */
static inline void
bi_fuse_discard_fcmp(bi_instr *I, bi_instr *mod, unsigned arch)
{
   if (I->op != BI_OPCODE_DISCARD_B32) return;
   if (mod->op != BI_OPCODE_FCMP_F32 && mod->op != BI_OPCODE_FCMP_V2F16) return;
   if (mod->cmpf >= BI_CMPF_GTLT) return;

   /* .abs and .neg modifiers allowed on Valhall DISCARD but not Bifrost */
   bool absneg = mod->src[0].neg || mod->src[0].abs;
   absneg |= mod->src[1].neg || mod->src[1].abs;

   if (arch <= 8 && absneg) return;

   enum bi_swizzle r = I->src[0].swizzle;

   /* result_type doesn't matter */
   I->op = BI_OPCODE_DISCARD_F32;
   I->cmpf = mod->cmpf;
   I->src[0] = mod->src[0];
   I->src[1] = mod->src[1];

   if (mod->op == BI_OPCODE_FCMP_V2F16) {
      I->src[0].swizzle = bi_compose_swizzle_16(r, I->src[0].swizzle);
      I->src[1].swizzle = bi_compose_swizzle_16(r, I->src[1].swizzle);
   }
}
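
/*
 * Forward-propagate modifiers: for each source that reads an SSA value
 * produced by a FABSNEG pseudo-op of matching size (or an FCMP feeding a
 * DISCARD), fold the abs/neg modifiers (or the comparison) directly into
 * the use when the consuming instruction can encode them. Schematically:
 *
 *    y = FABSNEG.f32 -abs(x)
 *    z = FADD.f32 y, w        -->  z = FADD.f32 -abs(x), w
 *
 * The FABSNEG itself is left for dead code elimination to remove.
 */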
void
bi_opt_mod_prop_forward(bi_context *ctx)
{
   bi_instr **lut = calloc(sizeof(bi_instr *), ctx->ssa_alloc);

   bi_foreach_instr_global_safe(ctx, I) {
      if (bi_is_ssa(I->dest[0]))
         lut[I->dest[0].value] = I;

      bi_foreach_src(I, s) {
         if (!bi_is_ssa(I->src[s]))
            continue;

         bi_instr *mod = lut[I->src[s].value];

         if (!mod)
            continue;

         unsigned size = bi_opcode_props[I->op].size;

         bi_fuse_discard_fcmp(I, mod, ctx->arch);

         if (bi_is_fabsneg(mod->op, size)) {
            if (mod->src[0].abs && !bi_takes_fabs(ctx->arch, I, mod->src[0], s))
               continue;

            if (mod->src[0].neg && !bi_takes_fneg(ctx->arch, I, s))
               continue;

            I->src[s] = bi_compose_float_index(I->src[s], mod->src[0]);
         }
      }
   }

   free(lut);
}

/*
 * RSCALE ops have restrictions on how the clamp may be used; they are only
 * used for specialized transcendental sequences that set the clamp
 * explicitly anyway.
 */
static bool
bi_takes_clamp(bi_instr *I)
{
   switch (I->op) {
   case BI_OPCODE_FMA_RSCALE_F32:
   case BI_OPCODE_FMA_RSCALE_V2F16:
   case BI_OPCODE_FADD_RSCALE_F32:
      return false;
   case BI_OPCODE_FADD_V2F16:
      /* Encoding restriction */
      return !(I->src[0].abs && I->src[1].abs &&
               bi_is_word_equiv(I->src[0], I->src[1]));
   default:
      return bi_opcode_props[I->op].clamp;
   }
}

static bool
bi_is_fclamp(enum bi_opcode op, enum bi_size size)
{
   return (size == BI_SIZE_32 && op == BI_OPCODE_FCLAMP_F32) ||
          (size == BI_SIZE_16 && op == BI_OPCODE_FCLAMP_V2F16);
}
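
/*
 * Fuse a single-use FCLAMP pseudo-op back into the instruction that
 * produced its source, when that instruction can encode a clamp.
 * Schematically:
 *
 *    y = FADD.f32 a, b
 *    z = FCLAMP.f32.clamp_0_1 y   -->  z = FADD.f32.clamp_0_1 a, b
 */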
static bool
bi_optimizer_clamp(bi_instr *I, bi_instr *use)
{
   if (!bi_is_fclamp(use->op, bi_opcode_props[I->op].size)) return false;
   if (!bi_takes_clamp(I)) return false;

   /* Clamps are bitfields (clamp_m1_1/clamp_0_inf) so composition is OR */
   I->clamp |= use->clamp;
   I->dest[0] = use->dest[0];
   return true;
}

static enum bi_opcode
bi_sized_mux_op(unsigned size)
{
   switch (size) {
   case 8:  return BI_OPCODE_MUX_V4I8;
   case 16: return BI_OPCODE_MUX_V2I16;
   case 32: return BI_OPCODE_MUX_I32;
   default: unreachable("invalid size");
   }
}
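
/*
 * Matches MUX.iN(#0, v1, x) of the given size: a select between 0 and the
 * fixed constant v1, controlled by x. This is the shape typically produced
 * when converting a comparison mask into a 0/1 (or 0.0/1.0) value, which a
 * CMP result type can express directly.
 */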
static bool
bi_is_fixed_mux(bi_instr *I, unsigned size, bi_index v1)
{
   return I->op == bi_sized_mux_op(size) &&
          bi_is_value_equiv(I->src[0], bi_zero()) &&
          bi_is_value_equiv(I->src[1], v1);
}

static bool
bi_takes_int_result_type(enum bi_opcode op)
{
   switch (op) {
   case BI_OPCODE_ICMP_I32:
   case BI_OPCODE_ICMP_S32:
   case BI_OPCODE_ICMP_U32:
   case BI_OPCODE_ICMP_V2I16:
   case BI_OPCODE_ICMP_V2S16:
   case BI_OPCODE_ICMP_V2U16:
   case BI_OPCODE_ICMP_V4I8:
   case BI_OPCODE_ICMP_V4S8:
   case BI_OPCODE_ICMP_V4U8:
   case BI_OPCODE_FCMP_F32:
   case BI_OPCODE_FCMP_V2F16:
      return true;
   default:
      return false;
   }
}

static bool
bi_takes_float_result_type(enum bi_opcode op)
{
   return (op == BI_OPCODE_FCMP_F32) ||
          (op == BI_OPCODE_FCMP_V2F16);
}
/* CMP+MUX -> CMP with result type */
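/*
 * For example (schematically):
 *
 *    m = FCMP.f32.lt x, y
 *    r = MUX.i32 #0x0, #0x3f800000, m   -->  r = FCMP.f32.lt.f1 x, y
 *
 * so the comparison itself produces 0.0/1.0 (or 0/1 in the integer case)
 * and the MUX is removed.
 */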
static bool
bi_optimizer_result_type(bi_instr *I, bi_instr *mux)
{
   if (bi_opcode_props[I->op].size != bi_opcode_props[mux->op].size)
      return false;

   if (bi_is_fixed_mux(mux, 32, bi_imm_f32(1.0)) ||
       bi_is_fixed_mux(mux, 16, bi_imm_f16(1.0))) {
      if (!bi_takes_float_result_type(I->op))
         return false;

      I->result_type = BI_RESULT_TYPE_F1;
   } else if (bi_is_fixed_mux(mux, 32, bi_imm_u32(1)) ||
              bi_is_fixed_mux(mux, 16, bi_imm_u16(1)) ||
              bi_is_fixed_mux(mux, 8, bi_imm_u8(1))) {
      if (!bi_takes_int_result_type(I->op))
         return false;

      I->result_type = BI_RESULT_TYPE_I1;
   } else {
      return false;
   }

   I->dest[0] = mux->dest[0];
   return true;
}
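
/*
 * A varying load whose result feeds a simple 2D texture fetch can be fused
 * into a single VAR_TEX instruction, provided the operands fit VAR_TEX's
 * narrow encoding: matching texture and sampler indices below 4, a varying
 * index below 8, and an f32 register format for the coordinates.
 */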
static bool
bi_is_var_tex(bi_instr *var, bi_instr *tex)
{
   return (var->op == BI_OPCODE_LD_VAR_IMM) &&
          (tex->op == BI_OPCODE_TEXS_2D_F16 ||
           tex->op == BI_OPCODE_TEXS_2D_F32) &&
          (var->register_format == BI_REGISTER_FORMAT_F32) &&
          ((var->sample == BI_SAMPLE_CENTER && var->update == BI_UPDATE_STORE) ||
           (var->sample == BI_SAMPLE_NONE && var->update == BI_UPDATE_RETRIEVE)) &&
          (tex->texture_index == tex->sampler_index) &&
          (tex->texture_index < 4) &&
          (var->index < 8);
}

static bool
bi_optimizer_var_tex(bi_context *ctx, bi_instr *var, bi_instr *tex)
{
   if (!bi_is_var_tex(var, tex)) return false;

   /* Construct the corresponding VAR_TEX instruction */
   bi_builder b = bi_init_builder(ctx, bi_after_instr(var));
   bi_instr *I = bi_var_tex_f32_to(&b, tex->dest[0], tex->lod_mode,
                                   var->sample, var->update,
                                   tex->texture_index, var->index);
   I->skip = tex->skip;

   if (tex->op == BI_OPCODE_TEXS_2D_F16)
      I->op = BI_OPCODE_VAR_TEX_F16;

   /* Dead code elimination will clean up for us */
   return true;
}
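
/*
 * Backward-propagate modifiers: walk the program in reverse, recording the
 * single use (if any) of each SSA value, and fold clamp pseudo-ops, 0/1
 * select MUXes, and LD_VAR_IMM/TEXS sequences back into the defining
 * instruction when the definition has exactly one use. When a fusion
 * succeeds, the consuming instruction is removed here; any now-dead
 * producers are left for DCE.
 */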
void
bi_opt_mod_prop_backward(bi_context *ctx)
{
   unsigned count = ctx->ssa_alloc;
   bi_instr **uses = calloc(count, sizeof(*uses));
   BITSET_WORD *multiple = calloc(BITSET_WORDS(count), sizeof(*multiple));

   bi_foreach_instr_global_rev(ctx, I) {
      bi_foreach_src(I, s) {
         if (bi_is_ssa(I->src[s])) {
            unsigned v = I->src[s].value;

            if (uses[v] && uses[v] != I)
               BITSET_SET(multiple, v);
            else
               uses[v] = I;
         }
      }

      if (!bi_is_ssa(I->dest[0]))
         continue;

      bi_instr *use = uses[I->dest[0].value];

      if (!use || BITSET_TEST(multiple, I->dest[0].value))
         continue;

      /* Destination has a single use; try to propagate */
      bool propagated =
         bi_optimizer_clamp(I, use) ||
         bi_optimizer_result_type(I, use);

      if (!propagated && I->op == BI_OPCODE_LD_VAR_IMM &&
          use->op == BI_OPCODE_SPLIT_I32) {
         /* Need to see through the split in a
          * ld_var_imm/split/var_tex sequence
          */
         assert(bi_is_ssa(use->dest[0]));
         bi_instr *tex = uses[use->dest[0].value];

         if (!tex || BITSET_TEST(multiple, use->dest[0].value))
            continue;

         use = tex;
         propagated = bi_optimizer_var_tex(ctx, I, use);
      }

      if (propagated) {
         bi_remove_instruction(use);
         continue;
      }
   }

   free(uses);
   free(multiple);
}
/** Lower pseudo instructions that exist to simplify the optimizer */
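/*
 * For example, FABSNEG and FCLAMP each lower to an FADD of the value with
 * -0.0 as the second source (an identity that preserves the abs/neg and
 * clamp modifiers already on the instruction), and DISCARD.b32 lowers to
 * DISCARD.f32 with a not-equal comparison against zero.
 */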
void
bi_lower_opt_instruction(bi_instr *I)
{
   switch (I->op) {
   case BI_OPCODE_FABSNEG_F32:
   case BI_OPCODE_FABSNEG_V2F16:
   case BI_OPCODE_FCLAMP_F32:
   case BI_OPCODE_FCLAMP_V2F16:
      I->op = (bi_opcode_props[I->op].size == BI_SIZE_32) ?
              BI_OPCODE_FADD_F32 : BI_OPCODE_FADD_V2F16;

      I->round = BI_ROUND_NONE;
      I->src[1] = bi_negzero();
      break;

   case BI_OPCODE_DISCARD_B32:
      I->op = BI_OPCODE_DISCARD_F32;
      I->src[1] = bi_imm_u32(0);
      I->cmpf = BI_CMPF_NE;
      break;

   default:
      break;
   }
}