broadcom/compiler: add a constant alu optimization pass

Currently this is useful to clean up after DCEing leading ldunifa
instructions, but it can be expanded to handle more cases, which
may allow us to simplify the compiler code in places where we have
been trying to optimize manually for similar cases.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9128>
This commit is contained in:
Iago Toral Quiroga 2021-02-17 11:17:25 +01:00 committed by Marge Bot
parent 89de085055
commit e1cf2406da
5 changed files with 149 additions and 4 deletions

View File

@ -23,6 +23,7 @@ libbroadcom_compiler_files = files(
'vir.c',
'vir_dump.c',
'vir_live_variables.c',
'vir_opt_constant_alu.c',
'vir_opt_copy_propagate.c',
'vir_opt_dead_code.c',
'vir_opt_redundant_flags.c',

View File

@ -988,6 +988,7 @@ bool vir_opt_peephole_sf(struct v3d_compile *c);
bool vir_opt_redundant_flags(struct v3d_compile *c);
bool vir_opt_small_immediates(struct v3d_compile *c);
bool vir_opt_vpm(struct v3d_compile *c);
bool vir_opt_constant_alu(struct v3d_compile *c);
void v3d_nir_lower_blend(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c);
void v3d_nir_lower_line_smooth(nir_shader *shader);

View File

@ -1497,6 +1497,7 @@ vir_optimize(struct v3d_compile *c)
OPTPASS(vir_opt_redundant_flags);
OPTPASS(vir_opt_dead_code);
OPTPASS(vir_opt_small_immediates);
OPTPASS(vir_opt_constant_alu);
if (!progress)
break;

View File

@ -0,0 +1,146 @@
/*
* Copyright © 2021 Raspberry Pi
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/**
* @file vir_opt_constant_alu.c
*
* Identifies sequences of ALU instructions that operate on constant operands
* and reduces them to a uniform load.
*
* Currently, this is useful to optimize the result of removing leading ldunifa
* instructions in the DCE pass, which can leave a series of constant additions
* that increment the unifa address by 4 for each leading ldunifa removed. It
* helps turn this:
*
* nop t1; ldunif (0x00000004 / 0.000000)
* nop t2; ldunif (0x00000004 / 0.000000)
* add t3, t1, t2
*
* into:
*
* nop t1; ldunif (0x00000004 / 0.000000)
* nop t2; ldunif (0x00000004 / 0.000000)
* nop t4; ldunif (0x00000008 / 0.000000)
* mov t3, t4
*
* For best results we want to run copy propagation in between this and
* the combine constants pass: every time we manage to convert an alu to
* a uniform load, we move the uniform to the original alu destination. By
* running copy propagation immediately after we can reuse the uniform as
* source in more follow-up alu instructions, making them constant and allowing
* this pass to continue making progress. However, if we run the small
* immediates optimization before that, that pass can convert some of the movs
* to use small immediates instead of the uniforms and prevent us from making
* the best of this pass, as small immediates don't get copy propagated.
*/
#include "v3d_compiler.h"
/**
 * Folds an add-unit ALU instruction whose sources are all known constants
 * into a constant uniform load followed by a MOV to the original destination.
 *
 * @param c      Compile context. On success its cursor is left positioned
 *               after the original instruction's slot and new instructions
 *               are emitted there.
 * @param inst   ALU instruction to fold. It is removed on success.
 * @param values Constant value of each source of 'inst', indexed by source.
 *
 * @return true if 'inst' was replaced; false if its add operation is not
 *         handled, in which case nothing is modified.
 */
static bool
opt_constant_add(struct v3d_compile *c, struct qinst *inst, uint32_t *values)
{
        /* FIXME: handle more add operations */
        struct qreg unif = { };
        switch (inst->qpu.alu.add.op) {
        case V3D_QPU_A_ADD:
                c->cursor = vir_after_inst(inst);
                /* uint32_t arithmetic: the sum wraps modulo 2^32. */
                unif = vir_uniform_ui(c, values[0] + values[1]);
                break;
        default:
                return false;
        }

        /* Remove the original ALU instruction and replace it with a uniform
         * load. If the original instruction carried an implicit uniform we
         * need to replicate that in the new instruction, otherwise it would
         * be lost together with the removed instruction.
         */
        struct qreg dst = inst->dst;
        struct qinst *mov = vir_MOV_dest(c, dst, unif);
        mov->uniform = inst->uniform;
        vir_remove_instruction(c, inst);

        /* Keep the temp definition table pointing at the instruction that
         * now writes the destination.
         */
        if (dst.file == QFILE_TEMP)
                c->defs[dst.index] = mov;
        return true;
}
/**
 * Tries to resolve source 'idx' of 'inst' to a constant value.
 *
 * A source resolves when it is a small immediate that unpacks successfully,
 * or a temp whose recorded definition is a ldunif/ldunifrf of a
 * QUNIFORM_CONSTANT uniform.
 *
 * @return true and stores the constant in 'value' on success, false
 *         otherwise.
 */
static bool
constant_alu_src_value(struct v3d_compile *c, struct qinst *inst, int idx,
                       uint32_t *value)
{
        if (inst->src[idx].file == QFILE_SMALL_IMM &&
            v3d_qpu_small_imm_unpack(c->devinfo, inst->qpu.raddr_b, value)) {
                return true;
        }

        if (inst->src[idx].file != QFILE_TEMP)
                return false;

        /* Without a recorded definition for the temp we cannot know what
         * it holds.
         */
        struct qinst *def = c->defs[inst->src[idx].index];
        if (!def)
                return false;

        if (!(def->qpu.sig.ldunif || def->qpu.sig.ldunifrf))
                return false;

        if (c->uniform_contents[def->uniform] != QUNIFORM_CONSTANT)
                return false;

        *value = c->uniform_data[def->uniform];
        return true;
}

/**
 * Attempts to constant-fold a single instruction.
 *
 * Only ALU instructions with no output packing and no ac/mc condition are
 * considered, and every source must resolve to a constant.
 *
 * @return true if the instruction was replaced, false if it was left alone.
 */
static bool
try_opt_constant_alu(struct v3d_compile *c, struct qinst *inst)
{
        if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        /* Skip anything that does more than write the plain result to the
         * destination: output packing or a condition on either ALU.
         */
        const bool packs_output =
                inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
                inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE;
        if (packs_output)
                return false;

        const bool has_cond =
                inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
                inst->qpu.flags.mc != V3D_QPU_COND_NONE;
        if (has_cond)
                return false;

        assert(vir_get_nsrc(inst) <= 2);
        uint32_t values[2];
        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                if (!constant_alu_src_value(c, inst, i, &values[i]))
                        return false;
        }

        /* FIXME: handle mul operations */
        if (!vir_is_add(inst))
                return false;

        return opt_constant_add(c, inst, values);
}
bool
vir_opt_constant_alu(struct v3d_compile *c)
{
bool progress = false;
vir_for_each_block(block, c) {
vir_for_each_inst_safe(inst, block) {
progress = try_opt_constant_alu(c, inst) || progress;
}
}
return progress;
}

View File

@ -160,10 +160,6 @@ increment_unifa_address(struct v3d_compile *c, struct qinst *unifa)
return true;
}
/* FIXME: we can optimize this further by implementing a constant
* ALU pass in the backend, for the case where we are skipping
* multiple leading ldunifa.
*/
if (unifa->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
unifa->qpu.alu.add.op == V3D_QPU_A_ADD) {
c->cursor = vir_after_inst(unifa);