/*
 * Copyright © 2021 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file v3d_opt_constant_alu.c
 *
 * Identifies sequences of ALU instructions that operate on constant operands
 * and reduces them to a uniform load.
 *
 * This is useful, for example, to optimize the result of removing leading
 * ldunifa instructions in the DCE pass, which can leave a series of constant
 * additions that increment the unifa address by 4 for each leading ldunifa
 * removed. It helps turn this:
 *
 * nop t1; ldunif (0x00000004 / 0.000000)
 * nop t2; ldunif (0x00000004 / 0.000000)
 * add t3, t1, t2
 *
 * into:
 *
 * nop t1; ldunif (0x00000004 / 0.000000)
 * nop t2; ldunif (0x00000004 / 0.000000)
 * nop t4; ldunif (0x00000008 / 0.000000)
 * mov t3, t4
 *
 * For best results we want to run copy propagation between this and the
 * combine constants pass: every time we manage to convert an ALU instruction
 * to a uniform load, we move the uniform to the original ALU destination. By
 * running copy propagation immediately after, we can reuse the uniform as a
 * source in more follow-up ALU instructions, making them constant and allowing
 * this pass to continue making progress. However, if we run the small
 * immediates optimization before that, that pass can convert some of the MOVs
 * to use small immediates instead of the uniforms and prevent us from making
 * the best of this pass, as small immediates don't get copy propagated.
 */

#include "v3d_compiler.h"
#include "util/half_float.h"
#include "util/u_math.h"

static bool
opt_constant_add(struct v3d_compile *c, struct qinst *inst, union fi *values)
{
        /* FIXME: handle more add operations */
        struct qreg unif = { };
        switch (inst->qpu.alu.add.op) {
        case V3D_QPU_A_ADD:
                c->cursor = vir_after_inst(inst);
                unif = vir_uniform_ui(c, values[0].ui + values[1].ui);
                break;
        case V3D_QPU_A_VFPACK: {
                assert(inst->qpu.alu.add.output_pack == V3D_QPU_PACK_NONE);
                /* Pack the two float operands as a pair of f16 values,
                 * with src1 in the high 16 bits and src0 in the low 16 bits.
                 */
                const uint32_t packed =
                        (((uint32_t)_mesa_float_to_half(values[1].f)) << 16) |
                        _mesa_float_to_half(values[0].f);
                c->cursor = vir_after_inst(inst);
                unif = vir_uniform_ui(c, packed);
                break;
        }
        default:
                return false;
        }

        /* Remove the original ALU instruction and replace it with a uniform
         * load. If the original instruction loaded an implicit uniform we
         * need to replicate that in the new instruction.
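         *
         * We also have to update c->defs for the destination temp so that
         * later lookups see the new MOV as its defining instruction.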
         */
        struct qreg dst = inst->dst;
        struct qinst *mov = vir_MOV_dest(c, dst, unif);
        mov->uniform = inst->uniform;
        vir_remove_instruction(c, inst);
        if (dst.file == QFILE_TEMP)
                c->defs[dst.index] = mov;

        return true;
}

static bool
try_opt_constant_alu(struct v3d_compile *c, struct qinst *inst)
{
        if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        /* If the instruction does anything other than writing the result
         * directly to the destination, skip.
         */
        if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
            inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
                return false;
        }

        if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
            inst->qpu.flags.mc != V3D_QPU_COND_NONE) {
                return false;
        }

        /* Gather the constant value of each source, either from a small
         * immediate or from a constant uniform load feeding the source temp.
         */
        assert(vir_get_nsrc(inst) <= 2);
        union fi values[2];
        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                if (inst->src[i].file == QFILE_SMALL_IMM &&
                    v3d_qpu_small_imm_unpack(c->devinfo,
                                             inst->qpu.raddr_b,
                                             &values[i].ui)) {
                        continue;
                }

                if (inst->src[i].file == QFILE_TEMP) {
                        struct qinst *def = c->defs[inst->src[i].index];
                        if (!def)
                                return false;
                        if ((def->qpu.sig.ldunif || def->qpu.sig.ldunifrf) &&
                            c->uniform_contents[def->uniform] ==
                            QUNIFORM_CONSTANT) {
                                values[i].ui = c->uniform_data[def->uniform];
                                continue;
                        }
                }

                return false;
        }

        /* FIXME: handle mul operations */
        if (vir_is_add(inst))
                return opt_constant_add(c, inst, values);

        return false;
}

bool
vir_opt_constant_alu(struct v3d_compile *c)
{
        bool progress = false;
        vir_for_each_block(block, c) {
                c->cur_block = block;
                vir_for_each_inst_safe(inst, block) {
                        progress = try_opt_constant_alu(c, inst) || progress;
                }
        }

        return progress;
}
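
/*
 * Sketch (illustrative only, not the actual driver code): as described in the
 * file comment above, this pass works best when copy propagation is re-run
 * after it so the new uniform MOVs feed follow-up ALU instructions, e.g. from
 * an optimization loop along these lines:
 *
 *    bool progress;
 *    do {
 *            progress = false;
 *            progress |= vir_opt_copy_propagate(c);
 *            progress |= vir_opt_constant_alu(c);
 *    } while (progress);
 */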