ir3: Add scalar ALU-specific passes

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22075>
This commit is contained in:
Connor Abbott 2023-03-02 14:47:30 +01:00 committed by Marge Bot
parent 4c4234501f
commit ce6c4f0320
5 changed files with 310 additions and 0 deletions

View File

@ -2056,6 +2056,9 @@ bool ir3_remove_unreachable(struct ir3 *ir);
/* calculate reconvergence information: */
void ir3_calc_reconvergence(struct ir3_shader_variant *so);
/* lower invalid shared phis after calculating reconvergence information: */
bool ir3_lower_shared_phis(struct ir3 *ir);
/* dead code elimination: */
struct ir3_shader_variant;
bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);
@ -2063,6 +2066,9 @@ bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);
/* fp16 conversion folding */
bool ir3_cf(struct ir3 *ir);
/* shared mov folding */
bool ir3_shared_fold(struct ir3 *ir);
/* copy-propagate: */
bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
@ -2121,6 +2127,21 @@ ir3_has_latency_to_hide(struct ir3 *ir)
return false;
}
/**
 * Reposition 'instr' so it sits immediately after the group of phi nodes at
 * the top of 'block' (or at the very start of the block if it has no phis).
 */
static inline void
ir3_instr_move_after_phis(struct ir3_instruction *instr,
                          struct ir3_block *block)
{
   struct ir3_instruction *phi = ir3_block_get_last_phi(block);
   if (!phi) {
      ir3_instr_move_before_block(instr, block);
      return;
   }
   ir3_instr_move_after(instr, phi);
}
/* ************************************************************************* */
/* instruction helpers */

View File

@ -5182,6 +5182,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
ir3_calc_reconvergence(so);
IR3_PASS(ir, ir3_lower_shared_phis);
do {
progress = false;
@ -5192,6 +5194,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
progress |= IR3_PASS(ir, ir3_cse);
progress |= IR3_PASS(ir, ir3_dce, so);
progress |= IR3_PASS(ir, ir3_opt_predicates, so);
progress |= IR3_PASS(ir, ir3_shared_fold);
} while (progress);
/* at this point, for binning pass, throw away unneeded outputs:

View File

@ -0,0 +1,134 @@
/*
* Copyright (C) 2023 Valve Corporation.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "ir3.h"
#include "util/ralloc.h"
/* RA cannot handle phis of shared registers where there are extra physical
* sources, or the sources have extra physical destinations, because these edges
* are critical edges that we cannot resolve copies along. Here's a contrived
* example:
*
* loop {
* if non-uniform {
* if uniform {
* x_1 = ...;
* continue;
* }
* x_2 = ...;
* } else {
* break;
* }
* // continue block
* x_3 = phi(x_1, x_2)
* }
*
* Assuming x_1 and x_2 are uniform, x_3 will also be uniform, because all
* threads that stay in the loop take the same branch to the continue block,
* however execution may fall through from the assignment to x_2 to the
* break statement because the outer if is non-uniform, and then it will fall
* through again to the continue block. In cases like this we have to demote the
* phi to normal registers and insert movs around it (which will probably be
* coalesced).
*/
/* Demote one phi of shared registers to a phi of normal registers: give each
 * SSA source a shared->normal mov in its predecessor block, strip
 * IR3_REG_SHARED from the phi itself, and then re-create the shared value
 * with a single normal->shared mov placed right after the phi group.
 * 'ctx' is the memory context that owns the SSA use-sets.
 */
static void
lower_phi(void *ctx, struct ir3_instruction *phi)
{
   struct ir3_block *block = phi->block;

   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *pred = block->predecessors[i];

      /* Sources with a NULL def (e.g. undef) need no copy. */
      if (phi->srcs[i]->def) {
         /* Insert "normal = mov shared_src" at the end of the predecessor so
          * the phi only ever sees non-shared defs.
          */
         struct ir3_instruction *pred_mov = ir3_instr_create(pred, OPC_MOV, 1, 1);
         pred_mov->uses = _mesa_pointer_set_create(ctx);
         __ssa_dst(pred_mov)->flags |= (phi->srcs[i]->flags & IR3_REG_HALF);

         unsigned src_flags = IR3_REG_SSA | IR3_REG_SHARED |
            (phi->srcs[i]->flags & IR3_REG_HALF);
         ir3_src_create(pred_mov, INVALID_REG, src_flags)->def =
            phi->srcs[i]->def;
         /* Bit-exact copy: pick an integer type of the matching size. */
         pred_mov->cat1.src_type = pred_mov->cat1.dst_type =
            (src_flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;

         /* Keep the SSA use-sets consistent: the original def is now used by
          * the new mov instead of directly by the phi.
          */
         _mesa_set_remove_key(phi->srcs[i]->def->instr->uses, phi);
         _mesa_set_add(phi->srcs[i]->def->instr->uses, pred_mov);
         phi->srcs[i]->def = pred_mov->dsts[0];
      }

      phi->srcs[i]->flags &= ~IR3_REG_SHARED;
   }

   /* The phi now produces a normal register... */
   phi->dsts[0]->flags &= ~IR3_REG_SHARED;

   /* ...and a single normal->shared mov after the phi group re-creates the
    * shared value for the original consumers.
    */
   struct ir3_instruction *shared_mov =
      ir3_MOV(block, phi,
              (phi->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32);
   shared_mov->uses = _mesa_pointer_set_create(ctx);
   shared_mov->dsts[0]->flags |= IR3_REG_SHARED;
   ir3_instr_move_after_phis(shared_mov, block);

   /* Redirect every (pre-pass) user of the phi to read the shared mov
    * instead. shared_mov itself is not in phi->uses (the use-sets were built
    * before it existed), so it keeps reading the phi.
    */
   foreach_ssa_use (use, phi) {
      for (unsigned i = 0; i < use->srcs_count; i++) {
         if (use->srcs[i]->def == phi->dsts[0])
            use->srcs[i]->def = shared_mov->dsts[0];
      }
   }
}
bool
ir3_lower_shared_phis(struct ir3 *ir)
{
void *mem_ctx = ralloc_context(NULL);
bool progress = false;
ir3_find_ssa_uses(ir, mem_ctx, false);
foreach_block (block, &ir->block_list) {
bool pred_physical_edge = false;
for (unsigned i = 0; i < block->predecessors_count; i++) {
unsigned successors_count =
block->predecessors[i]->successors[1] ? 2 : 1;
if (block->predecessors[i]->physical_successors_count > successors_count) {
pred_physical_edge = true;
break;
}
}
if (!pred_physical_edge &&
block->physical_predecessors_count == block->predecessors_count)
continue;
foreach_instr_safe (phi, &block->instr_list) {
if (phi->opc != OPC_META_PHI)
break;
if (!(phi->dsts[0]->flags & IR3_REG_SHARED))
continue;
lower_phi(mem_ctx, phi);
progress = true;
}
}
ralloc_free(mem_ctx);
return progress;
}

View File

@ -0,0 +1,150 @@
/*
* Copyright (C) 2023 Valve Corporation.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/* Try to fold a shared -> non-shared mov into the instruction producing the
* shared src. We do this aggresively, even if there are other uses of the
* source, on the assumption that the "default" state should be non-shared and
* we should be able to fold the other sources eventually.
*/
#include "util/ralloc.h"
#include "ir3.h"
/* Try to fold the shared->non-shared 'mov' into the instruction producing its
 * shared source, making that producer write a normal register directly.
 * Returns true on success. 'mem_ctx' owns any newly created SSA use-sets.
 */
static bool
try_shared_folding(struct ir3_instruction *mov, void *mem_ctx)
{
   if (mov->opc != OPC_MOV)
      return false;

   /* Only a shared source copied to a non-shared destination qualifies. */
   if ((mov->dsts[0]->flags & IR3_REG_SHARED) ||
       !(mov->srcs[0]->flags & IR3_REG_SHARED))
      return false;

   struct ir3_instruction *src = ssa(mov->srcs[0]);
   if (!src)
      return false;

   if (mov->cat1.dst_type != mov->cat1.src_type) {
      /* Check if the conversion can be folded into the source by ir3_cf */
      bool can_fold;
      type_t output_type = ir3_output_conv_type(src, &can_fold);
      if (!can_fold || output_type != TYPE_U32)
         return false;

      /* ir3_cf can only fold the conversion when every use agrees on it, so
       * bail if any use performs a different conversion than this mov.
       */
      foreach_ssa_use (use, src) {
         if (use->opc != OPC_MOV ||
             use->cat1.src_type != mov->cat1.src_type ||
             use->cat1.dst_type != mov->cat1.dst_type)
            return false;
      }
   }

   if (src->opc == OPC_META_PHI) {
      /* Demote the phi to non-shared, inserting a shared->normal mov in each
       * predecessor for its SSA sources (mirrors lower_phi() in
       * ir3_lower_shared_phi.c).
       * NOTE(review): unlike lower_phi(), pred_mov->uses is left NULL here;
       * foreach_ssa_use skips an instruction with a NULL use-set, so later
       * iterations will not see/rewrite uses of pred_mov — confirm intended.
       */
      struct ir3_block *block = src->block;
      for (unsigned i = 0; i < block->predecessors_count; i++) {
         struct ir3_block *pred = block->predecessors[i];
         if (src->srcs[i]->def) {
            struct ir3_instruction *pred_mov = ir3_instr_create(pred, OPC_MOV, 1, 1);
            __ssa_dst(pred_mov)->flags |= (src->srcs[i]->flags & IR3_REG_HALF);

            unsigned src_flags = IR3_REG_SSA | IR3_REG_SHARED |
               (src->srcs[i]->flags & IR3_REG_HALF);
            ir3_src_create(pred_mov, INVALID_REG, src_flags)->def =
               src->srcs[i]->def;
            /* Bit-exact copy: integer type of the matching size. */
            pred_mov->cat1.src_type = pred_mov->cat1.dst_type =
               (src_flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;

            /* The original def is now used by the new mov, not the phi. */
            _mesa_set_remove_key(src->srcs[i]->def->instr->uses, src);
            _mesa_set_add(src->srcs[i]->def->instr->uses, pred_mov);
            src->srcs[i]->def = pred_mov->dsts[0];
         }

         src->srcs[i]->flags &= ~IR3_REG_SHARED;
      }
   } else if (opc_cat(src->opc) == 2 && src->srcs_count >= 2) {
      /* cat2 vector ALU instructions cannot have both shared sources */
      if ((src->srcs[0]->flags & (IR3_REG_SHARED | IR3_REG_CONST)) &&
          (src->srcs[1]->flags & (IR3_REG_SHARED | IR3_REG_CONST)))
         return false;
   } else if (opc_cat(src->opc) == 3) {
      /* cat3 vector ALU instructions cannot have src1 shared */
      if (src->srcs[1]->flags & IR3_REG_SHARED)
         return false;
   } else if (src->opc == OPC_LDC) {
      /* Drop the uniform flag so the ldc result goes to a normal register. */
      src->flags &= ~IR3_INSTR_U;
   } else {
      /* Other producers cannot be retargeted to a non-shared destination. */
      return false;
   }

   /* Remove IR3_REG_SHARED from the original destination, which should make the
    * mov trivial so that it can be cleaned up later by copy prop.
    */
   src->dsts[0]->flags &= ~IR3_REG_SHARED;
   mov->srcs[0]->flags &= ~IR3_REG_SHARED;

   /* Insert a copy to shared for uses other than this move instruction. */
   struct ir3_instruction *shared_mov = NULL;
   foreach_ssa_use (use, src) {
      if (use == mov)
         continue;

      /* Lazily create the shared copy only if some other use exists. */
      if (!shared_mov) {
         shared_mov = ir3_MOV(src->block, src, mov->cat1.src_type);
         shared_mov->dsts[0]->flags |= IR3_REG_SHARED;
         /* A phi's copy must come after the whole phi group; otherwise place
          * it immediately after the producer.
          */
         if (src->opc == OPC_META_PHI)
            ir3_instr_move_after_phis(shared_mov, src->block);
         else
            ir3_instr_move_after(shared_mov, src);
         shared_mov->uses = _mesa_pointer_set_create(mem_ctx);
      }

      for (unsigned i = 0; i < use->srcs_count; i++) {
         if (use->srcs[i]->def == src->dsts[0])
            use->srcs[i]->def = shared_mov->dsts[0];
      }

      _mesa_set_add(shared_mov->uses, use);
   }

   return true;
}
/* Fold shared->non-shared movs back into their producers across the whole
 * shader. Returns true if anything changed.
 */
bool
ir3_shared_fold(struct ir3 *ir)
{
   bool progress = false;
   void *mem_ctx = ralloc_context(NULL);

   ir3_find_ssa_uses(ir, mem_ctx, false);

   /* Folding a phi can push the mov up to its sources, so iterate blocks in
    * reverse to try and convert an entire phi-web in one go.
    */
   foreach_block_rev (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (try_shared_folding(instr, mem_ctx))
            progress = true;
      }
   }

   ralloc_free(mem_ctx);
   return progress;
}

View File

@ -97,6 +97,7 @@ libfreedreno_ir3_files = files(
'ir3_legalize_relative.c',
'ir3_liveness.c',
'ir3_lower_parallelcopy.c',
'ir3_lower_shared_phi.c',
'ir3_lower_spill.c',
'ir3_lower_subgroups.c',
'ir3_merge_regs.c',
@ -127,6 +128,7 @@ libfreedreno_ir3_files = files(
'ir3_sched.c',
'ir3_shader.c',
'ir3_shader.h',
'ir3_shared_folding.c',
'ir3_shared_ra.c',
'ir3_spill.c',
'ir3_validate.c',