diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index f5a303e3773c9..52df6e2d287ab 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -2056,6 +2056,9 @@ bool ir3_remove_unreachable(struct ir3 *ir); /* calculate reconvergence information: */ void ir3_calc_reconvergence(struct ir3_shader_variant *so); +/* lower invalid shared phis after calculating reconvergence information: */ +bool ir3_lower_shared_phis(struct ir3 *ir); + /* dead code elimination: */ struct ir3_shader_variant; bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so); @@ -2063,6 +2066,9 @@ bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so); /* fp16 conversion folding */ bool ir3_cf(struct ir3 *ir); +/* shared mov folding */ +bool ir3_shared_fold(struct ir3 *ir); + /* copy-propagate: */ bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so); @@ -2121,6 +2127,21 @@ ir3_has_latency_to_hide(struct ir3 *ir) return false; } +/** + * Move 'instr' to after the last phi node at the beginning of the block: + */ +static inline void +ir3_instr_move_after_phis(struct ir3_instruction *instr, + struct ir3_block *block) +{ + struct ir3_instruction *last_phi = ir3_block_get_last_phi(block); + if (last_phi) + ir3_instr_move_after(instr, last_phi); + else + ir3_instr_move_before_block(instr, block); +} + + /* ************************************************************************* */ /* instruction helpers */ diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index e99ed4eb0f1cb..6a47e0dd21c8c 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -5182,6 +5182,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_calc_reconvergence(so); + IR3_PASS(ir, ir3_lower_shared_phis); + do { progress = false; @@ -5192,6 +5194,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, progress |= IR3_PASS(ir, ir3_cse); progress |= IR3_PASS(ir, ir3_dce, so); progress |= IR3_PASS(ir, 
ir3_opt_predicates, so); + progress |= IR3_PASS(ir, ir3_shared_fold); } while (progress); /* at this point, for binning pass, throw away unneeded outputs: diff --git a/src/freedreno/ir3/ir3_lower_shared_phi.c b/src/freedreno/ir3/ir3_lower_shared_phi.c new file mode 100644 index 0000000000000..12d3bc2960ce5 --- /dev/null +++ b/src/freedreno/ir3/ir3_lower_shared_phi.c @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2023 Valve Corporation. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3.h" +#include "util/ralloc.h" + +/* RA cannot handle phis of shared registers where there are extra physical + * sources, or the sources have extra physical destinations, because these edges + * are critical edges that we cannot resolve copies along. 
Here's a contrived + * example: + * + * loop { + * if non-uniform { + * if uniform { + * x_1 = ...; + * continue; + * } + * x_2 = ...; + * } else { + * break; + * } + * // continue block + * x_3 = phi(x_1, x_2) + * } + * + * Assuming x_1 and x_2 are uniform, x_3 will also be uniform, because all + * threads that stay in the loop take the same branch to the continue block, + * however execution may fall through from the assignment to x_2 to the + * break statement because the outer if is non-uniform, and then it will fall + * through again to the continue block. In cases like this we have to demote the + * phi to normal registers and insert movs around it (which will probably be + * coalesced). + */ + +static void +lower_phi(void *ctx, struct ir3_instruction *phi) +{ + struct ir3_block *block = phi->block; + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + if (phi->srcs[i]->def) { + struct ir3_instruction *pred_mov = ir3_instr_create(pred, OPC_MOV, 1, 1); + pred_mov->uses = _mesa_pointer_set_create(ctx); + __ssa_dst(pred_mov)->flags |= (phi->srcs[i]->flags & IR3_REG_HALF); + unsigned src_flags = IR3_REG_SSA | IR3_REG_SHARED | + (phi->srcs[i]->flags & IR3_REG_HALF); + ir3_src_create(pred_mov, INVALID_REG, src_flags)->def = + phi->srcs[i]->def; + pred_mov->cat1.src_type = pred_mov->cat1.dst_type = + (src_flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + + _mesa_set_remove_key(phi->srcs[i]->def->instr->uses, phi); + _mesa_set_add(phi->srcs[i]->def->instr->uses, pred_mov); + phi->srcs[i]->def = pred_mov->dsts[0]; + } + phi->srcs[i]->flags &= ~IR3_REG_SHARED; + } + + phi->dsts[0]->flags &= ~IR3_REG_SHARED; + + struct ir3_instruction *shared_mov = + ir3_MOV(block, phi, + (phi->dsts[0]->flags & IR3_REG_HALF) ? 
TYPE_U16 : TYPE_U32); + shared_mov->uses = _mesa_pointer_set_create(ctx); + shared_mov->dsts[0]->flags |= IR3_REG_SHARED; + ir3_instr_move_after_phis(shared_mov, block); + + foreach_ssa_use (use, phi) { + for (unsigned i = 0; i < use->srcs_count; i++) { + if (use->srcs[i]->def == phi->dsts[0]) + use->srcs[i]->def = shared_mov->dsts[0]; + } + } +} + +bool +ir3_lower_shared_phis(struct ir3 *ir) +{ + void *mem_ctx = ralloc_context(NULL); + bool progress = false; + + ir3_find_ssa_uses(ir, mem_ctx, false); + + foreach_block (block, &ir->block_list) { + bool pred_physical_edge = false; + for (unsigned i = 0; i < block->predecessors_count; i++) { + unsigned successors_count = + block->predecessors[i]->successors[1] ? 2 : 1; + if (block->predecessors[i]->physical_successors_count > successors_count) { + pred_physical_edge = true; + break; + } + } + + if (!pred_physical_edge && + block->physical_predecessors_count == block->predecessors_count) + continue; + + foreach_instr_safe (phi, &block->instr_list) { + if (phi->opc != OPC_META_PHI) + break; + + if (!(phi->dsts[0]->flags & IR3_REG_SHARED)) + continue; + + lower_phi(mem_ctx, phi); + progress = true; + } + } + + ralloc_free(mem_ctx); + return progress; +} + diff --git a/src/freedreno/ir3/ir3_shared_folding.c b/src/freedreno/ir3/ir3_shared_folding.c new file mode 100644 index 0000000000000..29b3f28ea3344 --- /dev/null +++ b/src/freedreno/ir3/ir3_shared_folding.c @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2023 Valve Corporation. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Try to fold a shared -> non-shared mov into the instruction producing the + * shared src. We do this aggressively, even if there are other uses of the + * source, on the assumption that the "default" state should be non-shared and + * we should be able to fold the other sources eventually. 
+ */ + +#include "util/ralloc.h" + +#include "ir3.h" + +static bool +try_shared_folding(struct ir3_instruction *mov, void *mem_ctx) +{ + if (mov->opc != OPC_MOV) + return false; + + if ((mov->dsts[0]->flags & IR3_REG_SHARED) || + !(mov->srcs[0]->flags & IR3_REG_SHARED)) + return false; + + struct ir3_instruction *src = ssa(mov->srcs[0]); + if (!src) + return false; + + if (mov->cat1.dst_type != mov->cat1.src_type) { + /* Check if the conversion can be folded into the source by ir3_cf */ + bool can_fold; + type_t output_type = ir3_output_conv_type(src, &can_fold); + if (!can_fold || output_type != TYPE_U32) + return false; + foreach_ssa_use (use, src) { + if (use->opc != OPC_MOV || + use->cat1.src_type != mov->cat1.src_type || + use->cat1.dst_type != mov->cat1.dst_type) + return false; + } + } + + if (src->opc == OPC_META_PHI) { + struct ir3_block *block = src->block; + for (unsigned i = 0; i < block->predecessors_count; i++) { + struct ir3_block *pred = block->predecessors[i]; + if (src->srcs[i]->def) { + struct ir3_instruction *pred_mov = ir3_instr_create(pred, OPC_MOV, 1, 1); + __ssa_dst(pred_mov)->flags |= (src->srcs[i]->flags & IR3_REG_HALF); + unsigned src_flags = IR3_REG_SSA | IR3_REG_SHARED | + (src->srcs[i]->flags & IR3_REG_HALF); + ir3_src_create(pred_mov, INVALID_REG, src_flags)->def = + src->srcs[i]->def; + pred_mov->cat1.src_type = pred_mov->cat1.dst_type = + (src_flags & IR3_REG_HALF) ? 
TYPE_U16 : TYPE_U32; + + _mesa_set_remove_key(src->srcs[i]->def->instr->uses, src); + _mesa_set_add(src->srcs[i]->def->instr->uses, pred_mov); + src->srcs[i]->def = pred_mov->dsts[0]; + } + src->srcs[i]->flags &= ~IR3_REG_SHARED; + } + } else if (opc_cat(src->opc) == 2 && src->srcs_count >= 2) { + /* cat2 vector ALU instructions cannot have both shared sources */ + if ((src->srcs[0]->flags & (IR3_REG_SHARED | IR3_REG_CONST)) && + (src->srcs[1]->flags & (IR3_REG_SHARED | IR3_REG_CONST))) + return false; + } else if (opc_cat(src->opc) == 3) { + /* cat3 vector ALU instructions cannot have src1 shared */ + if (src->srcs[1]->flags & IR3_REG_SHARED) + return false; + } else if (src->opc == OPC_LDC) { + src->flags &= ~IR3_INSTR_U; + } else { + return false; + } + + /* Remove IR3_REG_SHARED from the original destination, which should make the + * mov trivial so that it can be cleaned up later by copy prop. + */ + src->dsts[0]->flags &= ~IR3_REG_SHARED; + mov->srcs[0]->flags &= ~IR3_REG_SHARED; + + /* Insert a copy to shared for uses other than this move instruction. 
*/ + struct ir3_instruction *shared_mov = NULL; + foreach_ssa_use (use, src) { + if (use == mov) + continue; + + if (!shared_mov) { + shared_mov = ir3_MOV(src->block, src, mov->cat1.src_type); + shared_mov->dsts[0]->flags |= IR3_REG_SHARED; + if (src->opc == OPC_META_PHI) + ir3_instr_move_after_phis(shared_mov, src->block); + else + ir3_instr_move_after(shared_mov, src); + shared_mov->uses = _mesa_pointer_set_create(mem_ctx); + } + + for (unsigned i = 0; i < use->srcs_count; i++) { + if (use->srcs[i]->def == src->dsts[0]) + use->srcs[i]->def = shared_mov->dsts[0]; + } + _mesa_set_add(shared_mov->uses, use); + } + + return true; +} + +bool +ir3_shared_fold(struct ir3 *ir) +{ + void *mem_ctx = ralloc_context(NULL); + bool progress = false; + + ir3_find_ssa_uses(ir, mem_ctx, false); + + /* Folding a phi can push the mov up to its sources, so iterate blocks in + * reverse to try and convert an entire phi-web in one go. + */ + foreach_block_rev (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + progress |= try_shared_folding(instr, mem_ctx); + } + } + + ralloc_free(mem_ctx); + + return progress; +} + diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index adad6ee9a32f6..7351c236174c5 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -97,6 +97,7 @@ libfreedreno_ir3_files = files( 'ir3_legalize_relative.c', 'ir3_liveness.c', 'ir3_lower_parallelcopy.c', + 'ir3_lower_shared_phi.c', 'ir3_lower_spill.c', 'ir3_lower_subgroups.c', 'ir3_merge_regs.c', @@ -127,6 +128,7 @@ libfreedreno_ir3_files = files( 'ir3_sched.c', 'ir3_shader.c', 'ir3_shader.h', + 'ir3_shared_folding.c', 'ir3_shared_ra.c', 'ir3_spill.c', 'ir3_validate.c',