ir3: Add scalar ALU-specific passes

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22075>
This commit is contained in:
Connor Abbott 2023-03-02 14:47:30 +01:00 committed by Marge Bot
parent 4c4234501f
commit ce6c4f0320
5 changed files with 310 additions and 0 deletions

View File

@ -2056,6 +2056,9 @@ bool ir3_remove_unreachable(struct ir3 *ir);
/* calculate reconvergence information: */
void ir3_calc_reconvergence(struct ir3_shader_variant *so);
/* lower invalid shared phis after calculating reconvergence information: */
bool ir3_lower_shared_phis(struct ir3 *ir);
/* dead code elimination: */
struct ir3_shader_variant;
bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);
@ -2063,6 +2066,9 @@ bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so);
/* fp16 conversion folding */
bool ir3_cf(struct ir3 *ir);
/* shared mov folding */
bool ir3_shared_fold(struct ir3 *ir);
/* copy-propagate: */
bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so);
@ -2121,6 +2127,21 @@ ir3_has_latency_to_hide(struct ir3 *ir)
return false;
}
/**
 * Reposition 'instr' so it sits immediately after the group of phi nodes at
 * the top of 'block' (or at the very start of the block if it has no phis).
 */
static inline void
ir3_instr_move_after_phis(struct ir3_instruction *instr,
                          struct ir3_block *block)
{
   struct ir3_instruction *phi = ir3_block_get_last_phi(block);
   if (!phi) {
      ir3_instr_move_before_block(instr, block);
      return;
   }
   ir3_instr_move_after(instr, phi);
}
/* ************************************************************************* */
/* instruction helpers */

View File

@ -5182,6 +5182,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
ir3_calc_reconvergence(so);
IR3_PASS(ir, ir3_lower_shared_phis);
do {
progress = false;
@ -5192,6 +5194,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler,
progress |= IR3_PASS(ir, ir3_cse);
progress |= IR3_PASS(ir, ir3_dce, so);
progress |= IR3_PASS(ir, ir3_opt_predicates, so);
progress |= IR3_PASS(ir, ir3_shared_fold);
} while (progress);
/* at this point, for binning pass, throw away unneeded outputs:

View File

@ -0,0 +1,134 @@
/*
* Copyright (C) 2023 Valve Corporation.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "ir3.h"
#include "util/ralloc.h"
/* RA cannot handle phis of shared registers where there are extra physical
* sources, or the sources have extra physical destinations, because these edges
* are critical edges that we cannot resolve copies along. Here's a contrived
* example:
*
* loop {
* if non-uniform {
* if uniform {
* x_1 = ...;
* continue;
* }
* x_2 = ...;
* } else {
* break;
* }
* // continue block
* x_3 = phi(x_1, x_2)
* }
*
* Assuming x_1 and x_2 are uniform, x_3 will also be uniform, because all
* threads that stay in the loop take the same branch to the continue block,
* however execution may fall through from the assignment to x_2 to the
* break statement because the outer if is non-uniform, and then it will fall
* through again to the continue block. In cases like this we have to demote the
* phi to normal registers and insert movs around it (which will probably be
* coalesced).
*/
/* Demote one phi of shared registers to a phi of normal registers: give each
 * SSA source a shared->normal mov in its predecessor block, strip
 * IR3_REG_SHARED from the phi itself, and then re-create the shared value
 * with a single normal->shared mov placed right after the phi group.
 * 'ctx' is the memory context that owns the SSA use-sets.
 */
static void
lower_phi(void *ctx, struct ir3_instruction *phi)
{
   struct ir3_block *block = phi->block;

   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *pred = block->predecessors[i];

      /* Sources with a NULL def (e.g. undef) need no copy. */
      if (phi->srcs[i]->def) {
         /* Insert "normal = mov shared_src" at the end of the predecessor so
          * the phi only ever sees non-shared defs.
          */
         struct ir3_instruction *pred_mov = ir3_instr_create(pred, OPC_MOV, 1, 1);
         pred_mov->uses = _mesa_pointer_set_create(ctx);
         __ssa_dst(pred_mov)->flags |= (phi->srcs[i]->flags & IR3_REG_HALF);

         unsigned src_flags = IR3_REG_SSA | IR3_REG_SHARED |
            (phi->srcs[i]->flags & IR3_REG_HALF);
         ir3_src_create(pred_mov, INVALID_REG, src_flags)->def =
            phi->srcs[i]->def;
         /* Bit-exact copy: pick an integer type of the matching size. */
         pred_mov->cat1.src_type = pred_mov->cat1.dst_type =
            (src_flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;

         /* Keep the SSA use-sets consistent: the original def is now used by
          * the new mov instead of directly by the phi.
          */
         _mesa_set_remove_key(phi->srcs[i]->def->instr->uses, phi);
         _mesa_set_add(phi->srcs[i]->def->instr->uses, pred_mov);
         phi->srcs[i]->def = pred_mov->dsts[0];
      }

      phi->srcs[i]->flags &= ~IR3_REG_SHARED;
   }

   /* The phi now produces a normal register... */
   phi->dsts[0]->flags &= ~IR3_REG_SHARED;

   /* ...and a single normal->shared mov after the phi group re-creates the
    * shared value for the original consumers.
    */
   struct ir3_instruction *shared_mov =
      ir3_MOV(block, phi,
              (phi->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32);
   shared_mov->uses = _mesa_pointer_set_create(ctx);
   shared_mov->dsts[0]->flags |= IR3_REG_SHARED;
   ir3_instr_move_after_phis(shared_mov, block);

   /* Redirect every (pre-pass) user of the phi to read the shared mov
    * instead. shared_mov itself is not in phi->uses (the use-sets were built
    * before it existed), so it keeps reading the phi.
    */
   foreach_ssa_use (use, phi) {
      for (unsigned i = 0; i < use->srcs_count; i++) {
         if (use->srcs[i]->def == phi->dsts[0])
            use->srcs[i]->def = shared_mov->dsts[0];
      }
   }
}
bool
ir3_lower_shared_phis(struct ir3 *ir)
{
void *mem_ctx = ralloc_context(NULL);
bool progress = false;
ir3_find_ssa_uses(ir, mem_ctx, false);
foreach_block (block, &ir->block_list) {
bool pred_physical_edge = false;
for (unsigned i = 0; i < block->predecessors_count; i++) {
unsigned successors_count =
block->predecessors[i]->successors[1] ? 2 : 1;
if (block->predecessors[i]->physical_successors_count > successors_count) {
pred_physical_edge = true;
break;
}
}
if (!pred_physical_edge &&
block->physical_predecessors_count == block->predecessors_count)
continue;
foreach_instr_safe (phi, &block->instr_list) {
if (phi->opc != OPC_META_PHI)
break;
if (!(phi->dsts[0]->flags & IR3_REG_SHARED))
continue;
lower_phi(mem_ctx, phi);
progress = true;
}
}
ralloc_free(mem_ctx);
return progress;
}

View File

@ -0,0 +1,150 @@
/*
* Copyright (C) 2023 Valve Corporation.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/* Try to fold a shared -> non-shared mov into the instruction producing the
* shared src. We do this aggresively, even if there are other uses of the
* source, on the assumption that the "default" state should be non-shared and
* we should be able to fold the other sources eventually.
*/
#include "util/ralloc.h"
#include "ir3.h"
/* Try to fold the shared->non-shared 'mov' into the instruction producing its
 * shared source, making that producer write a normal register directly.
 * Returns true on success. 'mem_ctx' owns any newly created SSA use-sets.
 */
static bool
try_shared_folding(struct ir3_instruction *mov, void *mem_ctx)
{
   if (mov->opc != OPC_MOV)
      return false;

   /* Only a shared source copied to a non-shared destination qualifies. */
   if ((mov->dsts[0]->flags & IR3_REG_SHARED) ||
       !(mov->srcs[0]->flags & IR3_REG_SHARED))
      return false;

   struct ir3_instruction *src = ssa(mov->srcs[0]);
   if (!src)
      return false;

   if (mov->cat1.dst_type != mov->cat1.src_type) {
      /* Check if the conversion can be folded into the source by ir3_cf */
      bool can_fold;
      type_t output_type = ir3_output_conv_type(src, &can_fold);
      if (!can_fold || output_type != TYPE_U32)
         return false;

      /* ir3_cf can only fold the conversion when every use agrees on it, so
       * bail if any use performs a different conversion than this mov.
       */
      foreach_ssa_use (use, src) {
         if (use->opc != OPC_MOV ||
             use->cat1.src_type != mov->cat1.src_type ||
             use->cat1.dst_type != mov->cat1.dst_type)
            return false;
      }
   }

   if (src->opc == OPC_META_PHI) {
      /* Demote the phi to non-shared, inserting a shared->normal mov in each
       * predecessor for its SSA sources (mirrors lower_phi() in
       * ir3_lower_shared_phi.c).
       * NOTE(review): unlike lower_phi(), pred_mov->uses is left NULL here;
       * foreach_ssa_use skips an instruction with a NULL use-set, so later
       * iterations will not see/rewrite uses of pred_mov — confirm intended.
       */
      struct ir3_block *block = src->block;
      for (unsigned i = 0; i < block->predecessors_count; i++) {
         struct ir3_block *pred = block->predecessors[i];
         if (src->srcs[i]->def) {
            struct ir3_instruction *pred_mov = ir3_instr_create(pred, OPC_MOV, 1, 1);
            __ssa_dst(pred_mov)->flags |= (src->srcs[i]->flags & IR3_REG_HALF);

            unsigned src_flags = IR3_REG_SSA | IR3_REG_SHARED |
               (src->srcs[i]->flags & IR3_REG_HALF);
            ir3_src_create(pred_mov, INVALID_REG, src_flags)->def =
               src->srcs[i]->def;
            /* Bit-exact copy: integer type of the matching size. */
            pred_mov->cat1.src_type = pred_mov->cat1.dst_type =
               (src_flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;

            /* The original def is now used by the new mov, not the phi. */
            _mesa_set_remove_key(src->srcs[i]->def->instr->uses, src);
            _mesa_set_add(src->srcs[i]->def->instr->uses, pred_mov);
            src->srcs[i]->def = pred_mov->dsts[0];
         }

         src->srcs[i]->flags &= ~IR3_REG_SHARED;
      }
   } else if (opc_cat(src->opc) == 2 && src->srcs_count >= 2) {
      /* cat2 vector ALU instructions cannot have both shared sources */
      if ((src->srcs[0]->flags & (IR3_REG_SHARED | IR3_REG_CONST)) &&
          (src->srcs[1]->flags & (IR3_REG_SHARED | IR3_REG_CONST)))
         return false;
   } else if (opc_cat(src->opc) == 3) {
      /* cat3 vector ALU instructions cannot have src1 shared */
      if (src->srcs[1]->flags & IR3_REG_SHARED)
         return false;
   } else if (src->opc == OPC_LDC) {
      /* Drop the uniform flag so the ldc result goes to a normal register. */
      src->flags &= ~IR3_INSTR_U;
   } else {
      /* Other producers cannot be retargeted to a non-shared destination. */
      return false;
   }

   /* Remove IR3_REG_SHARED from the original destination, which should make the
    * mov trivial so that it can be cleaned up later by copy prop.
    */
   src->dsts[0]->flags &= ~IR3_REG_SHARED;
   mov->srcs[0]->flags &= ~IR3_REG_SHARED;

   /* Insert a copy to shared for uses other than this move instruction. */
   struct ir3_instruction *shared_mov = NULL;
   foreach_ssa_use (use, src) {
      if (use == mov)
         continue;

      /* Lazily create the shared copy only if some other use exists. */
      if (!shared_mov) {
         shared_mov = ir3_MOV(src->block, src, mov->cat1.src_type);
         shared_mov->dsts[0]->flags |= IR3_REG_SHARED;
         /* A phi's copy must come after the whole phi group; otherwise place
          * it immediately after the producer.
          */
         if (src->opc == OPC_META_PHI)
            ir3_instr_move_after_phis(shared_mov, src->block);
         else
            ir3_instr_move_after(shared_mov, src);
         shared_mov->uses = _mesa_pointer_set_create(mem_ctx);
      }

      for (unsigned i = 0; i < use->srcs_count; i++) {
         if (use->srcs[i]->def == src->dsts[0])
            use->srcs[i]->def = shared_mov->dsts[0];
      }

      _mesa_set_add(shared_mov->uses, use);
   }

   return true;
}
/* Fold shared->non-shared movs back into their producers across the whole
 * shader. Returns true if anything changed.
 */
bool
ir3_shared_fold(struct ir3 *ir)
{
   bool progress = false;
   void *mem_ctx = ralloc_context(NULL);

   ir3_find_ssa_uses(ir, mem_ctx, false);

   /* Folding a phi can push the mov up to its sources, so iterate blocks in
    * reverse to try and convert an entire phi-web in one go.
    */
   foreach_block_rev (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (try_shared_folding(instr, mem_ctx))
            progress = true;
      }
   }

   ralloc_free(mem_ctx);
   return progress;
}

View File

@ -97,6 +97,7 @@ libfreedreno_ir3_files = files(
'ir3_legalize_relative.c',
'ir3_liveness.c',
'ir3_lower_parallelcopy.c',
'ir3_lower_shared_phi.c',
'ir3_lower_spill.c',
'ir3_lower_subgroups.c',
'ir3_merge_regs.c',
@ -127,6 +128,7 @@ libfreedreno_ir3_files = files(
'ir3_sched.c',
'ir3_shader.c',
'ir3_shader.h',
'ir3_shared_folding.c',
'ir3_shared_ra.c',
'ir3_spill.c',
'ir3_validate.c',