intel/fs: Implement Wa_22012725308 for cross-pipe accumulator data hazard.

The hardware fails to provide the expected data coherency guarantees for accumulator registers when accessed from multiple FPU pipelines. Fix this by tracking implicit accumulator accesses just like we do for regular GRF registers, but instead of adding synchronization annotations for any dependency we only do it for dependencies with a pipeline mismatch, since the hardware should be able to guarantee proper synchronization for matching pipelines. Note that this workaround handles RaW and WaW dependencies in addition to the WaR dependencies described in the hardware bug report even though cross-pipeline RaW accumulator dependencies should be extremely rare, since chances are the hardware will also hang if we ever hit such a condition. This only affects XeHP+, since all FPU instructions are executed as a single in-order pipeline on earlier Xe platforms. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11433>
2021-05-25 15:02:53 -07:00 · 2021-05-25 15:02:53 -07:00 · d46bb14d14
parent 385da1fe36
commit d46bb14d14
1 changed files with 50 additions and 1 deletions
--- a/src/intel/compiler/brw_fs_scoreboard.cpp
+++ b/src/intel/compiler/brw_fs_scoreboard.cpp
@ -209,6 +209,21 @@ namespace {
      }
   };

+   /**
+    * Return true if the specified ordered address is trivially satisfied for
+    * all pipelines except potentially for the specified pipeline \p p.
+    */
+   bool
+   is_single_pipe(const ordered_address &jp, tgl_pipe p)
+   {
+      for (unsigned q = 0; q < IDX(TGL_PIPE_ALL); q++) {
+         if ((p == TGL_PIPE_NONE || IDX(p) != q) && jp.jp[q] > INT_MIN)
+            return false;
+      }
+
+      return true;
+   }
+
   /**
    * Return the number of instructions in the program.
    */
@ -962,6 +977,9 @@ namespace {
            sb.set(byte_offset(inst->src[i], REG_SIZE * j), rd_dep);
      }

+      if (inst->reads_accumulator_implicitly())
+         sb.set(brw_acc_reg(8), dependency(TGL_REGDIST_SRC, jp, exec_all));
+
      if (is_send(inst) && inst->base_mrf != -1) {
         const dependency rd_dep = dependency(TGL_SBID_SRC, ip, exec_all);

@ -976,6 +994,9 @@ namespace {
            dependency(TGL_REGDIST_DST, jp, exec_all) :
         dependency();

+      if (inst->writes_accumulator_implicitly(devinfo))
+         sb.set(brw_acc_reg(8), wr_dep);
+
      if (is_valid(wr_dep) && inst->dst.file != BAD_FILE &&
          !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
@ -1059,6 +1080,7 @@ namespace {
   gather_inst_dependencies(const fs_visitor *shader,
                            const ordered_address *jps)
   {
+      const struct intel_device_info *devinfo = shader->devinfo;
      equivalence_relation eq(num_instructions(shader));
      scoreboard *sbs = propagate_block_scoreboards(shader, jps, eq);
      const unsigned *ids = eq.flatten();
@ -1067,6 +1089,7 @@ namespace {

      foreach_block_and_inst(block, fs_inst, inst, shader->cfg) {
         const bool exec_all = inst->force_writemask_all;
+         const tgl_pipe p = inferred_exec_pipe(devinfo, inst);
         scoreboard &sb = sbs[block->num];

         for (unsigned i = 0; i < inst->sources; i++) {
@ -1075,6 +1098,18 @@ namespace {
                  sb.get(byte_offset(inst->src[i], REG_SIZE * j))));
         }

+         if (inst->reads_accumulator_implicitly()) {
+            /* Wa_22012725308:
+             *
+             * "When the accumulator registers are used as source and/or
+             *  destination, hardware does not ensure prevention of write
+             *  after read hazard across execution pipes."
+             */
+            const dependency dep = sb.get(brw_acc_reg(8));
+            if (dep.ordered && !is_single_pipe(dep.jp, p))
+               add_dependency(ids, deps[ip], dep);
+         }
+
         if (is_send(inst) && inst->base_mrf != -1) {
            for (unsigned j = 0; j < inst->mlen; j++)
               add_dependency(ids, deps[ip], dependency_for_read(
@ -1086,13 +1121,27 @@ namespace {
                           dependency(TGL_SBID_SET, ip, exec_all));

         if (!inst->no_dd_check) {
-            if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
+            if (inst->dst.file != BAD_FILE && !inst->dst.is_null() &&
+                !inst->dst.is_accumulator()) {
               for (unsigned j = 0; j < regs_written(inst); j++) {
                  add_dependency(ids, deps[ip], dependency_for_write(inst,
                     sb.get(byte_offset(inst->dst, REG_SIZE * j))));
               }
            }

+            if (inst->writes_accumulator_implicitly(devinfo) ||
+                inst->dst.is_accumulator()) {
+               /* Wa_22012725308:
+                *
+                * "When the accumulator registers are used as source and/or
+                *  destination, hardware does not ensure prevention of write
+                *  after read hazard across execution pipes."
+                */
+               const dependency dep = sb.get(brw_acc_reg(8));
+               if (dep.ordered && !is_single_pipe(dep.jp, p))
+                  add_dependency(ids, deps[ip], dep);
+            }
+
            if (is_send(inst) && inst->base_mrf != -1) {
               for (unsigned j = 0; j < inst->implied_mrf_writes(); j++)
                  add_dependency(ids, deps[ip], dependency_for_write(inst,