aco/gfx10: Wait for pending SMEM stores before loads

Currently, an SMEM store followed by an SMEM load from the same
location produces incorrect results, because the store has not
completed before the load executes. This is NOT mitigated by an
s_nop instruction on GFX10.

Since we don't yet have proper alias analysis, this commit adds a
workaround that inserts an s_waitcnt lgkmcnt(0) before each SSBO load
that follows a store. We should refine this in the future so that the
wait is only added when the load reads the same location that was
stored.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Timur Kristóf 2019-10-14 15:18:31 +02:00 committed by Rhys Perry
parent 7fa5cd3ee3
commit 1749953ea3
2 changed files with 33 additions and 1 deletions


@@ -131,3 +131,12 @@ finish and then write to vcc (for example, `s_mov_b64 vcc, vcc`) to correct vccz
Currently, we don't do this.
## RDNA / GFX10 hazards
### SMEM store followed by a load with the same address
We found that an `s_buffer_load` will produce incorrect results if it is preceded
by an `s_buffer_store` with the same address. Inserting an `s_nop` between them
does not mitigate the issue, so an `s_waitcnt lgkmcnt(0)` must be inserted.
LLVM does not list this among the other GFX10 hazards, but since LLVM doesn't
emit SMEM stores, it's not surprising that it went unnoticed there.
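The workaround described above can be sketched as a tiny state machine: remember that an SMEM buffer store is pending, and force a full `lgkmcnt` wait before the next SMEM buffer load. This is a minimal illustrative model, not the actual ACO code; the names `WaitCtx` and `needs_lgkm_wait` are hypothetical.

```cpp
// Hedged sketch of the GFX10 SMEM store->load hazard workaround.
// WaitCtx/needs_lgkm_wait are illustrative names, not the ACO API.
struct WaitCtx {
    bool pending_s_buffer_store = false; /* GFX10 workaround flag */
};

// Returns true if an s_waitcnt lgkmcnt(0) must be emitted before
// the current SMEM instruction.
bool needs_lgkm_wait(WaitCtx &ctx, bool is_store, bool is_load) {
    // A load while a buffer store is still pending needs the wait,
    // because the load may otherwise read stale data.
    bool wait = is_load && ctx.pending_s_buffer_store;
    if (wait)
        ctx.pending_s_buffer_store = false; // the wait drains the store
    if (is_store)
        ctx.pending_s_buffer_store = true;  // remember the hazard source
    return wait;
}
```

Without alias analysis, the model is conservative: any SMEM buffer load after a store gets the wait, even if the addresses differ.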


@@ -221,6 +221,7 @@ struct wait_ctx {
uint8_t vs_cnt = 0;
bool pending_flat_lgkm = false;
bool pending_flat_vm = false;
bool pending_s_buffer_store = false; /* GFX10 workaround */
wait_imm barrier_imm[barrier_count];
@@ -244,6 +245,7 @@ struct wait_ctx {
vs_cnt = std::max(vs_cnt, other->vs_cnt);
pending_flat_lgkm |= other->pending_flat_lgkm;
pending_flat_vm |= other->pending_flat_vm;
pending_s_buffer_store |= other->pending_s_buffer_store;
for (std::pair<PhysReg,wait_entry> entry : other->gpr_map)
{
@@ -329,6 +331,19 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
*/
if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb)
imm.lgkm = 0;
/* GFX10: A store followed by a load at the same address causes a problem because
* the load doesn't load the correct values unless we wait for the store first.
* This is NOT mitigated by an s_nop.
*
* TODO: Refine this when we have proper alias analysis.
*/
SMEM_instruction *smem = static_cast<SMEM_instruction *>(instr);
if (ctx.pending_s_buffer_store &&
!smem->definitions.empty() &&
!smem->can_reorder && smem->barrier == barrier_buffer) {
imm.lgkm = 0;
}
}
if (instr->format == Format::PSEUDO_BARRIER) {
@@ -407,8 +422,10 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
if (imm.vm == 0)
ctx.pending_flat_vm = false;
if (imm.lgkm == 0)
if (imm.lgkm == 0) {
ctx.pending_flat_lgkm = false;
ctx.pending_s_buffer_store = false;
}
return imm;
}
@@ -576,10 +593,16 @@ void gen(Instruction* instr, wait_ctx& ctx)
break;
}
case Format::SMEM: {
SMEM_instruction *smem = static_cast<SMEM_instruction*>(instr);
update_counters(ctx, event_smem, static_cast<SMEM_instruction*>(instr)->barrier);
if (!instr->definitions.empty())
insert_wait_entry(ctx, instr->definitions[0], event_smem);
else if (ctx.chip_class >= GFX10 &&
!smem->can_reorder &&
smem->barrier == barrier_buffer)
ctx.pending_s_buffer_store = true;
break;
}
case Format::DS: {