aco: use non-sequential addressing

fossil-db (GFX10.3):
Totals from 70493 (50.57% of 139391) affected shaders:
SGPRs: 4232624 -> 4231808 (-0.02%); split: -0.09%, +0.07%
VGPRs: 2831772 -> 2764740 (-2.37%); split: -2.53%, +0.17%
CodeSize: 225584412 -> 225048740 (-0.24%); split: -0.44%, +0.21%
MaxWaves: 875319 -> 878837 (+0.40%); split: +0.44%, -0.04%
Instrs: 43157803 -> 42496421 (-1.53%); split: -1.54%, +0.01%
Cycles: 1656380132 -> 1641532056 (-0.90%); split: -0.94%, +0.04%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8523>
2021-01-14 19:58:13 +00:00 · 2021-01-14 19:58:13 +00:00 · c353895c92
parent faf3e9a27f
commit c353895c92
4 changed files with 68 additions and 30 deletions
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@ -428,6 +428,15 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
      break;
   }
   case Format::MIMG: {
+      unsigned use_nsa = false;
+      unsigned addr_dwords = instr->operands.size() - 3;
+      for (unsigned i = 1; i < addr_dwords; i++) {
+         if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4))
+            use_nsa = true;
+      }
+      assert(!use_nsa || ctx.chip_class >= GFX10);
+      unsigned nsa_dwords = use_nsa ? DIV_ROUND_UP(addr_dwords - 1, 4) : 0;
+
      MIMG_instruction* mimg = static_cast<MIMG_instruction*>(instr);
      uint32_t encoding = (0b111100 << 26);
      encoding |= mimg->slc ? 1 << 25 : 0;
@ -443,6 +452,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
         encoding |= mimg->da ? 1 << 14 : 0;
      } else {
         encoding |= mimg->r128 ? 1 << 15 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
+         encoding |= nsa_dwords << 1;
         encoding |= mimg->dim << 3; /* GFX10: dimensionality instead of declare array */
         encoding |= mimg->dlc ? 1 << 7 : 0;
      }
@ -465,6 +475,13 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
      }

      out.push_back(encoding);
+
+      if (nsa_dwords) {
+         out.resize(out.size() + nsa_dwords);
+         std::vector<uint32_t>::iterator nsa = std::prev(out.end(), nsa_dwords);
+         for (unsigned i = 0; i < addr_dwords - 1; i++)
+            nsa[i / 4] |= (0xFF & instr->operands[4 + i].physReg().reg()) << (i % 4 * 8);
+      }
      break;
   }
   case Format::FLAT:
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@ -5841,38 +5841,52 @@ static MIMG_instruction *emit_mimg(Builder& bld, aco_opcode op,
                                   Definition dst,
                                   Temp rsrc,
                                   Operand samp,
-                                   const std::vector<Temp>& coords,
+                                   std::vector<Temp> coords,
                                   unsigned num_wqm_coords=0,
                                   Operand vdata=Operand(v1))
 {
-   Temp coord = coords[0];
-   if (coords.size() > 1) {
-      coord = bld.tmp(RegType::vgpr, coords.size());
+   if (bld.program->chip_class < GFX10) {
+      Temp coord = coords[0];
+      if (coords.size() > 1) {
+         coord = bld.tmp(RegType::vgpr, coords.size());

-      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
-      for (unsigned i = 0; i < coords.size(); i++)
-         vec->operands[i] = Operand(coords[i]);
-      vec->definitions[0] = Definition(coord);
-      bld.insert(std::move(vec));
-   } else if (coord.type() == RegType::sgpr) {
-      coord = bld.copy(bld.def(v1), coord);
-   }
+         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
+         for (unsigned i = 0; i < coords.size(); i++)
+            vec->operands[i] = Operand(coords[i]);
+         vec->definitions[0] = Definition(coord);
+         bld.insert(std::move(vec));
+      } else if (coord.type() == RegType::sgpr) {
+         coord = bld.copy(bld.def(v1), coord);
+      }

-   if (num_wqm_coords) {
-      /* We don't need the bias, sample index, compare value or offset to be
-       * computed in WQM but if the p_create_vector copies the coordinates, then it
-       * needs to be in WQM. */
-      coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true);
+      if (num_wqm_coords) {
+         /* We don't need the bias, sample index, compare value or offset to be
+          * computed in WQM but if the p_create_vector copies the coordinates, then it
+          * needs to be in WQM. */
+         coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true);
+      }
+
+      coords[0] = coord;
+      coords.resize(1);
+   } else {
+      for (unsigned i = 0; i < num_wqm_coords; i++)
+         coords[i] = emit_wqm(bld, coords[i], bld.tmp(coords[i].regClass()), true);
+
+      for (Temp& coord : coords) {
+         if (coord.type() == RegType::sgpr)
+            coord = bld.copy(bld.def(v1), coord);
+      }
   }

   aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(
-      op, Format::MIMG, 4, dst.isTemp())};
+      op, Format::MIMG, 3 + coords.size(), dst.isTemp())};
   if (dst.isTemp())
      mimg->definitions[0] = dst;
   mimg->operands[0] = Operand(rsrc);
   mimg->operands[1] = samp;
   mimg->operands[2] = vdata;
-   mimg->operands[3] = Operand(coord);
+   for (unsigned i = 0; i < coords.size(); i++)
+      mimg->operands[3 + i] = Operand(coords[i]);

   MIMG_instruction *res = mimg.get();
   bld.insert(std::move(mimg));
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@ -436,7 +436,7 @@ bool validate_ir(Program* program)
            break;
         }
         case Format::MIMG: {
-            check(instr->operands.size() == 4, "MIMG instructions must have 4 operands", instr.get());
+            check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands", instr.get());
            check(instr->operands[0].hasRegClass() && (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
                  "MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get());
            if (instr->operands[1].hasRegClass())
@ -447,8 +447,15 @@ bool validate_ir(Program* program)
               check(instr->definitions.empty() || (instr->definitions[0].regClass() == instr->operands[2].regClass() || is_cmpswap),
                     "MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and TFE/LWE loads", instr.get());
            }
-            check(instr->operands[3].hasRegClass() && instr->operands[3].regClass().type() == RegType::vgpr,
-                  "MIMG operands[3] (VADDR) must be VGPR", instr.get());
+            check(instr->operands.size() == 4 || program->chip_class >= GFX10, "NSA is only supported on GFX10+", instr.get());
+            for (unsigned i = 3; i < instr->operands.size(); i++) {
+               if (instr->operands.size() == 4) {
+                  check(instr->operands[i].hasRegClass() && instr->operands[i].regClass().type() == RegType::vgpr,
+                        "MIMG operands[3] (VADDR) must be VGPR", instr.get());
+               } else {
+                  check(instr->operands[i].regClass() == v1, "MIMG VADDR must be v1 if NSA is used", instr.get());
+               }
+            }
            check(instr->definitions.empty() || (instr->definitions[0].isTemp() && instr->definitions[0].regClass().type() == RegType::vgpr),
                  "MIMG definitions[0] (VDATA) must be VGPR", instr.get());
            break;
--- a/src/amd/compiler/tests/test_isel.cpp
+++ b/src/amd/compiler/tests/test_isel.cpp
@ -149,18 +149,18 @@ BEGIN_TEST(isel.sparse.clause)
         };
         void main() {
            //>> v5: (noCSE)%zero0 = p_create_vector 0, 0, 0, 0, 0
-            //>> v5: %_ = image_sample_lz_o %_, %_, %zero0, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
+            //>> v5: %_ = image_sample_lz_o %_, %_, %zero0, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
            //>> v5: (noCSE)%zero1 = p_create_vector 0, 0, 0, 0, 0
-            //>> v5: %_ = image_sample_lz_o %_, %_, %zero1, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
+            //>> v5: %_ = image_sample_lz_o %_, %_, %zero1, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
            //>> v5: (noCSE)%zero2 = p_create_vector 0, 0, 0, 0, 0
-            //>> v5: %_ = image_sample_lz_o %_, %_, %zero2, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
+            //>> v5: %_ = image_sample_lz_o %_, %_, %zero2, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
            //>> v5: (noCSE)%zero3 = p_create_vector 0, 0, 0, 0, 0
-            //>> v5: %_ = image_sample_lz_o %_, %_, %zero3, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
+            //>> v5: %_ = image_sample_lz_o %_, %_, %zero3, %_, %_, %_ dmask:xyzw 2d tfe storage: semantics: scope:invocation
            //>> s_clause 0x3
-            //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
-            //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
-            //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
-            //! image_sample_lz_o v#_, v[#_:#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
+            //! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
+            //! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
+            //! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
+            //! image_sample_lz_o v#_, [v#_, v#_, v#_, v#_], @s256(img), @s128(samp) dmask:0xf dim:SQ_RSRC_IMG_2D tfe
            code[0] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(1, 0), res[0]);
            code[1] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(2, 0), res[1]);
            code[2] = sparseTextureOffsetARB(tex, vec2(0.5), ivec2(3, 0), res[2]);