r600/sfn: Fix the kcache failure handling

Instead of starting a new block when the kcache handling fails,
try to continue scheduling instructions until kcache allocation
fails for all ready instructions.
With that we avoid a CF split within an LDS fetch/read group.

Signed-off-by: Gert Wollny <gert.wollny@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17678>
Authored by Gert Wollny on 2022-07-21 17:52:48 +02:00; committed by Marge Bot
parent 8db31e0fe6
commit 8a7d34e3bd
3 changed files with 138 additions and 91 deletions
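
In short, the commit turns a kcache reservation failure from a hard "split the CF now" event into a soft per-instruction rejection. A minimal sketch of the new policy follows; Instr, Block, and schedule_ready are invented stand-ins for illustration, not the actual r600/sfn classes:

// Illustrative sketch only, not the real Mesa code.
#include <vector>

struct Instr {
   bool fits_kcache; // in the real code this depends on bank/line limits
};

struct Block {
   std::vector<Instr*> instrs;

   bool try_reserve_kcache(const Instr& i) { return i.fits_kcache; }
};

// Old policy: the first failed reservation forced a new CF block, which
// could split an LDS fetch/read pair. New policy: skip the offender and
// keep scheduling; the caller only opens a new block once *every* ready
// instruction has failed to reserve its kcache lines.
bool schedule_ready(Block& blk, std::vector<Instr*>& ready)
{
   bool scheduled_any = false;
   for (auto it = ready.begin(); it != ready.end();) {
      if (blk.try_reserve_kcache(**it)) {
         blk.instrs.push_back(*it);
         it = ready.erase(it);
         scheduled_any = true;
      } else {
         ++it; // constants don't fit: retry in a later CF block
      }
   }
   return scheduled_any; // false tells the caller to start a new block
}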


@@ -302,17 +302,43 @@ void Block::push_back(PInst instr)
bool Block::try_reserve_kcache(const AluGroup& group)
{
auto kcache = m_kcache;
auto kcache_constants = group.get_kconsts();
for (auto& kc : kcache_constants) {
auto u = kc->as_uniform();
assert(u);
if (!try_reserve_kcache(*u))
if (!try_reserve_kcache(*u, kcache)) {
m_kcache_alloc_failed = true;
return false;
}
}
m_kcache = kcache;
m_kcache_alloc_failed = false;
return true;
}
bool Block::try_reserve_kcache(const UniformValue& u)
bool Block::try_reserve_kcache(const AluInstr& instr)
{
auto kcache = m_kcache;
for (auto& src : instr.sources()) {
auto u = src->as_uniform();
if (u) {
if (!try_reserve_kcache(*u, kcache)) {
m_kcache_alloc_failed = true;
return false;
}
}
}
m_kcache = kcache;
m_kcache_alloc_failed = false;
return true;
}
bool Block::try_reserve_kcache(const UniformValue& u,
std::array<KCacheLine, 4>& kcache) const
{
const int kcache_banks = 4; // TODO: handle pre-evergreen
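
Note the shape of the rework above: both public overloads now run the reservation against a scratch copy of m_kcache and commit it only when every uniform fits, so a failed attempt leaves the block's committed reservations untouched and merely raises m_kcache_alloc_failed. A self-contained rendering of that copy-then-commit pattern (simplified Line type and a toy placement rule, not the real KCacheLine logic):

#include <array>
#include <utility>
#include <vector>

// Simplified stand-in for KCacheLine (the real one also has lock modes).
struct Line {
   int mode = 0; // 0 = free
   int bank = 0;
   int addr = 0;
};

struct Block {
   std::array<Line, 4> m_kcache; // committed reservations
   bool m_kcache_alloc_failed = false;

   // Toy placement rule: reuse an exact match or grab a free set.
   bool reserve_one(int bank, int addr, std::array<Line, 4>& kcache) const
   {
      for (auto& l : kcache) {
         if (l.mode && l.bank == bank && l.addr == addr)
            return true; // line already locked
         if (!l.mode) {
            l = {1, bank, addr}; // free set: take it
            return true;
         }
      }
      return false; // all four sets busy with other lines
   }

   bool try_reserve_kcache(const std::vector<std::pair<int, int>>& uniforms)
   {
      auto kcache = m_kcache; // mutate a scratch copy only
      for (auto [bank, addr] : uniforms) {
         if (!reserve_one(bank, addr, kcache)) {
            m_kcache_alloc_failed = true; // committed state untouched
            return false;
         }
      }
      m_kcache = kcache; // commit atomically on full success
      m_kcache_alloc_failed = false;
      return true;
   }
};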
@@ -323,49 +349,50 @@ bool Block::try_reserve_kcache(const UniformValue& u)
bool found = false;
for (int i = 0; i < kcache_banks && !found; ++i) {
if (m_kcache[i].mode) {
if (m_kcache[i].bank < bank)
if (kcache[i].mode) {
if (kcache[i].bank < bank)
continue;
if ((m_kcache[i].bank == bank &&
m_kcache[i].addr > line + 1) ||
m_kcache[i].bank > bank) {
if (m_kcache[kcache_banks - 1].mode)
if ((kcache[i].bank == bank &&
kcache[i].addr > line + 1) ||
kcache[i].bank > bank) {
if (kcache[kcache_banks - 1].mode)
return false;
memmove(&m_kcache[i+1],&m_kcache[i], (kcache_banks-i-1)*sizeof(KCacheLine));
m_kcache[i].mode = KCacheLine::lock_1;
m_kcache[i].bank = bank;
m_kcache[i].addr = line;
memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(KCacheLine));
kcache[i].mode = KCacheLine::lock_1;
kcache[i].bank = bank;
kcache[i].addr = line;
return true;
}
int d = line - m_kcache[i].addr;
int d = line - kcache[i].addr;
if (d == -1) {
m_kcache[i].addr--;
if (m_kcache[i].mode == KCacheLine::lock_2) {
kcache[i].addr--;
if (kcache[i].mode == KCacheLine::lock_2) {
/* we are prepending the line to the current set,
* discarding the existing second line,
* so we'll have to insert line+2 after it */
* discarding the existing second line,
* so we'll have to insert line+2 after it */
line += 2;
continue;
} else if (m_kcache[i].mode == KCacheLine::lock_1) {
m_kcache[i].mode = KCacheLine::lock_2;
} else if (kcache[i].mode == KCacheLine::lock_1) {
kcache[i].mode = KCacheLine::lock_2;
return true;
} else {
/* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */
return false;
}
} else if (d == 1) {
m_kcache[i].mode = KCacheLine::lock_2;
kcache[i].mode = KCacheLine::lock_2;
return true;
} else if (d == 0)
} else if (d == 0) {
return true;
}
} else { /* free kcache set - use it */
m_kcache[i].mode = KCacheLine::lock_1;
m_kcache[i].bank = bank;
m_kcache[i].addr = line;
kcache[i].mode = KCacheLine::lock_1;
kcache[i].bank = bank;
kcache[i].addr = line;
return true;
}
}
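
The d = line - kcache[i].addr arithmetic above implements line merging: each kcache set can lock one 16-constant line (lock_1) or two adjacent lines (lock_2). A simplified model of the adjacency cases, ignoring bank handling and the sorted insertion (an assumption-laden sketch, not the full rule set):

// Simplified: one set, no banks, no sorted insertion, no loop-index locks.
enum class Mode { free_set, lock_1, lock_2 };

struct Set {
   Mode mode = Mode::free_set;
   int addr = 0; // first locked line; lock_2 also covers addr + 1
};

bool try_widen(Set& s, int line)
{
   if (s.mode == Mode::free_set) {
      s = {Mode::lock_1, line}; // empty set: lock the single line
      return true;
   }
   int d = line - s.addr;
   if (d == 0)
      return true; // already covered by the existing lock
   if (d == 1) {
      s.mode = Mode::lock_2; // widen (or keep) a lock over addr..addr+1
      return true;
   }
   if (d == -1 && s.mode == Mode::lock_1) {
      // Prepend: the pair now covers line and line + 1. (Prepending to a
      // lock_2 is trickier: the real code above evicts the old second
      // line and re-reserves it as line + 2.)
      s.addr--;
      s.mode = Mode::lock_2;
      return true;
   }
   return false; // not adjacent: the caller must try another set
}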


@@ -196,7 +196,8 @@ public:
void set_type(Type t);
uint32_t remaining_slots() const { return m_remaining_slots;}
bool try_reserve_kcache(const AluGroup& group);
bool try_reserve_kcache(const AluGroup& instr);
bool try_reserve_kcache(const AluInstr& group);
auto last_lds_instr() {return m_last_lds_instr;}
void set_last_lds_instr(Instr *instr) {m_last_lds_instr = instr;}
@@ -207,8 +208,11 @@ public:
size_t size() const { return m_instructions.size();}
bool kcache_reservation_failed() const { return m_kcache_alloc_failed;}
private:
bool try_reserve_kcache(const UniformValue& u);
bool try_reserve_kcache(const UniformValue& u,
std::array<KCacheLine, 4>& kcache) const;
bool do_ready() const override {return true;};
void do_print(std::ostream& os) const override;
@@ -221,11 +225,13 @@ private:
uint32_t m_remaining_slots{0xffff};
std::array<KCacheLine, 4> m_kcache;
bool m_kcache_alloc_failed{false};
Instr *m_last_lds_instr{nullptr};
int m_lds_group_requirement{0};
AluInstr *m_lds_group_start{nullptr};
};
class InstrWithVectorResult : public Instr {


@@ -489,83 +489,84 @@ bool BlockSheduler::schedule_alu(Shader::ShaderBlocks& out_blocks)
bool has_lds_ready = !alu_vec_ready.empty() &&
(*alu_vec_ready.begin())->has_lds_access();
/* If we have ready ALU instructions we have to start a new ALU block */
if (has_alu_ready || !alu_groups_ready.empty()) {
if (m_current_block->type() != Block::alu) {
start_new_block(out_blocks, Block::alu);
m_alu_groups_schduled = 0;
}
}
/* Schedule groups first. unless we have a pending LDS instuction
* We don't want the LDS instructions to be too far apart because the
* fetch + read from queue has to be in the same ALU CF block */
if (!alu_groups_ready.empty() && !has_lds_ready) {
group = *alu_groups_ready.begin();
alu_groups_ready.erase(alu_groups_ready.begin());
sfn_log << SfnLog::schedule << "Schedule ALU group\n";
success = true;
} else {
if (has_alu_ready) {
group = new AluGroup();
sfn_log << SfnLog::schedule << "START new ALU group\n";
}
}
if (group) {
int free_slots = group->free_slots();
if (free_slots && has_alu_ready) {
if (!alu_vec_ready.empty())
success |= schedule_alu_to_group_vec(group);
/* Apparently one can't schedule a t-slot if there is already
* and LDS instruction scheduled.
* TODO: check whether this is only relevant for actual LDS instructions
* or also for instructions that read from the LDS return value queue */
if (free_slots & 0x10 && !has_lds_ready) {
sfn_log << SfnLog::schedule << "Try schedule TRANS channel\n";
if (!alu_trans_ready.empty())
success |= schedule_alu_to_group_trans(group, alu_trans_ready);
if (!alu_vec_ready.empty())
success |= schedule_alu_to_group_trans(group, alu_vec_ready);
}
}
sfn_log << SfnLog::schedule << "Finalize ALU group\n";
group->set_scheduled();
group->fix_last_flag();
group->set_nesting_depth(m_current_block->nesting_depth());
if (m_current_block->type() != Block::alu) {
start_new_block(out_blocks, Block::alu);
m_alu_groups_schduled = 0;
}
/* Pessimistic hack: If we have started an LDS group,
* make sure 8 instructions groups still fit into the CF
* TODO: take care of Address slot emission
* TODO: maybe do this CF split only in the assembler
*/
/*if (group->slots() > m_current_block->remaining_slots() ||
(group->has_lds_group_start() &&
m_current_block->remaining_slots() < 7 * 8)) {
//assert(!m_current_block->lds_group_active());
start_new_block(out_blocks, Block::alu);
}*/
if (!m_current_block->try_reserve_kcache(*group)) {
assert(!m_current_block->lds_group_active());
start_new_block(out_blocks, Block::alu);
m_current_block->set_instr_flag(Instr::force_cf);
}
assert(m_current_block->try_reserve_kcache(*group));
if (group->has_lds_group_start())
m_current_block->lds_group_start(*group->begin());
m_current_block->push_back(group);
if (group->has_lds_group_end())
m_current_block->lds_group_end();
if (!m_current_block->try_reserve_kcache(*group))
unreachable("Scheduling a group in a new block should always succeed");
alu_groups_ready.erase(alu_groups_ready.begin());
sfn_log << SfnLog::schedule << "Schedule ALU group\n";
success = true;
} else if (has_alu_ready) {
group = new AluGroup();
sfn_log << SfnLog::schedule << "START new ALU group\n";
} else {
return false;
}
if (success)
++m_alu_groups_schduled;
assert(group);
int free_slots = group->free_slots();
while (free_slots && has_alu_ready) {
if (!alu_vec_ready.empty())
success |= schedule_alu_to_group_vec(group);
/* Apparently one can't schedule a t-slot if there is already
* and LDS instruction scheduled.
* TODO: check whether this is only relevant for actual LDS instructions
* or also for instructions that read from the LDS return value queue */
if (free_slots & 0x10 && !has_lds_ready) {
sfn_log << SfnLog::schedule << "Try schedule TRANS channel\n";
if (!alu_trans_ready.empty())
success |= schedule_alu_to_group_trans(group, alu_trans_ready);
if (!alu_vec_ready.empty())
success |= schedule_alu_to_group_trans(group, alu_vec_ready);
}
if (success) {
++m_alu_groups_schduled;
break;
} else if (m_current_block->kcache_reservation_failed()) {
// LDS read groups should not lead to impossible
// kcache constellations
assert(!m_current_block->lds_group_active());
// kcache reservation failed, so we have to start a new CF
start_new_block(out_blocks, Block::alu);
m_current_block->set_instr_flag(Instr::force_cf);
} else {
return false;
}
}
sfn_log << SfnLog::schedule << "Finalize ALU group\n";
group->set_scheduled();
group->fix_last_flag();
group->set_nesting_depth(m_current_block->nesting_depth());
m_current_block->push_back(group);
if (group->has_lds_group_start())
m_current_block->lds_group_start(*group->begin());
if (group->has_lds_group_end())
m_current_block->lds_group_end();
return success;
}
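
The rewritten tail of schedule_alu above boils down to: fill the group from the ready lists; on success count it; if nothing was placed purely because kcache reservation failed, force a CF split and retry against the fresh (empty) kcache; otherwise report that nothing is schedulable. A condensed, hedged rendering of that retry loop, where the callables stand in for the real block and scheduler methods:

#include <functional>

// Condensed control flow of the new scheduling loop; not the literal code.
bool schedule_with_kcache_retry(const std::function<bool()>& fill_group,
                                const std::function<bool()>& kcache_failed,
                                const std::function<void()>& start_new_cf_block)
{
   while (true) {
      if (fill_group())
         return true; // something was scheduled: finalize this group

      if (!kcache_failed())
         return false; // not a kcache problem: truly nothing is ready

      // Every candidate was rejected only by kcache pressure. A fresh CF
      // block starts with all kcache sets free, so the retry can make
      // progress, and it must not tear an open LDS group apart (hence
      // the assert in the real code).
      start_new_cf_block();
   }
}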
@@ -652,6 +653,13 @@ bool BlockSheduler::schedule_alu_to_group_vec(AluGroup *group)
auto e = alu_vec_ready.end();
while (i != e) {
sfn_log << SfnLog::schedule << "Try schedule to vec " << **i;
if (!m_current_block->try_reserve_kcache(**i)) {
sfn_log << SfnLog::schedule << " failed (kcache)\n";
++i;
continue;
}
if (group->add_vec_instructions(*i)) {
auto old_i = i;
++i;
@@ -679,6 +687,12 @@ bool BlockSheduler::schedule_alu_to_group_trans(AluGroup *group, std::list<AluIn
auto e = readylist.end();
while (i != e) {
sfn_log << SfnLog::schedule << "Try schedule to trans " << **i;
if (!m_current_block->try_reserve_kcache(**i)) {
sfn_log << SfnLog::schedule << " failed (kcache)\n";
++i;
continue;
}
if (group->add_trans_instructions(*i)) {
auto old_i = i;
++i;
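
Both ready-list walks (schedule_alu_to_group_vec and schedule_alu_to_group_trans) now gate each candidate on a per-instruction kcache check, skipping it instead of giving up on the whole group. A generic sketch of that skip-and-continue iteration; the template parameters and signatures are assumptions for illustration, not the real ones:

#include <list>

struct AluInstr { /* opaque in this sketch */ };

// Walk the ready list; instructions whose constants cannot be locked in
// the current block's kcache stay on the list for a later CF block.
template <typename TryKcache, typename TryAdd>
bool schedule_from(std::list<AluInstr*>& ready, TryKcache try_kcache, TryAdd try_add)
{
   bool success = false;
   for (auto i = ready.begin(); i != ready.end();) {
      if (!try_kcache(**i)) {
         ++i; // kcache full for this one: leave it on the ready list
         continue;
      }
      if (try_add(*i)) {
         i = ready.erase(i); // placed into the group
         success = true;
      } else {
         ++i; // no free slot of the right type in this group
      }
   }
   return success;
}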