aco: Format.

Manually adjusted some comments for more intuitive line breaks.

Reviewed-by: Tony Wasserka <tony.wasserka@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11258>
Daniel Schürmann 2021-06-09 10:14:54 +02:00 committed by Marge Bot
parent 97ec360dc4
commit 1e2639026f
32 changed files with 7231 additions and 6574 deletions
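For context: this commit applies the new ACO clang-format style across the compiler. The configuration file itself is not part of this excerpt; below is a minimal sketch, inferred from the changes visible in the diff, of clang-format options that would produce this style. The concrete values and the invocation at the end are assumptions for illustration, not the MR's actual .clang-format.

    # Sketch only -- inferred from the diff, not the actual
    # config added by this MR.
    BasedOnStyle: LLVM
    ColumnLimit: 100                   # long conditions wrap near 100 columns
    IndentWidth: 3                     # ACO's 3-space indentation
    PointerAlignment: Left             # "Program* program", not "Program *program"
    AlwaysBreakAfterReturnType: TopLevelDefinitions  # return type on its own line
    AllowShortCaseLabelsOnASingleLine: true          # "case ...: inv = ...; break;"
    BreakBeforeTernaryOperators: true  # "? x" / ": y" continuation lines
    SpaceAfterCStyleCast: false        # "(uint16_t)sopp.imm"
    BreakBeforeBraces: Custom
    BraceWrapping:
      AfterFunction: true              # "{" of a function body on a new line

With such a config in the tree, the bulk of this diff corresponds to something like "clang-format -i src/amd/compiler/aco_*.cpp" (hypothetical invocation), with comment line breaks then adjusted by hand, as the commit message notes.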

src/amd/compiler/aco_assembler.cpp

@ -41,14 +41,15 @@ struct constaddr_info {
};
struct asm_context {
Program *program;
Program* program;
enum chip_class chip_class;
std::vector<std::pair<int, SOPP_instruction*>> branches;
std::map<unsigned, constaddr_info> constaddrs;
const int16_t* opcode;
// TODO: keep track of branch instructions referring blocks
// and, when emitting the block, correct the offset in instr
asm_context(Program* program_) : program(program_), chip_class(program->chip_class) {
asm_context(Program* program_) : program(program_), chip_class(program->chip_class)
{
if (chip_class <= GFX7)
opcode = &instr_info.opcode_gfx7[0];
else if (chip_class <= GFX9)
@ -60,7 +61,8 @@ struct asm_context {
int subvector_begin_pos = -1;
};
static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg)
static uint32_t
get_sdwa_sel(unsigned sel, PhysReg reg)
{
if (sel & sdwa_isra) {
unsigned size = sdwa_rasize & sel;
@ -72,7 +74,9 @@ static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg)
return sel & sdwa_asuint;
}
unsigned get_mimg_nsa_dwords(const Instruction *instr) {
unsigned
get_mimg_nsa_dwords(const Instruction* instr)
{
unsigned addr_dwords = instr->operands.size() - 3;
for (unsigned i = 1; i < addr_dwords; i++) {
if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4))
@ -81,7 +85,8 @@ unsigned get_mimg_nsa_dwords(const Instruction *instr) {
return 0;
}
void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)
void
emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)
{
/* lower remaining pseudo-instructions */
if (instr->opcode == aco_opcode::p_constaddr_getpc) {
@ -99,11 +104,11 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
uint32_t opcode = ctx.opcode[(int)instr->opcode];
if (opcode == (uint32_t)-1) {
char *outmem;
char* outmem;
size_t outsize;
struct u_memstream mem;
u_memstream_open(&mem, &outmem, &outsize);
FILE *const memf = u_memstream_get(&mem);
FILE* const memf = u_memstream_get(&mem);
fprintf(memf, "Unsupported opcode: ");
aco_print_instr(instr, memf);
@ -144,11 +149,11 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
uint32_t encoding = (0b1011 << 28);
encoding |= opcode << 23;
encoding |=
!instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) ?
instr->definitions[0].physReg() << 16 :
!instr->operands.empty() && instr->operands[0].physReg() <= 127 ?
instr->operands[0].physReg() << 16 : 0;
encoding |= !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc)
? instr->definitions[0].physReg() << 16
: !instr->operands.empty() && instr->operands[0].physReg() <= 127
? instr->operands[0].physReg() << 16
: 0;
encoding |= sopk.imm;
out.push_back(encoding);
break;
@ -177,7 +182,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
SOPP_instruction& sopp = instr->sopp();
uint32_t encoding = (0b101111111 << 23);
encoding |= opcode << 16;
encoding |= (uint16_t) sopp.imm;
encoding |= (uint16_t)sopp.imm;
if (sopp.block != -1) {
sopp.pass_flags = 0;
ctx.branches.emplace_back(out.size(), &sopp);
@ -208,7 +213,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
}
out.push_back(encoding);
/* SMRD instructions can take a literal on GFX7 */
if (instr->operands.size() >= 2 && instr->operands[1].isConstant() && instr->operands[1].constantValue() >= 1024)
if (instr->operands.size() >= 2 && instr->operands[1].isConstant() &&
instr->operands[1].constantValue() >= 1024)
out.push_back(instr->operands[1].constantValue() >> 2);
return;
}
@ -235,7 +241,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
}
if (is_load || instr->operands.size() >= 3) { /* SDATA */
encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg()) << 6;
encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg())
<< 6;
}
if (instr->operands.size() >= 1) { /* SBASE */
encoding |= instr->operands[0].physReg() >> 1;
@ -246,14 +253,16 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
int32_t offset = 0;
uint32_t soffset = ctx.chip_class >= GFX10
? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */
: 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on GFX8 and below) */
? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */
: 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on
GFX8 and below) */
if (instr->operands.size() >= 2) {
const Operand &op_off1 = instr->operands[1];
const Operand& op_off1 = instr->operands[1];
if (ctx.chip_class <= GFX9) {
offset = op_off1.isConstant() ? op_off1.constantValue() : op_off1.physReg();
} else {
/* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an SGPR */
/* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an
* SGPR */
if (op_off1.isConstant()) {
offset = op_off1.constantValue();
} else {
@ -263,8 +272,9 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
}
if (soe) {
const Operand &op_off2 = instr->operands.back();
assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant and an SGPR at the same time */
const Operand& op_off2 = instr->operands.back();
assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant
and an SGPR at the same time */
assert(!op_off2.isConstant());
soffset = op_off2.physReg();
}
@ -368,9 +378,13 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
encoding = 0;
unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0;
encoding |= (0xFF & reg) << 24;
reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0) ? instr->operands[2].physReg() : 0;
reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0)
? instr->operands[2].physReg()
: 0;
encoding |= (0xFF & reg) << 16;
reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0) ? instr->operands[1].physReg() : 0;
reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0)
? instr->operands[1].physReg()
: 0;
encoding |= (0xFF & reg) << 8;
encoding |= (0xFF & instr->operands[0].physReg());
out.push_back(encoding);
@ -402,7 +416,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
encoding |= instr->operands[2].physReg() << 24;
encoding |= (mubuf.tfe ? 1 : 0) << 23;
encoding |= (instr->operands[0].physReg() >> 2) << 16;
unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg();
unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg()
: instr->definitions[0].physReg();
encoding |= (0xFF & reg) << 8;
encoding |= (0xFF & instr->operands[1].physReg());
out.push_back(encoding);
@ -435,7 +450,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
encoding |= (mtbuf.tfe ? 1 : 0) << 23;
encoding |= (mtbuf.slc ? 1 : 0) << 22;
encoding |= (instr->operands[0].physReg() >> 2) << 16;
unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg();
unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg()
: instr->definitions[0].physReg();
encoding |= (0xFF & reg) << 8;
encoding |= (0xFF & instr->operands[1].physReg());
@ -465,7 +481,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
encoding |= mimg.a16 ? 1 << 15 : 0;
encoding |= mimg.da ? 1 << 14 : 0;
} else {
encoding |= mimg.r128 ? 1 << 15 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
encoding |= mimg.r128 ? 1 << 15
: 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
encoding |= nsa_dwords << 1;
encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */
encoding |= mimg.dlc ? 1 << 7 : 0;
@ -485,7 +502,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
assert(!mimg.d16 || ctx.chip_class >= GFX9);
encoding |= mimg.d16 ? 1 << 31 : 0;
if (ctx.chip_class >= GFX10) {
encoding |= mimg.a16 ? 1 << 30 : 0; /* GFX10: A16 still exists, but is in a different place */
/* GFX10: A16 still exists, but is in a different place */
encoding |= mimg.a16 ? 1 << 30 : 0;
}
out.push_back(encoding);
@ -539,7 +557,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
assert(ctx.chip_class >= GFX10 || instr->operands[1].physReg() != 0x7F);
assert(instr->format != Format::FLAT);
encoding |= instr->operands[1].physReg() << 16;
} else if (instr->format != Format::FLAT || ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */
} else if (instr->format != Format::FLAT ||
ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */
if (ctx.chip_class <= GFX9)
encoding |= 0x7F << 16;
else
@ -611,7 +630,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
}
encoding |= vop3.opsel << 11;
for (unsigned i = 0; i < 3; i++)
encoding |= vop3.abs[i] << (8+i);
encoding |= vop3.abs[i] << (8 + i);
if (instr->definitions.size() == 2)
encoding |= instr->definitions[1].physReg() << 8;
encoding |= (0xFF & instr->definitions[0].physReg());
@ -625,7 +644,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
}
encoding |= vop3.omod << 27;
for (unsigned i = 0; i < 3; i++)
encoding |= vop3.neg[i] << (29+i);
encoding |= vop3.neg[i] << (29 + i);
out.push_back(encoding);
} else if (instr->isVOP3P()) {
@ -645,7 +664,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
encoding |= vop3.opsel_lo << 11;
encoding |= ((vop3.opsel_hi & 0x4) ? 1 : 0) << 14;
for (unsigned i = 0; i < 3; i++)
encoding |= vop3.neg_hi[i] << (8+i);
encoding |= vop3.neg_hi[i] << (8 + i);
encoding |= (0xFF & instr->definitions[0].physReg());
out.push_back(encoding);
encoding = 0;
@ -653,17 +672,17 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
encoding |= instr->operands[i].physReg() << (i * 9);
encoding |= (vop3.opsel_hi & 0x3) << 27;
for (unsigned i = 0; i < 3; i++)
encoding |= vop3.neg_lo[i] << (29+i);
encoding |= vop3.neg_lo[i] << (29 + i);
out.push_back(encoding);
} else if (instr->isDPP()){
} else if (instr->isDPP()) {
assert(ctx.chip_class >= GFX8);
DPP_instruction& dpp = instr->dpp();
/* first emit the instruction without the DPP operand */
Operand dpp_op = instr->operands[0];
instr->operands[0] = Operand(PhysReg{250}, v1);
instr->format = (Format) ((uint16_t) instr->format & ~(uint16_t)Format::DPP);
instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP);
emit_instruction(ctx, out, instr);
uint32_t encoding = (0xF & dpp.row_mask) << 28;
encoding |= (0xF & dpp.bank_mask) << 24;
@ -684,7 +703,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
/* first emit the instruction without the SDWA operand */
Operand sdwa_op = instr->operands[0];
instr->operands[0] = Operand(PhysReg{249}, v1);
instr->format = (Format) ((uint16_t) instr->format & ~(uint16_t)Format::SDWA);
instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::SDWA);
emit_instruction(ctx, out, instr);
uint32_t encoding = 0;
@ -737,7 +756,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
}
}
void emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)
void
emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)
{
for (aco_ptr<Instruction>& instr : block.instructions) {
#if 0
@ -754,15 +774,15 @@ void emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)
}
}
void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
void
fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
{
bool exported = false;
for (Block& block : program->blocks) {
if (!(block.kind & block_kind_export_end))
continue;
std::vector<aco_ptr<Instruction>>::reverse_iterator it = block.instructions.rbegin();
while ( it != block.instructions.rend())
{
while (it != block.instructions.rend()) {
if ((*it)->isEXP()) {
Export_instruction& exp = (*it)->exp();
if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG) {
@ -785,15 +805,18 @@ void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
if (!exported) {
/* Abort in order to avoid a GPU hang. */
bool is_vertex_or_ngg = (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG);
aco_err(program, "Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment");
bool is_vertex_or_ngg =
(program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG);
aco_err(program,
"Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment");
aco_print_program(program, stderr);
abort();
}
}
static void insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned insert_before,
unsigned insert_count, const uint32_t *insert_data)
static void
insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned insert_before,
unsigned insert_count, const uint32_t* insert_data)
{
out.insert(out.begin() + insert_before, insert_data, insert_data + insert_count);
@ -804,9 +827,9 @@ static void insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned i
}
/* Find first branch after the inserted code */
auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), [insert_before](const auto &branch) -> bool {
return (unsigned)branch.first >= insert_before;
});
auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(),
[insert_before](const auto& branch) -> bool
{ return (unsigned)branch.first >= insert_before; });
/* Update the locations of branches */
for (; branch_it != ctx.branches.end(); ++branch_it)
@ -822,15 +845,21 @@ static void insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned i
}
}
static void fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
static void
fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
{
/* Branches with an offset of 0x3f are buggy on GFX10, we workaround by inserting NOPs if needed. */
/* Branches with an offset of 0x3f are buggy on GFX10,
* we workaround by inserting NOPs if needed.
*/
bool gfx10_3f_bug = false;
do {
auto buggy_branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), [&ctx](const auto &branch) -> bool {
return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) == 0x3f;
});
auto buggy_branch_it = std::find_if(
ctx.branches.begin(), ctx.branches.end(),
[&ctx](const auto& branch) -> bool {
return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) ==
0x3f;
});
gfx10_3f_bug = buggy_branch_it != ctx.branches.end();
@ -842,7 +871,9 @@ static void fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
} while (gfx10_3f_bug);
}
void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards, std::vector<uint32_t>& out)
void
emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards,
std::vector<uint32_t>& out)
{
Builder bld(ctx.program);
@ -857,26 +888,13 @@ void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards,
/* for conditional branches, skip the long jump if the condition is false */
aco_opcode inv;
switch (branch->opcode) {
case aco_opcode::s_cbranch_scc0:
inv = aco_opcode::s_cbranch_scc1;
break;
case aco_opcode::s_cbranch_scc1:
inv = aco_opcode::s_cbranch_scc0;
break;
case aco_opcode::s_cbranch_vccz:
inv = aco_opcode::s_cbranch_vccnz;
break;
case aco_opcode::s_cbranch_vccnz:
inv = aco_opcode::s_cbranch_vccz;
break;
case aco_opcode::s_cbranch_execz:
inv = aco_opcode::s_cbranch_execnz;
break;
case aco_opcode::s_cbranch_execnz:
inv = aco_opcode::s_cbranch_execz;
break;
default:
unreachable("Unhandled long jump.");
case aco_opcode::s_cbranch_scc0: inv = aco_opcode::s_cbranch_scc1; break;
case aco_opcode::s_cbranch_scc1: inv = aco_opcode::s_cbranch_scc0; break;
case aco_opcode::s_cbranch_vccz: inv = aco_opcode::s_cbranch_vccnz; break;
case aco_opcode::s_cbranch_vccnz: inv = aco_opcode::s_cbranch_vccz; break;
case aco_opcode::s_cbranch_execz: inv = aco_opcode::s_cbranch_execnz; break;
case aco_opcode::s_cbranch_execnz: inv = aco_opcode::s_cbranch_execz; break;
default: unreachable("Unhandled long jump.");
}
instr.reset(bld.sopp(inv, -1, 7));
emit_instruction(ctx, out, instr.get());
@ -891,7 +909,9 @@ void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards,
emit_instruction(ctx, out, instr.get());
branch->pass_flags = out.size();
instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi, Operand(backwards ? UINT32_MAX : 0u)).instr);
instr.reset(
bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi, Operand(backwards ? UINT32_MAX : 0u))
.instr);
emit_instruction(ctx, out, instr.get());
/* restore SCC and clear the LSB of the new PC */
@ -901,11 +921,13 @@ void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards,
emit_instruction(ctx, out, instr.get());
/* create the s_setpc_b64 to jump */
instr.reset(bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr);
instr.reset(
bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr);
emit_instruction(ctx, out, instr.get());
}
void fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
void
fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
{
bool repeat = false;
do {
@ -914,11 +936,12 @@ void fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
if (ctx.chip_class == GFX10)
fix_branches_gfx10(ctx, out);
for (std::pair<int, SOPP_instruction*> &branch : ctx.branches) {
for (std::pair<int, SOPP_instruction*>& branch : ctx.branches) {
int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1;
if ((offset < INT16_MIN || offset > INT16_MAX) && !branch.second->pass_flags) {
std::vector<uint32_t> long_jump;
bool backwards = ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first;
bool backwards =
ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first;
emit_long_jump(ctx, branch.second, backwards, long_jump);
out[branch.first] = long_jump[0];
@ -934,13 +957,14 @@ void fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
out[branch.first + branch.second->pass_flags - 1] = offset * 4;
} else {
out[branch.first] &= 0xffff0000u;
out[branch.first] |= (uint16_t) offset;
out[branch.first] |= (uint16_t)offset;
}
}
} while (repeat);
}
void fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
void
fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
{
for (auto& constaddr : ctx.constaddrs) {
constaddr_info& info = constaddr.second;
@ -948,13 +972,12 @@ void fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
}
}
unsigned emit_program(Program* program,
std::vector<uint32_t>& code)
unsigned
emit_program(Program* program, std::vector<uint32_t>& code)
{
asm_context ctx(program);
if (program->stage.hw == HWStage::VS ||
program->stage.hw == HWStage::FS ||
if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS ||
program->stage.hw == HWStage::NGG)
fix_exports(ctx, code, program);
@ -986,4 +1009,4 @@ unsigned emit_program(Program* program,
return exec_size;
}
}
} // namespace aco

src/amd/compiler/aco_dead_code_analysis.cpp

@ -40,7 +40,8 @@ struct dce_ctx {
std::vector<uint16_t> uses;
std::vector<std::vector<bool>> live;
dce_ctx(Program* program) : current_block(program->blocks.size() - 1), uses(program->peekAllocationId())
dce_ctx(Program* program)
: current_block(program->blocks.size() - 1), uses(program->peekAllocationId())
{
live.reserve(program->blocks.size());
for (Block& block : program->blocks)
@ -48,7 +49,8 @@ struct dce_ctx {
}
};
void process_block(dce_ctx& ctx, Block& block)
void
process_block(dce_ctx& ctx, Block& block)
{
std::vector<bool>& live = ctx.live[block.index];
assert(live.size() == block.instructions.size());
@ -72,23 +74,26 @@ void process_block(dce_ctx& ctx, Block& block)
if (process_predecessors) {
for (unsigned pred_idx : block.linear_preds)
ctx.current_block = std::max(ctx.current_block, (int) pred_idx);
ctx.current_block = std::max(ctx.current_block, (int)pred_idx);
}
}
} /* end namespace */
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr)
bool
is_dead(const std::vector<uint16_t>& uses, Instruction* instr)
{
if (instr->definitions.empty() || instr->isBranch())
return false;
if (std::any_of(instr->definitions.begin(), instr->definitions.end(),
[&uses] (const Definition& def) { return !def.isTemp() || uses[def.tempId()];}))
[&uses](const Definition& def) { return !def.isTemp() || uses[def.tempId()]; }))
return false;
return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel));
}
std::vector<uint16_t> dead_code_analysis(Program *program) {
std::vector<uint16_t>
dead_code_analysis(Program* program)
{
dce_ctx ctx(program);
@ -105,5 +110,4 @@ std::vector<uint16_t> dead_code_analysis(Program *program) {
return ctx.uses;
}
}
} // namespace aco

src/amd/compiler/aco_dominance.cpp

@ -38,7 +38,8 @@
namespace aco {
void dominator_tree(Program* program)
void
dominator_tree(Program* program)
{
program->blocks[0].logical_idom = 0;
program->blocks[0].linear_idom = 0;
@ -48,7 +49,7 @@ void dominator_tree(Program* program)
int new_logical_idom = -1;
int new_linear_idom = -1;
for (unsigned pred_idx : block.logical_preds) {
if ((int) program->blocks[pred_idx].logical_idom == -1)
if ((int)program->blocks[pred_idx].logical_idom == -1)
continue;
if (new_logical_idom == -1) {
@ -56,16 +57,16 @@ void dominator_tree(Program* program)
continue;
}
while ((int) pred_idx != new_logical_idom) {
if ((int) pred_idx > new_logical_idom)
while ((int)pred_idx != new_logical_idom) {
if ((int)pred_idx > new_logical_idom)
pred_idx = program->blocks[pred_idx].logical_idom;
if ((int) pred_idx < new_logical_idom)
if ((int)pred_idx < new_logical_idom)
new_logical_idom = program->blocks[new_logical_idom].logical_idom;
}
}
for (unsigned pred_idx : block.linear_preds) {
if ((int) program->blocks[pred_idx].linear_idom == -1)
if ((int)program->blocks[pred_idx].linear_idom == -1)
continue;
if (new_linear_idom == -1) {
@ -73,10 +74,10 @@ void dominator_tree(Program* program)
continue;
}
while ((int) pred_idx != new_linear_idom) {
if ((int) pred_idx > new_linear_idom)
while ((int)pred_idx != new_linear_idom) {
if ((int)pred_idx > new_linear_idom)
pred_idx = program->blocks[pred_idx].linear_idom;
if ((int) pred_idx < new_linear_idom)
if ((int)pred_idx < new_linear_idom)
new_linear_idom = program->blocks[new_linear_idom].linear_idom;
}
}
@ -86,5 +87,5 @@ void dominator_tree(Program* program)
}
}
}
} // namespace aco
#endif

src/amd/compiler/aco_form_hard_clauses.cpp

@ -31,15 +31,15 @@ namespace aco {
namespace {
/* there can also be LDS and VALU clauses, but I don't see how those are interesting */
enum clause_type
{
enum clause_type {
clause_vmem,
clause_flat,
clause_smem,
clause_other,
};
void emit_clause(Builder& bld, unsigned num_instrs, aco_ptr<Instruction> *instrs)
void
emit_clause(Builder& bld, unsigned num_instrs, aco_ptr<Instruction>* instrs)
{
unsigned start = 0;
@ -61,7 +61,8 @@ void emit_clause(Builder& bld, unsigned num_instrs, aco_ptr<Instruction> *instrs
} /* end namespace */
void form_hard_clauses(Program *program)
void
form_hard_clauses(Program* program)
{
for (Block& block : program->blocks) {
unsigned num_instrs = 0;
@ -77,7 +78,8 @@ void form_hard_clauses(Program *program)
clause_type type = clause_other;
if (instr->isVMEM() && !instr->operands.empty()) {
if (program->chip_class == GFX10 && instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0)
if (program->chip_class == GFX10 && instr->isMIMG() &&
get_mimg_nsa_dwords(instr.get()) > 0)
type = clause_other;
else
type = clause_vmem;
@ -109,4 +111,4 @@ void form_hard_clauses(Program *program)
block.instructions = std::move(new_instructions);
}
}
}
} // namespace aco

src/amd/compiler/aco_insert_NOPs.cpp

@ -34,12 +34,15 @@ namespace aco {
namespace {
struct NOP_ctx_gfx6 {
void join(const NOP_ctx_gfx6 &other) {
set_vskip_mode_then_vector = MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector);
void join(const NOP_ctx_gfx6& other)
{
set_vskip_mode_then_vector =
MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector);
valu_wr_vcc_then_vccz = MAX2(valu_wr_vcc_then_vccz, other.valu_wr_vcc_then_vccz);
valu_wr_exec_then_execz = MAX2(valu_wr_exec_then_execz, other.valu_wr_exec_then_execz);
valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas);
salu_wr_m0_then_gds_msg_ttrace = MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace);
salu_wr_m0_then_gds_msg_ttrace =
MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace);
valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp);
salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds);
salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel);
@ -53,23 +56,21 @@ struct NOP_ctx_gfx6 {
}
}
bool operator==(const NOP_ctx_gfx6 &other)
bool operator==(const NOP_ctx_gfx6& other)
{
return
set_vskip_mode_then_vector == other.set_vskip_mode_then_vector &&
valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz &&
valu_wr_exec_then_execz == other.valu_wr_exec_then_execz &&
valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas &&
vmem_store_then_wr_data == other.vmem_store_then_wr_data &&
salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace &&
valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp &&
salu_wr_m0_then_lds == other.salu_wr_m0_then_lds &&
salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel &&
setreg_then_getsetreg == other.setreg_then_getsetreg &&
smem_clause == other.smem_clause &&
smem_write == other.smem_write &&
BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) &&
BITSET_EQUAL(smem_clause_write, other.smem_clause_write);
return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector &&
valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz &&
valu_wr_exec_then_execz == other.valu_wr_exec_then_execz &&
valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas &&
vmem_store_then_wr_data == other.vmem_store_then_wr_data &&
salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace &&
valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp &&
salu_wr_m0_then_lds == other.salu_wr_m0_then_lds &&
salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel &&
setreg_then_getsetreg == other.setreg_then_getsetreg &&
smem_clause == other.smem_clause && smem_write == other.smem_write &&
BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) &&
BITSET_EQUAL(smem_clause_write, other.smem_clause_write);
}
void add_wait_states(unsigned amount)
@ -154,7 +155,8 @@ struct NOP_ctx_gfx10 {
std::bitset<128> sgprs_read_by_VMEM;
std::bitset<128> sgprs_read_by_SMEM;
void join(const NOP_ctx_gfx10 &other) {
void join(const NOP_ctx_gfx10& other)
{
has_VOPC |= other.has_VOPC;
has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
has_VMEM |= other.has_VMEM;
@ -167,23 +169,19 @@ struct NOP_ctx_gfx10 {
sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
}
bool operator==(const NOP_ctx_gfx10 &other)
bool operator==(const NOP_ctx_gfx10& other)
{
return
has_VOPC == other.has_VOPC &&
has_nonVALU_exec_read == other.has_nonVALU_exec_read &&
has_VMEM == other.has_VMEM &&
has_branch_after_VMEM == other.has_branch_after_VMEM &&
has_DS == other.has_DS &&
has_branch_after_DS == other.has_branch_after_DS &&
has_NSA_MIMG == other.has_NSA_MIMG &&
has_writelane == other.has_writelane &&
sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
return has_VOPC == other.has_VOPC && has_nonVALU_exec_read == other.has_nonVALU_exec_read &&
has_VMEM == other.has_VMEM && has_branch_after_VMEM == other.has_branch_after_VMEM &&
has_DS == other.has_DS && has_branch_after_DS == other.has_branch_after_DS &&
has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane &&
sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
}
};
int get_wait_states(aco_ptr<Instruction>& instr)
int
get_wait_states(aco_ptr<Instruction>& instr)
{
if (instr->opcode == aco_opcode::s_nop)
return instr->sopp().imm + 1;
@ -193,16 +191,16 @@ int get_wait_states(aco_ptr<Instruction>& instr)
return 1;
}
bool regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
bool
regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
{
return a_reg > b_reg ?
(a_reg - b_reg < b_size) :
(b_reg - a_reg < a_size);
return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size);
}
template <bool Valu, bool Vintrp, bool Salu>
int handle_raw_hazard_internal(Program *program, Block *block,
int nops_needed, PhysReg reg, uint32_t mask)
int
handle_raw_hazard_internal(Program* program, Block* block, int nops_needed, PhysReg reg,
uint32_t mask)
{
unsigned mask_size = util_last_bit(mask);
for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) {
@ -217,10 +215,8 @@ int handle_raw_hazard_internal(Program *program, Block *block,
}
}
bool is_hazard = writemask != 0 &&
((pred->isVALU() && Valu) ||
(pred->isVINTRP() && Vintrp) ||
(pred->isSALU() && Salu));
bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) ||
(pred->isVINTRP() && Vintrp) || (pred->isSALU() && Salu));
if (is_hazard)
return nops_needed;
@ -238,17 +234,19 @@ int handle_raw_hazard_internal(Program *program, Block *block,
* huge value. */
for (unsigned lin_pred : block->linear_preds) {
res = std::max(res, handle_raw_hazard_internal<Valu, Vintrp, Salu>(
program, &program->blocks[lin_pred], nops_needed, reg, mask));
program, &program->blocks[lin_pred], nops_needed, reg, mask));
}
return res;
}
template <bool Valu, bool Vintrp, bool Salu>
void handle_raw_hazard(Program *program, Block *cur_block, int *NOPs, int min_states, Operand op)
void
handle_raw_hazard(Program* program, Block* cur_block, int* NOPs, int min_states, Operand op)
{
if (*NOPs >= min_states)
return;
int res = handle_raw_hazard_internal<Valu, Vintrp, Salu>(program, cur_block, min_states, op.physReg(), u_bit_consecutive(0, op.size()));
int res = handle_raw_hazard_internal<Valu, Vintrp, Salu>(
program, cur_block, min_states, op.physReg(), u_bit_consecutive(0, op.size()));
*NOPs = MAX2(*NOPs, res);
}
@ -256,7 +254,9 @@ static auto handle_valu_then_read_hazard = handle_raw_hazard<true, true, false>;
static auto handle_vintrp_then_read_hazard = handle_raw_hazard<false, true, false>;
static auto handle_valu_salu_then_read_hazard = handle_raw_hazard<true, true, true>;
void set_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
void
set_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
{
unsigned end = start + size - 1;
unsigned start_mod = start % BITSET_WORDBITS;
if (start_mod + size <= BITSET_WORDBITS) {
@ -268,7 +268,9 @@ void set_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
}
}
bool test_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
bool
test_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
{
unsigned end = start + size - 1;
unsigned start_mod = start % BITSET_WORDBITS;
if (start_mod + size <= BITSET_WORDBITS) {
@ -291,18 +293,21 @@ bool test_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
*
* SMEM clauses are only present on GFX8+, and only matter when XNACK is set.
*/
void handle_smem_clause_hazards(Program *program, NOP_ctx_gfx6 &ctx,
aco_ptr<Instruction>& instr, int *NOPs)
void
handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
int* NOPs)
{
/* break off from previous SMEM clause if needed */
if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) {
/* Don't allow clauses with store instructions since the clause's
* instructions may use the same address. */
if (ctx.smem_write || instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
if (ctx.smem_write || instr->definitions.empty() ||
instr_info.is_atomic[(unsigned)instr->opcode]) {
*NOPs = 1;
} else if (program->dev.xnack_enabled) {
for (Operand op : instr->operands) {
if (!op.isConstant() && test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
if (!op.isConstant() &&
test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
*NOPs = 1;
break;
}
@ -316,8 +321,10 @@ void handle_smem_clause_hazards(Program *program, NOP_ctx_gfx6 &ctx,
}
/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &ctx,
aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& new_instructions)
void
handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
aco_ptr<Instruction>& instr,
std::vector<aco_ptr<Instruction>>& new_instructions)
{
/* check hazards */
int NOPs = 0;
@ -343,14 +350,17 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
handle_smem_clause_hazards(program, ctx, instr, &NOPs);
} else if (instr->isSALU()) {
if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
if (instr->opcode == aco_opcode::s_setreg_b32 ||
instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
instr->opcode == aco_opcode::s_getreg_b32) {
NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
}
if (program->chip_class == GFX9) {
if (instr->opcode == aco_opcode::s_movrels_b32 || instr->opcode == aco_opcode::s_movrels_b64 ||
instr->opcode == aco_opcode::s_movreld_b32 || instr->opcode == aco_opcode::s_movreld_b64) {
if (instr->opcode == aco_opcode::s_movrels_b32 ||
instr->opcode == aco_opcode::s_movrels_b64 ||
instr->opcode == aco_opcode::s_movreld_b32 ||
instr->opcode == aco_opcode::s_movreld_b64) {
NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
}
}
@ -398,7 +408,8 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
handle_vintrp_then_read_hazard(program, cur_block, &NOPs, 1, instr->operands[0]);
}
if (instr->opcode == aco_opcode::v_div_fmas_f32 || instr->opcode == aco_opcode::v_div_fmas_f64)
if (instr->opcode == aco_opcode::v_div_fmas_f32 ||
instr->opcode == aco_opcode::v_div_fmas_f64)
NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);
} else if (instr->isVMEM() || instr->isFlatLike()) {
/* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
@ -412,13 +423,11 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);
if (program->chip_class == GFX9) {
bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) &&
instr->flatlike().lds;
if (instr->isVINTRP() ||
bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds;
if (instr->isVINTRP() || lds_scratch_global ||
instr->opcode == aco_opcode::ds_read_addtid_b32 ||
instr->opcode == aco_opcode::ds_write_addtid_b32 ||
instr->opcode == aco_opcode::buffer_store_lds_dword ||
lds_scratch_global) {
instr->opcode == aco_opcode::buffer_store_lds_dword) {
NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);
}
}
@ -428,7 +437,8 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
// TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
if (NOPs) {
/* create NOP */
aco_ptr<SOPP_instruction> nop{create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
aco_ptr<SOPP_instruction> nop{
create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
nop->imm = NOPs - 1;
nop->block = -1;
new_instructions.emplace_back(std::move(nop));
@ -485,7 +495,8 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
ctx.salu_wr_m0_then_lds = 1;
ctx.salu_wr_m0_then_moverel = 1;
}
} else if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32) {
} else if (instr->opcode == aco_opcode::s_setreg_b32 ||
instr->opcode == aco_opcode::s_setreg_imm32_b32) {
SOPK_instruction& sopk = instr->sopk();
unsigned offset = (sopk.imm >> 6) & 0x1f;
unsigned size = ((sopk.imm >> 11) & 0x1f) + 1;
@ -497,19 +508,16 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
}
} else if (instr->isVMEM() || instr->isFlatLike()) {
/* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) &&
instr->operands.size() == 4 &&
instr->operands[3].size() > 2 &&
instr->operands[2].physReg() >= 128;
/* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit store) */
bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 &&
instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128;
/* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit
* store) */
bool consider_mimg = instr->isMIMG() &&
instr->operands[1].regClass().type() == RegType::vgpr &&
instr->operands[1].size() > 2 &&
instr->operands[0].size() == 4;
instr->operands[1].size() > 2 && instr->operands[0].size() == 4;
/* FLAT/GLOBAL/SCRATCH store with >64-bit data */
bool consider_flat = instr->isFlatLike() &&
instr->operands.size() == 3 &&
instr->operands[2].size() > 2;
bool consider_flat =
instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2;
if (consider_buf || consider_mimg || consider_flat) {
PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg();
unsigned size = instr->operands[consider_flat ? 2 : 3].size();
@ -520,22 +528,26 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
}
template <std::size_t N>
bool check_written_regs(const aco_ptr<Instruction> &instr, const std::bitset<N> &check_regs)
bool
check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
{
return std::any_of(instr->definitions.begin(), instr->definitions.end(), [&check_regs](const Definition &def) -> bool {
bool writes_any = false;
for (unsigned i = 0; i < def.size(); i++) {
unsigned def_reg = def.physReg() + i;
writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
}
return writes_any;
});
return std::any_of(instr->definitions.begin(), instr->definitions.end(),
[&check_regs](const Definition& def) -> bool
{
bool writes_any = false;
for (unsigned i = 0; i < def.size(); i++) {
unsigned def_reg = def.physReg() + i;
writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
}
return writes_any;
});
}
template <std::size_t N>
void mark_read_regs(const aco_ptr<Instruction> &instr, std::bitset<N> &reg_reads)
void
mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
{
for (const Operand &op : instr->operands) {
for (const Operand& op : instr->operands) {
for (unsigned i = 0; i < op.size(); i++) {
unsigned reg = op.physReg() + i;
if (reg < reg_reads.size())
@ -544,7 +556,8 @@ void mark_read_regs(const aco_ptr<Instruction> &instr, std::bitset<N> &reg_reads
}
}
bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
bool
VALU_writes_sgpr(aco_ptr<Instruction>& instr)
{
if (instr->isVOPC())
return true;
@ -557,24 +570,26 @@ bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
return false;
}
bool instr_writes_exec(const aco_ptr<Instruction>& instr)
bool
instr_writes_exec(const aco_ptr<Instruction>& instr)
{
return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool {
return def.physReg() == exec_lo || def.physReg() == exec_hi;
});
return std::any_of(instr->definitions.begin(), instr->definitions.end(),
[](const Definition& def) -> bool
{ return def.physReg() == exec_lo || def.physReg() == exec_hi; });
}
bool instr_writes_sgpr(const aco_ptr<Instruction>& instr)
bool
instr_writes_sgpr(const aco_ptr<Instruction>& instr)
{
return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool {
return def.getTemp().type() == RegType::sgpr;
});
return std::any_of(instr->definitions.begin(), instr->definitions.end(),
[](const Definition& def) -> bool
{ return def.getTemp().type() == RegType::sgpr; });
}
inline bool instr_is_branch(const aco_ptr<Instruction>& instr)
inline bool
instr_is_branch(const aco_ptr<Instruction>& instr)
{
return instr->opcode == aco_opcode::s_branch ||
instr->opcode == aco_opcode::s_cbranch_scc0 ||
return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 ||
instr->opcode == aco_opcode::s_cbranch_scc1 ||
instr->opcode == aco_opcode::s_cbranch_vccz ||
instr->opcode == aco_opcode::s_cbranch_vccnz ||
@ -586,19 +601,20 @@ inline bool instr_is_branch(const aco_ptr<Instruction>& instr)
instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user ||
instr->opcode == aco_opcode::s_subvector_loop_begin ||
instr->opcode == aco_opcode::s_subvector_loop_end ||
instr->opcode == aco_opcode::s_setpc_b64 ||
instr->opcode == aco_opcode::s_swappc_b64 ||
instr->opcode == aco_opcode::s_getpc_b64 ||
instr->opcode == aco_opcode::s_call_b64;
instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 ||
instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64;
}
void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 &ctx,
aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& new_instructions)
void
handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx,
aco_ptr<Instruction>& instr,
std::vector<aco_ptr<Instruction>>& new_instructions)
{
//TODO: s_dcache_inv needs to be in it's own group on GFX10
// TODO: s_dcache_inv needs to be in it's own group on GFX10
/* VMEMtoScalarWriteHazard
* Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between.
* Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)"
* in-between.
*/
if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) {
/* Remember all SGPRs that are read by the VMEM instruction */
@ -624,7 +640,8 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
ctx.sgprs_read_by_VMEM.reset();
/* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
aco_ptr<SOPP_instruction> depctr{create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
aco_ptr<SOPP_instruction> depctr{
create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
depctr->imm = 0xffe3;
depctr->block = -1;
new_instructions.emplace_back(std::move(depctr));
@ -639,13 +656,13 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
*/
if (instr->isVOPC()) {
ctx.has_VOPC = true;
} else if (ctx.has_VOPC &&
(instr->opcode == aco_opcode::v_permlane16_b32 ||
instr->opcode == aco_opcode::v_permlanex16_b32)) {
} else if (ctx.has_VOPC && (instr->opcode == aco_opcode::v_permlane16_b32 ||
instr->opcode == aco_opcode::v_permlanex16_b32)) {
ctx.has_VOPC = false;
/* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
aco_ptr<VOP1_instruction> v_mov{create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)};
aco_ptr<VOP1_instruction> v_mov{
create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)};
v_mov->definitions[0] = Definition(instr->operands[0].physReg(), v1);
v_mov->operands[0] = Operand(instr->operands[0].physReg(), v1);
new_instructions.emplace_back(std::move(v_mov));
@ -663,7 +680,8 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
ctx.has_nonVALU_exec_read = false;
/* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
aco_ptr<SOPP_instruction> depctr{create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
aco_ptr<SOPP_instruction> depctr{
create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
depctr->imm = 0xfffe;
depctr->block = -1;
new_instructions.emplace_back(std::move(depctr));
@ -689,7 +707,8 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
ctx.sgprs_read_by_SMEM.reset();
/* Insert s_mov to mitigate the problem */
aco_ptr<SOP1_instruction> s_mov{create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)};
aco_ptr<SOP1_instruction> s_mov{
create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)};
s_mov->definitions[0] = Definition(sgpr_null, s1);
s_mov->operands[0] = Operand(0u);
new_instructions.emplace_back(std::move(s_mov));
@ -738,14 +757,16 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
/* Insert s_waitcnt_vscnt to mitigate the problem */
aco_ptr<SOPK_instruction> wait{create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)};
aco_ptr<SOPK_instruction> wait{
create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)};
wait->definitions[0] = Definition(sgpr_null, s1);
wait->imm = 0;
new_instructions.emplace_back(std::move(wait));
}
/* NSAToVMEMBug
* Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] != 0).
* Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] !=
* 0).
*/
if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) {
ctx.has_NSA_MIMG = true;
@ -772,11 +793,12 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
}
template <typename Ctx>
using HandleInstr = void (*)(Program *, Block *block, Ctx&, aco_ptr<Instruction>&,
using HandleInstr = void (*)(Program*, Block* block, Ctx&, aco_ptr<Instruction>&,
std::vector<aco_ptr<Instruction>>&);
template <typename Ctx, HandleInstr<Ctx> Handle>
void handle_block(Program *program, Ctx& ctx, Block& block)
void
handle_block(Program* program, Ctx& ctx, Block& block)
{
if (block.instructions.empty())
return;
@ -793,14 +815,15 @@ void handle_block(Program *program, Ctx& ctx, Block& block)
}
template <typename Ctx, HandleInstr<Ctx> Handle>
void mitigate_hazards(Program *program)
void
mitigate_hazards(Program* program)
{
std::vector<Ctx> all_ctx(program->blocks.size());
std::stack<unsigned> loop_header_indices;
for (unsigned i = 0; i < program->blocks.size(); i++) {
Block& block = program->blocks[i];
Ctx &ctx = all_ctx[i];
Ctx& ctx = all_ctx[i];
if (block.kind & block_kind_loop_header) {
loop_header_indices.push(i);
@ -832,7 +855,8 @@ void mitigate_hazards(Program *program)
} /* end namespace */
void insert_NOPs(Program* program)
void
insert_NOPs(Program* program)
{
if (program->chip_class >= GFX10_3)
; /* no hazards/bugs to mitigate */
@ -842,4 +866,4 @@ void insert_NOPs(Program* program)
mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6>(program);
}
}
} // namespace aco

src/amd/compiler/aco_insert_exec_mask.cpp

@ -24,6 +24,7 @@
#include "aco_builder.h"
#include "aco_ir.h"
#include "util/u_math.h"
#include <set>
@ -55,10 +56,9 @@ struct wqm_ctx {
std::vector<uint16_t> defined_in;
std::vector<bool> needs_wqm;
std::vector<bool> branch_wqm; /* true if the branch condition in this block should be in wqm */
wqm_ctx(Program* program_) : program(program_),
defined_in(program->peekAllocationId(), 0xFFFF),
needs_wqm(program->peekAllocationId()),
branch_wqm(program->blocks.size())
wqm_ctx(Program* program_)
: program(program_), defined_in(program->peekAllocationId(), 0xFFFF),
needs_wqm(program->peekAllocationId()), branch_wqm(program->blocks.size())
{
for (unsigned i = 0; i < program->blocks.size(); i++)
worklist.insert(i);
@ -72,13 +72,15 @@ struct loop_info {
bool has_divergent_break;
bool has_divergent_continue;
bool has_discard; /* has a discard or demote */
loop_info(Block* b, uint16_t num, uint8_t needs_, bool breaks, bool cont, bool discard) :
loop_header(b), num_exec_masks(num), needs(needs_), has_divergent_break(breaks),
has_divergent_continue(cont), has_discard(discard) {}
loop_info(Block* b, uint16_t num, uint8_t needs_, bool breaks, bool cont, bool discard)
: loop_header(b), num_exec_masks(num), needs(needs_), has_divergent_break(breaks),
has_divergent_continue(cont), has_discard(discard)
{}
};
struct block_info {
std::vector<std::pair<Operand, uint8_t>> exec; /* Vector of exec masks. Either a temporary or const -1. */
std::vector<std::pair<Operand, uint8_t>>
exec; /* Vector of exec masks. Either a temporary or const -1. */
std::vector<WQMState> instr_needs;
uint8_t block_needs;
uint8_t ever_again_needs;
@ -87,14 +89,16 @@ struct block_info {
};
struct exec_ctx {
Program *program;
Program* program;
std::vector<block_info> info;
std::vector<loop_info> loop;
bool handle_wqm = false;
exec_ctx(Program *program_) : program(program_), info(program->blocks.size()) {}
exec_ctx(Program* program_) : program(program_), info(program->blocks.size()) {}
};
bool needs_exact(aco_ptr<Instruction>& instr) {
bool
needs_exact(aco_ptr<Instruction>& instr)
{
if (instr->isMUBUF()) {
return instr->mubuf().disable_wqm;
} else if (instr->isMTBUF()) {
@ -108,7 +112,8 @@ bool needs_exact(aco_ptr<Instruction>& instr) {
}
}
void set_needs_wqm(wqm_ctx &ctx, Temp tmp)
void
set_needs_wqm(wqm_ctx& ctx, Temp tmp)
{
if (!ctx.needs_wqm[tmp.id()]) {
ctx.needs_wqm[tmp.id()] = true;
@ -117,7 +122,8 @@ void set_needs_wqm(wqm_ctx &ctx, Temp tmp)
}
}
void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx)
void
mark_block_wqm(wqm_ctx& ctx, unsigned block_idx)
{
if (ctx.branch_wqm[block_idx])
return;
@ -136,7 +142,8 @@ void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx)
mark_block_wqm(ctx, pred_idx);
}
void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
void
get_block_needs(wqm_ctx& ctx, exec_ctx& exec_ctx, Block* block)
{
block_info& info = exec_ctx.info[block->index];
@ -146,8 +153,8 @@ void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
aco_ptr<Instruction>& instr = block->instructions[i];
WQMState needs = needs_exact(instr) ? Exact : Unspecified;
bool propagate_wqm = instr->opcode == aco_opcode::p_wqm ||
instr->opcode == aco_opcode::p_as_uniform;
bool propagate_wqm =
instr->opcode == aco_opcode::p_wqm || instr->opcode == aco_opcode::p_as_uniform;
bool preserve_wqm = instr->opcode == aco_opcode::p_discard_if;
bool pred_by_exec = needs_exec_mask(instr.get());
for (const Definition& definition : instr->definitions) {
@ -214,7 +221,8 @@ void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
* breaks, which might benefit from being in exact) by adding Exact_Branch to a
* divergent branch surrounding the nested loop, if such a branch exists.
*/
void handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
void
handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
{
for (unsigned idx = preheader + 1; idx < exec_ctx.program->blocks.size(); idx++) {
Block& block = exec_ctx.program->blocks[idx];
@ -231,7 +239,8 @@ void handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
* ensure that the exact exec mask is not empty by adding Exact_Branch to
* the outer divergent branch.
*/
void handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
void
handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
{
assert(exec_ctx.program->blocks[preheader + 1].kind & block_kind_loop_header);
@ -265,7 +274,8 @@ void handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
}
}
void calculate_wqm_needs(exec_ctx& exec_ctx)
void
calculate_wqm_needs(exec_ctx& exec_ctx)
{
wqm_ctx ctx(exec_ctx.program);
@ -307,14 +317,12 @@ void calculate_wqm_needs(exec_ctx& exec_ctx)
exec_ctx.info[i].block_needs |= Exact;
/* if discard is used somewhere in nested CF, we need to preserve the WQM mask */
if ((block.kind & block_kind_discard ||
block.kind & block_kind_uses_discard_if) &&
if ((block.kind & block_kind_discard || block.kind & block_kind_uses_discard_if) &&
ever_again_needs & WQM)
exec_ctx.info[i].block_needs |= Preserve_WQM;
ever_again_needs |= exec_ctx.info[i].block_needs & ~Exact_Branch;
if (block.kind & block_kind_discard ||
block.kind & block_kind_uses_discard_if ||
if (block.kind & block_kind_discard || block.kind & block_kind_uses_discard_if ||
block.kind & block_kind_uses_demote)
ever_again_needs |= Exact;
@ -327,7 +335,8 @@ void calculate_wqm_needs(exec_ctx& exec_ctx)
exec_ctx.handle_wqm = true;
}
Operand get_exec_op(Operand t)
Operand
get_exec_op(Operand t)
{
if (t.isUndefined())
return Operand(exec, t.regClass());
@ -335,7 +344,8 @@ Operand get_exec_op(Operand t)
return t;
}
void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
void
transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
{
if (ctx.info[idx].exec.back().second & mask_type_wqm)
return;
@ -346,7 +356,8 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
ctx.info[idx].exec.back().first = exec_mask;
}
exec_mask = bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), get_exec_op(exec_mask));
exec_mask = bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc),
get_exec_op(exec_mask));
ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
return;
}
@ -355,11 +366,12 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
assert(ctx.info[idx].exec.back().second & mask_type_wqm);
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
assert(ctx.info[idx].exec.back().first.isTemp());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
ctx.info[idx].exec.back().first);
ctx.info[idx].exec.back().first = bld.pseudo(
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
}
void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
void
transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
{
if (ctx.info[idx].exec.back().second & mask_type_exact)
return;
@ -372,8 +384,8 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
assert(ctx.info[idx].exec.back().second & mask_type_exact);
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
assert(ctx.info[idx].exec.back().first.isTemp());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
ctx.info[idx].exec.back().first);
ctx.info[idx].exec.back().first = bld.pseudo(
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
return;
}
/* otherwise, we create an exact mask and push to the stack */
@ -382,14 +394,15 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
Definition(exec, bld.lm), ctx.info[idx].exec[0].first, Operand(exec, bld.lm));
} else {
bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc), ctx.info[idx].exec[0].first, wqm);
bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc),
ctx.info[idx].exec[0].first, wqm);
}
ctx.info[idx].exec.back().first = Operand(wqm);
ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type_exact);
}
unsigned add_coupling_code(exec_ctx& ctx, Block* block,
std::vector<aco_ptr<Instruction>>& instructions)
unsigned
add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>& instructions)
{
unsigned idx = block->index;
Builder bld(ctx.program, &instructions);
@ -417,7 +430,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
} else {
uint8_t mask = mask_type_global;
if (ctx.program->needs_wqm) {
bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), Operand(exec, bld.lm));
bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc),
Operand(exec, bld.lm));
mask |= mask_type_wqm;
} else {
mask |= mask_type_exact;
@ -440,7 +454,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
if (info.has_discard) {
aco_ptr<Pseudo_instruction> phi;
for (int i = 0; i < info.num_exec_masks - 1; i++) {
phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi,
Format::PSEUDO, preds.size(), 1));
phi->definitions[0] = bld.def(bld.lm);
phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[i].first);
ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
@ -450,14 +465,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
/* create ssa name for restore mask */
if (info.has_divergent_break) {
/* this phi might be trivial but ensures a parallelcopy on the loop header */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
phi->definitions[0] = bld.def(bld.lm);
phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
}
/* create ssa name for loop active mask */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
if (info.has_divergent_continue)
phi->definitions[0] = bld.def(bld.lm);
else
@ -466,7 +483,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
Temp loop_active = bld.insert(std::move(phi));
if (info.has_divergent_break) {
uint8_t mask_type = (ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop;
uint8_t mask_type =
(ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop;
ctx.info[idx].exec.emplace_back(loop_active, mask_type);
} else {
ctx.info[idx].exec.back().first = Operand(loop_active);
@ -482,8 +500,10 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
}
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
ctx.info[idx].exec.back().first), mask_type);
ctx.info[idx].exec.emplace_back(
bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
ctx.info[idx].exec.back().first),
mask_type);
}
return i;
@ -514,14 +534,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
aco_ptr<Instruction>& phi = header->instructions[instr_idx++];
assert(phi->opcode == aco_opcode::p_linear_phi);
for (unsigned i = 1; i < phi->operands.size(); i++)
phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first);
phi->operands[i] =
get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first);
}
if (info.has_divergent_break) {
aco_ptr<Instruction>& phi = header->instructions[instr_idx];
assert(phi->opcode == aco_opcode::p_linear_phi);
for (unsigned i = 1; i < phi->operands.size(); i++)
phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
phi->operands[i] =
get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
}
assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2);
@ -541,7 +563,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
ctx.info[idx].exec.emplace_back(same, type);
} else {
/* create phi for loop footer */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
phi->definitions[0] = bld.def(bld.lm);
if (exec_idx == info.num_exec_masks - 1u) {
phi->definitions[0] = Definition(exec, bld.lm);
@ -578,8 +601,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
if (get_exec_op(ctx.info[idx].exec.back().first).isTemp()) {
/* move current exec mask into exec register */
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
ctx.info[idx].exec.back().first);
ctx.info[idx].exec.back().first = bld.pseudo(
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
}
ctx.loop.pop_back();
@ -591,8 +614,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
} else {
assert(preds.size() == 2);
/* if one of the predecessors ends in exact mask, we pop it from stack */
unsigned num_exec_masks = std::min(ctx.info[preds[0]].exec.size(),
ctx.info[preds[1]].exec.size());
unsigned num_exec_masks =
std::min(ctx.info[preds[0]].exec.size(), ctx.info[preds[1]].exec.size());
if (block->kind & block_kind_merge)
num_exec_masks--;
@ -605,14 +628,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
if (ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) {
Operand t = ctx.info[preds[0]].exec[i].first;
/* discard/demote can change the state of the current exec mask */
assert(!t.isTemp() || ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second);
assert(!t.isTemp() ||
ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second);
uint8_t mask = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
ctx.info[idx].exec.emplace_back(t, mask);
continue;
}
bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge);
Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? Definition(exec, bld.lm) : bld.def(bld.lm),
Temp phi = bld.pseudo(aco_opcode::p_linear_phi,
in_exec ? Definition(exec, bld.lm) : bld.def(bld.lm),
get_exec_op(ctx.info[preds[0]].exec[i].first),
get_exec_op(ctx.info[preds[1]].exec[i].first));
uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
@ -654,9 +679,9 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
return i;
}
void process_instructions(exec_ctx& ctx, Block* block,
std::vector<aco_ptr<Instruction>>& instructions,
unsigned idx)
void
process_instructions(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>& instructions,
unsigned idx)
{
WQMState state;
if (ctx.info[block->index].exec.back().second & mask_type_wqm)
@ -667,17 +692,16 @@ void process_instructions(exec_ctx& ctx, Block* block,
}
/* if the block doesn't need both WQM and Exact, we can skip processing the instructions */
bool process = (ctx.handle_wqm &&
(ctx.info[block->index].block_needs & state) !=
(ctx.info[block->index].block_needs & (WQM | Exact))) ||
bool process = (ctx.handle_wqm && (ctx.info[block->index].block_needs & state) !=
(ctx.info[block->index].block_needs & (WQM | Exact))) ||
block->kind & block_kind_uses_discard_if ||
block->kind & block_kind_uses_demote ||
block->kind & block_kind_needs_lowering;
block->kind & block_kind_uses_demote || block->kind & block_kind_needs_lowering;
if (!process) {
std::vector<aco_ptr<Instruction>>::iterator it = std::next(block->instructions.begin(), idx);
instructions.insert(instructions.end(),
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(it),
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(block->instructions.end()));
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(
block->instructions.end()));
return;
}
@ -700,11 +724,13 @@ void process_instructions(exec_ctx& ctx, Block* block,
/* discard from current exec */
const Operand cond = instr->operands[0];
Temp exit_cond = bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc),
Operand(exec, bld.lm), cond).def(1).getTemp();
Operand(exec, bld.lm), cond)
.def(1)
.getTemp();
/* discard from inner to outer exec mask on stack */
for (int i = num - 2; i >= 0; i--) {
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
ctx.info[block->index].exec[i].first = Operand(andn2->definitions[0].getTemp());
exit_cond = andn2->definitions[1].getTemp();
@ -726,14 +752,16 @@ void process_instructions(exec_ctx& ctx, Block* block,
Definition dst = instr->definitions[0];
assert(dst.size() == bld.lm.size());
if (state == Exact) {
instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1));
instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov),
Format::SOP1, 1, 1));
instr->operands[0] = Operand(0u);
instr->definitions[0] = dst;
} else {
std::pair<Operand, uint8_t>& exact_mask = ctx.info[block->index].exec[0];
assert(exact_mask.second & mask_type_exact);
instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2));
instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2),
Format::SOP2, 2, 2));
instr->operands[0] = Operand(exec, bld.lm); /* current exec */
instr->operands[1] = Operand(exact_mask.first);
instr->definitions[0] = dst;
@ -741,7 +769,8 @@ void process_instructions(exec_ctx& ctx, Block* block,
}
} else if (instr->opcode == aco_opcode::p_demote_to_helper) {
/* turn demote into discard_if with only exact masks */
assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) == (mask_type_exact | mask_type_global));
assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) ==
(mask_type_exact | mask_type_global));
int num;
Temp cond, exit_cond;
@ -749,8 +778,9 @@ void process_instructions(exec_ctx& ctx, Block* block,
assert(instr->operands[0].constantValue() == -1u);
/* transition to exact and set exec to zero */
exit_cond = bld.tmp(s1);
cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)),
Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm));
cond =
bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)),
Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm));
num = ctx.info[block->index].exec.size() - 2;
if (!(ctx.info[block->index].exec.back().second & mask_type_exact)) {
@ -767,7 +797,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
for (int i = num; i >= 0; i--) {
if (ctx.info[block->index].exec[i].second & mask_type_exact) {
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
if (i == (int)ctx.info[block->index].exec.size() - 1) {
andn2->operands[0] = Operand(exec, bld.lm);
@ -783,14 +813,14 @@ void process_instructions(exec_ctx& ctx, Block* block,
instr->opcode = aco_opcode::p_exit_early_if;
instr->operands[0] = bld.scc(exit_cond);
state = Exact;
}
bld.insert(std::move(instr));
}
}
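
The p_discard_if lowering above strips the discarded lanes not only from exec but from every mask saved on the stack, via a chain of s_andn2, and the final scc result is used to exit early. The same bookkeeping in miniature, with plain 64-bit lane masks standing in for ACO operands (container and names are mine):

#include <cstdint>
#include <cstdio>
#include <vector>

/* each entry mirrors one level of the exec-mask stack; discarded lanes
 * are removed from every level, innermost to outermost */
void
discard_from_stack(std::vector<uint64_t>& exec_stack, uint64_t discarded)
{
   for (int i = (int)exec_stack.size() - 1; i >= 0; i--)
      exec_stack[i] &= ~discarded; /* the s_andn2 step */
}

int
main()
{
   std::vector<uint64_t> stack = {~0ull /* global */, 0xffffffffull /* current */};
   discard_from_stack(stack, 0xff00ffull);
   printf("%016llx %016llx\n", (unsigned long long)stack[0],
          (unsigned long long)stack[1]); /* ffffffffff00ff00 00000000ff00ff00 */
}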
void add_branch_code(exec_ctx& ctx, Block* block)
void
add_branch_code(exec_ctx& ctx, Block* block)
{
unsigned idx = block->index;
Builder bld(ctx.program, block);
@ -806,8 +836,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
}
assert(ctx.info[idx].exec.size() <= 2);
if (ctx.info[idx].ever_again_needs == 0 ||
ctx.info[idx].ever_again_needs == Exact) {
if (ctx.info[idx].ever_again_needs == 0 || ctx.info[idx].ever_again_needs == Exact) {
/* transition to Exact */
aco_ptr<Instruction> branch = std::move(block->instructions.back());
block->instructions.pop_back();
@ -838,8 +867,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
Block& loop_block = ctx.program->blocks[i];
needs |= ctx.info[i].block_needs;
if (loop_block.kind & block_kind_uses_discard_if ||
loop_block.kind & block_kind_discard ||
if (loop_block.kind & block_kind_uses_discard_if || loop_block.kind & block_kind_discard ||
loop_block.kind & block_kind_uses_demote)
has_discard = true;
if (loop_block.loop_nest_depth != loop_nest_depth)
@ -871,12 +899,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
if (block->kind & block_kind_top_level)
num_exec_masks = std::min(num_exec_masks, 2u);
ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]],
num_exec_masks,
needs,
has_divergent_break,
has_divergent_continue,
has_discard);
ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], num_exec_masks, needs,
has_divergent_break, has_divergent_continue, has_discard);
}
/* For normal breaks, this is the exec mask. For discard+break, it's the
@ -903,7 +927,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm));
for (int i = num - 1; i >= 0; i--) {
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
get_exec_op(ctx.info[block->index].exec[i].first), cond);
if (i == (int)ctx.info[idx].exec.size() - 1)
andn2->definitions[0] = Definition(exec, bld.lm);
@ -919,8 +943,10 @@ void add_branch_code(exec_ctx& ctx, Block* block)
}
if (block->kind & block_kind_continue_or_break) {
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind & block_kind_loop_header);
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind & block_kind_loop_exit);
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind &
block_kind_loop_header);
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind &
block_kind_loop_exit);
assert(block->instructions.back()->opcode == aco_opcode::p_branch);
block->instructions.pop_back();
@ -931,8 +957,10 @@ void add_branch_code(exec_ctx& ctx, Block* block)
}
if (need_parallelcopy)
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]);
ctx.info[idx].exec.back().first = bld.pseudo(
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
block->linear_succs[1], block->linear_succs[0]);
return;
}
@ -949,8 +977,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
if (block->kind & block_kind_branch) {
if (ctx.handle_wqm &&
ctx.info[idx].exec.size() >= 2 &&
if (ctx.handle_wqm && ctx.info[idx].exec.size() >= 2 &&
ctx.info[idx].exec.back().second == mask_type_exact &&
!(ctx.info[idx].block_needs & Exact_Branch) &&
ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].second & mask_type_wqm) {
@ -972,7 +999,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), cond);
} else {
Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
Definition(exec, bld.lm), cond, Operand(exec, bld.lm));
Definition(exec, bld.lm), cond, Operand(exec, bld.lm));
ctx.info[idx].exec.back().first = Operand(old_exec);
}
@ -980,7 +1007,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
/* add next current exec to the stack */
ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type);
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]);
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
block->linear_succs[1], block->linear_succs[0]);
return;
}
@ -990,9 +1018,11 @@ void add_branch_code(exec_ctx& ctx, Block* block)
block->instructions.pop_back();
assert(ctx.info[idx].exec.size() >= 2);
Operand orig_exec = ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].first;
bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec, Operand(exec, bld.lm));
bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec,
Operand(exec, bld.lm));
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]);
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
block->linear_succs[1], block->linear_succs[0]);
return;
}
@ -1020,7 +1050,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
bld.copy(Definition(exec, bld.lm), Operand(0u, bld.lm == s2));
}
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond),
block->linear_succs[1], block->linear_succs[0]);
return;
}
@ -1048,12 +1079,14 @@ void add_branch_code(exec_ctx& ctx, Block* block)
bld.copy(Definition(exec, bld.lm), Operand(0u, bld.lm == s2));
}
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond),
block->linear_succs[1], block->linear_succs[0]);
return;
}
}
void process_block(exec_ctx& ctx, Block* block)
void
process_block(exec_ctx& ctx, Block* block)
{
std::vector<aco_ptr<Instruction>> instructions;
instructions.reserve(block->instructions.size());
@ -1072,8 +1105,8 @@ void process_block(exec_ctx& ctx, Block* block)
} /* end namespace */
void insert_exec_mask(Program *program)
void
insert_exec_mask(Program* program)
{
exec_ctx ctx(program);
@ -1082,8 +1115,6 @@ void insert_exec_mask(Program *program)
for (Block& block : program->blocks)
process_block(ctx, &block);
}
}
} // namespace aco


@ -23,6 +23,7 @@
*/
#include "aco_ir.h"
#include "common/sid.h"
#include <map>
@ -49,7 +50,8 @@ namespace {
* - or erase gprs with counters higher than the one to be waited for.
*/
// TODO: do a more clever insertion of wait_cnt (lgkm_cnt) when there is a load followed by a use of a previous load
// TODO: do a more clever insertion of wait_cnt (lgkm_cnt)
// when there is a load followed by a use of a previous load
/* Instructions of the same event will finish in-order except for smem
* and maybe flat. Instructions of different events may not finish in-order. */
@ -77,54 +79,50 @@ enum counter_type : uint8_t {
num_counters = 4,
};
static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
static const uint16_t exp_events =
event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
static const uint16_t vm_events = event_vmem | event_flat;
static const uint16_t vs_events = event_vmem_store;
uint8_t get_counters_for_event(wait_event ev)
uint8_t
get_counters_for_event(wait_event ev)
{
switch (ev) {
case event_smem:
case event_lds:
case event_gds:
case event_sendmsg:
return counter_lgkm;
case event_vmem:
return counter_vm;
case event_vmem_store:
return counter_vs;
case event_flat:
return counter_vm | counter_lgkm;
case event_sendmsg: return counter_lgkm;
case event_vmem: return counter_vm;
case event_vmem_store: return counter_vs;
case event_flat: return counter_vm | counter_lgkm;
case event_exp_pos:
case event_exp_param:
case event_exp_mrt_null:
case event_gds_gpr_lock:
case event_vmem_gpr_lock:
return counter_exp;
default:
return 0;
case event_vmem_gpr_lock: return counter_exp;
default: return 0;
}
}
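
For illustration, a compilable miniature of the event-to-counter mapping above. The enum bit values are invented, but the interesting case is real: a FLAT access may complete through either the VMEM or the LGKM path, so it has to be tracked on both counters:

#include <cassert>
#include <cstdint>

enum counter_type : uint8_t {
   counter_exp = 1 << 0,
   counter_lgkm = 1 << 1,
   counter_vm = 1 << 2,
   counter_vs = 1 << 3,
};

enum wait_event : uint16_t {
   event_smem = 1 << 0,
   event_vmem = 1 << 1,
   event_flat = 1 << 2,
};

uint8_t
counters_for(wait_event ev)
{
   switch (ev) {
   case event_smem: return counter_lgkm;
   case event_vmem: return counter_vm;
   case event_flat: return counter_vm | counter_lgkm; /* either path may complete it */
   default: return 0;
   }
}

int
main()
{
   assert(counters_for(event_flat) == (counter_vm | counter_lgkm));
   assert(counters_for(event_smem) == counter_lgkm);
}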
struct wait_entry {
wait_imm imm;
uint16_t events;  /* use wait_event notion */
uint8_t counters; /* use counter_type notion */
bool wait_on_read:1;
bool logical:1;
bool has_vmem_nosampler:1;
bool has_vmem_sampler:1;
bool wait_on_read : 1;
bool logical : 1;
bool has_vmem_nosampler : 1;
bool has_vmem_sampler : 1;
wait_entry(wait_event event_, wait_imm imm_, bool logical_, bool wait_on_read_)
: imm(imm_), events(event_), counters(get_counters_for_event(event_)),
wait_on_read(wait_on_read_), logical(logical_),
has_vmem_nosampler(false), has_vmem_sampler(false) {}
: imm(imm_), events(event_), counters(get_counters_for_event(event_)),
wait_on_read(wait_on_read_), logical(logical_), has_vmem_nosampler(false),
has_vmem_sampler(false)
{}
bool join(const wait_entry& other)
{
bool changed = (other.events & ~events) ||
(other.counters & ~counters) ||
bool changed = (other.events & ~events) || (other.counters & ~counters) ||
(other.wait_on_read && !wait_on_read) ||
(other.has_vmem_nosampler && !has_vmem_nosampler) ||
(other.has_vmem_sampler && !has_vmem_sampler);
@ -156,7 +154,8 @@ struct wait_entry {
if (counter == counter_exp) {
imm.exp = wait_imm::unset_counter;
events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock);
events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock |
event_vmem_gpr_lock);
}
if (counter == counter_vs) {
@ -170,7 +169,7 @@ struct wait_entry {
};
struct wait_ctx {
Program *program;
Program* program;
enum chip_class chip_class;
uint16_t max_vm_cnt;
uint16_t max_exp_cnt;
@ -189,24 +188,21 @@ struct wait_ctx {
wait_imm barrier_imm[storage_count];
uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */
std::map<PhysReg,wait_entry> gpr_map;
std::map<PhysReg, wait_entry> gpr_map;
wait_ctx() {}
wait_ctx(Program *program_)
: program(program_),
chip_class(program_->chip_class),
max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14),
max_exp_cnt(6),
max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14),
max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) {}
wait_ctx(Program* program_)
: program(program_), chip_class(program_->chip_class),
max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14), max_exp_cnt(6),
max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14),
max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0))
{}
bool join(const wait_ctx* other, bool logical)
{
bool changed = other->exp_cnt > exp_cnt ||
other->vm_cnt > vm_cnt ||
other->lgkm_cnt > lgkm_cnt ||
other->vs_cnt > vs_cnt ||
bool changed = other->exp_cnt > exp_cnt || other->vm_cnt > vm_cnt ||
other->lgkm_cnt > lgkm_cnt || other->vs_cnt > vs_cnt ||
(other->pending_flat_lgkm && !pending_flat_lgkm) ||
(other->pending_flat_vm && !pending_flat_vm);
@ -218,12 +214,11 @@ struct wait_ctx {
pending_flat_vm |= other->pending_flat_vm;
pending_s_buffer_store |= other->pending_s_buffer_store;
for (const auto& entry : other->gpr_map)
{
for (const auto& entry : other->gpr_map) {
if (entry.second.logical != logical)
continue;
using iterator = std::map<PhysReg,wait_entry>::iterator;
using iterator = std::map<PhysReg, wait_entry>::iterator;
const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
if (insert_pair.second) {
changed = true;
@ -241,12 +236,14 @@ struct wait_ctx {
return changed;
}
void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter) {
void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter)
{
entry.remove_counter(counter);
}
};
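
wait_ctx::join is deliberately a monotone merge: each counter moves toward the worst case and the return value reports whether anything grew, which is what lets the pass iterate predecessors to a fixpoint. A reduced model of that contract, with just two counters and invented names:

#include <algorithm>
#include <cassert>
#include <cstdint>

struct state_model {
   uint16_t vm_cnt = 0;
   uint16_t lgkm_cnt = 0;

   /* take the maximum of each counter; report whether anything changed */
   bool join(const state_model& other)
   {
      bool changed = other.vm_cnt > vm_cnt || other.lgkm_cnt > lgkm_cnt;
      vm_cnt = std::max(vm_cnt, other.vm_cnt);
      lgkm_cnt = std::max(lgkm_cnt, other.lgkm_cnt);
      return changed;
   }
};

int
main()
{
   state_model a{2, 0}, b{1, 3};
   assert(a.join(b));  /* lgkm_cnt grew: 0 -> 3 */
   assert(!a.join(b)); /* second join is a no-op: fixpoint reached */
   assert(a.vm_cnt == 2 && a.lgkm_cnt == 3);
}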
wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
wait_imm
check_instr(Instruction* instr, wait_ctx& ctx)
{
wait_imm wait;
@ -257,7 +254,7 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
/* check consecutively read gprs */
for (unsigned j = 0; j < op.size(); j++) {
PhysReg reg{op.physReg() + j};
std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg);
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
if (it == ctx.gpr_map.end() || !it->second.wait_on_read)
continue;
@ -267,22 +264,24 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
for (const Definition& def : instr->definitions) {
/* check consecutively written gprs */
for (unsigned j = 0; j < def.getTemp().size(); j++)
{
for (unsigned j = 0; j < def.getTemp().size(); j++) {
PhysReg reg{def.physReg() + j};
std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg);
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
if (it == ctx.gpr_map.end())
continue;
/* Vector Memory reads and writes return in the order they were issued */
bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4;
bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() &&
instr->operands[1].regClass() == s4;
if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem) &&
it->second.has_vmem_nosampler == !has_sampler && it->second.has_vmem_sampler == has_sampler)
it->second.has_vmem_nosampler == !has_sampler &&
it->second.has_vmem_sampler == has_sampler)
continue;
/* LDS reads and writes return in the order they were issued. same for GDS */
if (instr->isDS() && (it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds))
if (instr->isDS() &&
(it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds))
continue;
wait.combine(it->second.imm);
@ -292,7 +291,8 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
return wait;
}
wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr)
wait_imm
parse_wait_instr(wait_ctx& ctx, Instruction* instr)
{
if (instr->opcode == aco_opcode::s_waitcnt_vscnt &&
instr->definitions[0].physReg() == sgpr_null) {
@ -305,10 +305,12 @@ wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr)
return wait_imm();
}
wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics)
wait_imm
perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics)
{
wait_imm imm;
sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
sync_scope subgroup_scope =
ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
if ((sync.semantics & semantics) && sync.scope > subgroup_scope) {
unsigned storage = sync.storage;
while (storage) {
@ -321,7 +323,8 @@ wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantic
if (bar_scope_lds <= subgroup_scope)
events &= ~event_lds;
/* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations in-order for the same workgroup */
/* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
* in-order for the same workgroup */
if (!ctx.program->wgp_mode && sync.scope <= scope_workgroup)
events &= ~(event_vmem | event_vmem_store | event_smem);
@ -333,7 +336,8 @@ wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantic
return imm;
}
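
perform_barrier only emits waits for scopes wider than what the hardware already orders: when the whole workgroup runs inside one wave, workgroup-scope synchronization is free. A toy version of that test; the scope ordering is assumed here, not copied from aco_ir.h:

#include <cassert>

/* assumed to be ordered narrow to wide, as the > comparison requires */
enum sync_scope { scope_invocation, scope_subgroup, scope_workgroup, scope_device };

bool
scope_needs_wait(sync_scope scope, unsigned workgroup_size, unsigned wave_size)
{
   sync_scope free_up_to =
      workgroup_size <= wave_size ? scope_workgroup : scope_subgroup;
   return scope > free_up_to;
}

int
main()
{
   assert(!scope_needs_wait(scope_workgroup, 64, 64)); /* single-wave workgroup */
   assert(scope_needs_wait(scope_workgroup, 256, 64)); /* multi-wave: must wait */
   assert(scope_needs_wait(scope_device, 64, 64));     /* device scope always waits */
}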
void force_waitcnt(wait_ctx& ctx, wait_imm& imm)
void
force_waitcnt(wait_ctx& ctx, wait_imm& imm)
{
if (ctx.vm_cnt)
imm.vm = 0;
@ -348,7 +352,8 @@ void force_waitcnt(wait_ctx& ctx, wait_imm& imm)
}
}
wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
wait_imm
kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
{
wait_imm imm;
@ -364,7 +369,6 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
imm.combine(parse_wait_instr(ctx, instr));
/* It's required to wait for scalar stores before "writing back" data.
* It shouldn't cost anything anyway since we're about to do s_endpgm.
*/
@ -380,20 +384,19 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
*
* TODO: Refine this when we have proper alias analysis.
*/
if (ctx.pending_s_buffer_store &&
!instr->smem().definitions.empty() &&
if (ctx.pending_s_buffer_store && !instr->smem().definitions.empty() &&
!instr->smem().sync.can_reorder()) {
imm.lgkm = 0;
}
}
if (ctx.program->early_rast && instr->opcode == aco_opcode::exp) {
if (instr->exp().dest >= V_008DFC_SQ_EXP_POS &&
instr->exp().dest < V_008DFC_SQ_EXP_PRIM) {
if (instr->exp().dest >= V_008DFC_SQ_EXP_POS && instr->exp().dest < V_008DFC_SQ_EXP_PRIM) {
/* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos export.
* Wait for all stores (and atomics) to complete, so PS can read them.
* TODO: This only really applies to DONE pos exports. Consider setting the DONE bit earlier.
/* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos
* export. Wait for all stores (and atomics) to complete, so PS can read them.
* TODO: This only really applies to DONE pos exports.
* Consider setting the DONE bit earlier.
*/
if (ctx.vs_cnt > 0)
imm.vs = 0;
@ -444,9 +447,8 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
}
/* remove all gprs with higher counter from map */
std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.begin();
while (it != ctx.gpr_map.end())
{
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
while (it != ctx.gpr_map.end()) {
if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp);
if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
@ -472,13 +474,15 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
return imm;
}
void update_barrier_counter(uint8_t *ctr, unsigned max)
void
update_barrier_counter(uint8_t* ctr, unsigned max)
{
if (*ctr != wait_imm::unset_counter && *ctr < max)
(*ctr)++;
}
void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
void
update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
{
for (unsigned i = 0; i < storage_count; i++) {
wait_imm& bar = ctx.barrier_imm[i];
@ -506,7 +510,8 @@ void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memor
}
}
void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memory_sync_info())
void
update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_sync_info())
{
uint8_t counters = get_counters_for_event(event);
@ -529,7 +534,7 @@ void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memo
if (ctx.pending_flat_vm)
counters &= ~counter_vm;
for (std::pair<const PhysReg,wait_entry>& e : ctx.gpr_map) {
for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
wait_entry& entry = e.second;
if (entry.events & ctx.unordered_events)
@ -537,18 +542,23 @@ void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memo
assert(entry.events);
if ((counters & counter_exp) && (entry.events & exp_events) == event && entry.imm.exp < ctx.max_exp_cnt)
if ((counters & counter_exp) && (entry.events & exp_events) == event &&
entry.imm.exp < ctx.max_exp_cnt)
entry.imm.exp++;
if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event && entry.imm.lgkm < ctx.max_lgkm_cnt)
if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event &&
entry.imm.lgkm < ctx.max_lgkm_cnt)
entry.imm.lgkm++;
if ((counters & counter_vm) && (entry.events & vm_events) == event && entry.imm.vm < ctx.max_vm_cnt)
if ((counters & counter_vm) && (entry.events & vm_events) == event &&
entry.imm.vm < ctx.max_vm_cnt)
entry.imm.vm++;
if ((counters & counter_vs) && (entry.events & vs_events) == event && entry.imm.vs < ctx.max_vs_cnt)
if ((counters & counter_vs) && (entry.events & vs_events) == event &&
entry.imm.vs < ctx.max_vs_cnt)
entry.imm.vs++;
}
}
void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_sync_info())
void
update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info())
{
assert(ctx.chip_class < GFX10);
@ -559,8 +569,7 @@ void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_s
update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);
for (std::pair<PhysReg,wait_entry> e : ctx.gpr_map)
{
for (std::pair<PhysReg, wait_entry> e : ctx.gpr_map) {
if (e.second.counters & counter_vm)
e.second.imm.vm = 0;
if (e.second.counters & counter_lgkm)
@ -570,8 +579,9 @@ void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_s
ctx.pending_flat_vm = true;
}
void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
bool has_sampler=false)
void
insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
bool has_sampler = false)
{
uint16_t counters = get_counters_for_event(event);
wait_imm imm;
@ -589,24 +599,27 @@ void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event
new_entry.has_vmem_sampler = (event & event_vmem) && has_sampler;
for (unsigned i = 0; i < rc.size(); i++) {
auto it = ctx.gpr_map.emplace(PhysReg{reg.reg()+i}, new_entry);
auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry);
if (!it.second)
it.first->second.join(new_entry);
}
}
void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, bool has_sampler=false)
void
insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, bool has_sampler = false)
{
if (!op.isConstant() && !op.isUndefined())
insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, has_sampler);
}
void insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, bool has_sampler=false)
void
insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, bool has_sampler = false)
{
insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, has_sampler);
}
void gen(Instruction* instr, wait_ctx& ctx)
void
gen(Instruction* instr, wait_ctx& ctx)
{
switch (instr->format) {
case Format::EXP: {
@ -622,13 +635,11 @@ void gen(Instruction* instr, wait_ctx& ctx)
update_counters(ctx, ev);
/* insert new entries for exported vgprs */
for (unsigned i = 0; i < 4; i++)
{
for (unsigned i = 0; i < 4; i++) {
if (exp_instr.enabled_mask & (1 << i)) {
unsigned idx = exp_instr.compressed ? i >> 1 : i;
assert(idx < exp_instr.operands.size());
insert_wait_entry(ctx, exp_instr.operands[idx], ev);
}
}
insert_wait_entry(ctx, exec, s2, ev, false);
@ -651,8 +662,7 @@ void gen(Instruction* instr, wait_ctx& ctx)
if (!instr->definitions.empty())
insert_wait_entry(ctx, instr->definitions[0], event_smem);
else if (ctx.chip_class >= GFX10 &&
!smem.sync.can_reorder())
else if (ctx.chip_class >= GFX10 && !smem.sync.can_reorder())
ctx.pending_s_buffer_store = true;
break;
@ -677,23 +687,21 @@ void gen(Instruction* instr, wait_ctx& ctx)
case Format::MTBUF:
case Format::MIMG:
case Format::GLOBAL: {
wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
wait_event ev =
!instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
update_counters(ctx, ev, get_sync_info(instr));
bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4;
bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() &&
instr->operands[1].regClass() == s4;
if (!instr->definitions.empty())
insert_wait_entry(ctx, instr->definitions[0], ev, has_sampler);
if (ctx.chip_class == GFX6 &&
instr->format != Format::MIMG &&
instr->operands.size() == 4) {
if (ctx.chip_class == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
ctx.exp_cnt++;
update_counters(ctx, event_vmem_gpr_lock);
insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
} else if (ctx.chip_class == GFX6 &&
instr->isMIMG() &&
!instr->operands[2].isUndefined()) {
} else if (ctx.chip_class == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
ctx.exp_cnt++;
update_counters(ctx, event_vmem_gpr_lock);
insert_wait_entry(ctx, instr->operands[2], event_vmem_gpr_lock);
@ -702,35 +710,37 @@ void gen(Instruction* instr, wait_ctx& ctx)
break;
}
case Format::SOPP: {
if (instr->opcode == aco_opcode::s_sendmsg ||
instr->opcode == aco_opcode::s_sendmsghalt)
if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_sendmsghalt)
update_counters(ctx, event_sendmsg);
break;
}
default:
break;
default: break;
}
}
void emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm imm)
void
emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm imm)
{
if (imm.vs != wait_imm::unset_counter) {
assert(ctx.chip_class >= GFX10);
SOPK_instruction* waitcnt_vs = create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1);
SOPK_instruction* waitcnt_vs =
create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1);
waitcnt_vs->definitions[0] = Definition(sgpr_null, s1);
waitcnt_vs->imm = imm.vs;
instructions.emplace_back(waitcnt_vs);
imm.vs = wait_imm::unset_counter;
}
if (!imm.empty()) {
SOPP_instruction* waitcnt = create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
SOPP_instruction* waitcnt =
create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
waitcnt->imm = imm.pack(ctx.chip_class);
waitcnt->block = -1;
instructions.emplace_back(waitcnt);
}
}
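
Note how emit_waitcnt splits the counters: vscnt has no field in the s_waitcnt immediate, so on GFX10+ it is flushed by a separate s_waitcnt_vscnt, and whatever remains is packed into a single s_waitcnt. A sketch of that dispatch, with printf standing in for instruction creation:

#include <cstdint>
#include <cstdio>

constexpr uint16_t unset_counter = 0xffff;

struct imm_model {
   uint16_t vm = unset_counter;
   uint16_t exp = unset_counter;
   uint16_t lgkm = unset_counter;
   uint16_t vs = unset_counter;
};

void
emit_waits(imm_model imm)
{
   if (imm.vs != unset_counter) {
      printf("s_waitcnt_vscnt null, %u\n", (unsigned)imm.vs); /* separate opcode */
      imm.vs = unset_counter;
   }
   if (imm.vm != unset_counter || imm.exp != unset_counter || imm.lgkm != unset_counter)
      printf("s_waitcnt vmcnt(%u) expcnt(%u) lgkmcnt(%u)\n", (unsigned)imm.vm,
             (unsigned)imm.exp, (unsigned)imm.lgkm);
}

int
main()
{
   imm_model imm;
   imm.lgkm = 0;
   imm.vs = 0;
   emit_waits(imm); /* prints both instructions */
}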
void handle_block(Program *program, Block& block, wait_ctx& ctx)
void
handle_block(Program* program, Block& block, wait_ctx& ctx)
{
std::vector<aco_ptr<Instruction>> new_instructions;
@ -763,7 +773,8 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
} /* end namespace */
void insert_wait_states(Program* program)
void
insert_wait_states(Program* program)
{
/* per BB ctx */
std::vector<bool> done(program->blocks.size());
@ -818,5 +829,4 @@ void insert_wait_states(Program* program)
}
}
}
} // namespace aco

File diff suppressed because it is too large.


@ -39,21 +39,22 @@ struct shader_io_state {
uint8_t mask[VARYING_SLOT_MAX];
Temp temps[VARYING_SLOT_MAX * 4u];
shader_io_state() {
shader_io_state()
{
memset(mask, 0, sizeof(mask));
std::fill_n(temps, VARYING_SLOT_MAX * 4u, Temp(0, RegClass::v1));
}
};
struct isel_context {
const struct radv_nir_compiler_options *options;
struct radv_shader_args *args;
Program *program;
nir_shader *shader;
const struct radv_nir_compiler_options* options;
struct radv_shader_args* args;
Program* program;
nir_shader* shader;
uint32_t constant_data_offset;
Block *block;
Block* block;
uint32_t first_temp_id;
std::unordered_map<unsigned, std::array<Temp,NIR_MAX_VEC_COMPONENTS>> allocated_vec;
std::unordered_map<unsigned, std::array<Temp, NIR_MAX_VEC_COMPONENTS>> allocated_vec;
Stage stage;
struct {
bool has_branch;
@ -66,7 +67,8 @@ struct isel_context {
struct {
bool is_divergent = false;
} parent_if;
bool exec_potentially_empty_discard = false; /* set to false when loop_nest_depth==0 && parent_if.is_divergent==false */
bool exec_potentially_empty_discard =
false; /* set to false when loop_nest_depth==0 && parent_if.is_divergent==false */
uint16_t exec_potentially_empty_break_depth = UINT16_MAX;
/* Set to false when loop_nest_depth==exec_potentially_empty_break_depth
* and parent_if.is_divergent==false. Called _break but it's also used for
@ -76,7 +78,7 @@ struct isel_context {
} cf_info;
/* NIR range analysis. */
struct hash_table *range_ht;
struct hash_table* range_ht;
nir_unsigned_upper_bound_config ub_config;
Temp arg_temps[AC_MAX_ARGS];
@ -102,22 +104,19 @@ struct isel_context {
shader_io_state outputs;
};
inline Temp get_arg(isel_context *ctx, struct ac_arg arg)
inline Temp
get_arg(isel_context* ctx, struct ac_arg arg)
{
assert(arg.used);
return ctx->arg_temps[arg.arg_index];
}
void init_context(isel_context *ctx, nir_shader *shader);
void cleanup_context(isel_context *ctx);
void init_context(isel_context* ctx, nir_shader* shader);
void cleanup_context(isel_context* ctx);
isel_context
setup_isel_context(Program* program,
unsigned shader_count,
struct nir_shader *const *shaders,
ac_shader_config* config,
struct radv_shader_args *args,
bool is_gs_copy_shader);
isel_context setup_isel_context(Program* program, unsigned shader_count,
struct nir_shader* const* shaders, ac_shader_config* config,
struct radv_shader_args* args, bool is_gs_copy_shader);
} // namespace aco

File diff suppressed because it is too large.


@ -23,6 +23,7 @@
*/
#include "aco_interface.h"
#include "aco_ir.h"
#include "vulkan/radv_shader.h"
@ -37,23 +38,33 @@
static const std::array<aco_compiler_statistic_info, aco::num_statistics> statistic_infos = []()
{
std::array<aco_compiler_statistic_info, aco::num_statistics> ret{};
ret[aco::statistic_hash] = aco_compiler_statistic_info{"Hash", "CRC32 hash of code and constant data"};
ret[aco::statistic_instructions] = aco_compiler_statistic_info{"Instructions", "Instruction count"};
ret[aco::statistic_copies] = aco_compiler_statistic_info{"Copies", "Copy instructions created for pseudo-instructions"};
ret[aco::statistic_hash] =
aco_compiler_statistic_info{"Hash", "CRC32 hash of code and constant data"};
ret[aco::statistic_instructions] =
aco_compiler_statistic_info{"Instructions", "Instruction count"};
ret[aco::statistic_copies] =
aco_compiler_statistic_info{"Copies", "Copy instructions created for pseudo-instructions"};
ret[aco::statistic_branches] = aco_compiler_statistic_info{"Branches", "Branch instructions"};
ret[aco::statistic_latency] = aco_compiler_statistic_info{"Latency", "Issue cycles plus stall cycles"};
ret[aco::statistic_inv_throughput] = aco_compiler_statistic_info{"Inverse Throughput", "Estimated busy cycles to execute one wave"};
ret[aco::statistic_vmem_clauses] = aco_compiler_statistic_info{"VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"};
ret[aco::statistic_smem_clauses] = aco_compiler_statistic_info{"SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"};
ret[aco::statistic_sgpr_presched] = aco_compiler_statistic_info{"Pre-Sched SGPRs", "SGPR usage before scheduling"};
ret[aco::statistic_vgpr_presched] = aco_compiler_statistic_info{"Pre-Sched VGPRs", "VGPR usage before scheduling"};
ret[aco::statistic_latency] =
aco_compiler_statistic_info{"Latency", "Issue cycles plus stall cycles"};
ret[aco::statistic_inv_throughput] = aco_compiler_statistic_info{
"Inverse Throughput", "Estimated busy cycles to execute one wave"};
ret[aco::statistic_vmem_clauses] = aco_compiler_statistic_info{
"VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"};
ret[aco::statistic_smem_clauses] = aco_compiler_statistic_info{
"SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"};
ret[aco::statistic_sgpr_presched] =
aco_compiler_statistic_info{"Pre-Sched SGPRs", "SGPR usage before scheduling"};
ret[aco::statistic_vgpr_presched] =
aco_compiler_statistic_info{"Pre-Sched VGPRs", "VGPR usage before scheduling"};
return ret;
}();
const unsigned aco_num_statistics = aco::num_statistics;
const aco_compiler_statistic_info *aco_statistic_infos = statistic_infos.data();
const aco_compiler_statistic_info* aco_statistic_infos = statistic_infos.data();
static void validate(aco::Program *program)
static void
validate(aco::Program* program)
{
if (!(aco::debug_flags & aco::DEBUG_VALIDATE_IR))
return;
@ -62,10 +73,9 @@ static void validate(aco::Program *program)
assert(is_valid);
}
void aco_compile_shader(unsigned shader_count,
struct nir_shader *const *shaders,
struct radv_shader_binary **binary,
struct radv_shader_args *args)
void
aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders,
struct radv_shader_binary** binary, struct radv_shader_args* args)
{
aco::init();
@ -116,11 +126,11 @@ void aco_compile_shader(unsigned shader_count,
std::string llvm_ir;
if (args->options->record_ir) {
char *data = NULL;
char* data = NULL;
size_t size = 0;
u_memstream mem;
if (u_memstream_open(&mem, &data, &size)) {
FILE *const memf = u_memstream_get(&mem);
FILE* const memf = u_memstream_get(&mem);
aco_print_program(program.get(), memf);
fputc(0, memf);
u_memstream_close(&mem);
@ -137,8 +147,7 @@ void aco_compile_shader(unsigned shader_count,
aco_print_program(program.get(), stderr, live_vars, aco::print_live_vars | aco::print_kill);
if (!args->is_trap_handler_shader) {
if (!args->options->disable_optimizations &&
!(aco::debug_flags & aco::DEBUG_NO_SCHED))
if (!args->options->disable_optimizations && !(aco::debug_flags & aco::DEBUG_NO_SCHED))
aco::schedule_program(program.get(), live_vars);
validate(program.get());
@ -189,11 +198,11 @@ void aco_compile_shader(unsigned shader_count,
std::string disasm;
if (get_disasm) {
char *data = NULL;
char* data = NULL;
size_t disasm_size = 0;
struct u_memstream mem;
if (u_memstream_open(&mem, &data, &disasm_size)) {
FILE *const memf = u_memstream_get(&mem);
FILE* const memf = u_memstream_get(&mem);
aco::print_asm(program.get(), code, exec_size / 4u, memf);
fputc(0, memf);
u_memstream_close(&mem);
@ -214,10 +223,10 @@ void aco_compile_shader(unsigned shader_count,
* directly for the disk cache. Uninitialized data can appear because of
* padding in the struct or because legacy_binary->data can be at an offset
* from the start less than sizeof(radv_shader_binary_legacy). */
radv_shader_binary_legacy* legacy_binary = (radv_shader_binary_legacy*) calloc(size, 1);
radv_shader_binary_legacy* legacy_binary = (radv_shader_binary_legacy*)calloc(size, 1);
legacy_binary->base.type = RADV_BINARY_TYPE_LEGACY;
legacy_binary->base.stage = shaders[shader_count-1]->info.stage;
legacy_binary->base.stage = shaders[shader_count - 1]->info.stage;
legacy_binary->base.is_gs_copy_shader = args->is_gs_copy_shader;
legacy_binary->base.total_size = size;
@ -225,7 +234,8 @@ void aco_compile_shader(unsigned shader_count,
memcpy(legacy_binary->data, program->statistics, aco::num_statistics * sizeof(uint32_t));
legacy_binary->stats_size = stats_size;
memcpy(legacy_binary->data + legacy_binary->stats_size, code.data(), code.size() * sizeof(uint32_t));
memcpy(legacy_binary->data + legacy_binary->stats_size, code.data(),
code.size() * sizeof(uint32_t));
legacy_binary->exec_size = exec_size;
legacy_binary->code_size = code.size() * sizeof(uint32_t);
@ -233,12 +243,15 @@ void aco_compile_shader(unsigned shader_count,
legacy_binary->disasm_size = 0;
legacy_binary->ir_size = llvm_ir.size();
llvm_ir.copy((char*) legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size, llvm_ir.size());
llvm_ir.copy((char*)legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size,
llvm_ir.size());
if (get_disasm) {
disasm.copy((char*) legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size + llvm_ir.size(), disasm.size());
disasm.copy((char*)legacy_binary->data + legacy_binary->stats_size +
legacy_binary->code_size + llvm_ir.size(),
disasm.size());
legacy_binary->disasm_size = disasm.size();
}
*binary = (radv_shader_binary*) legacy_binary;
*binary = (radv_shader_binary*)legacy_binary;
}
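
The record_ir and disasm paths above both use the same capture idiom: print into a u_memstream, NUL-terminate, then copy the buffer out. A minimal reusable version of that pattern; the include path is assumed, and only the u_memstream_* calls already visible above are used:

#include <stdio.h>
#include <stdlib.h>
#include <string>

#include "util/u_memstream.h" /* assumed header location within Mesa */

static std::string
capture(void (*print)(FILE*))
{
   char* data = NULL;
   size_t size = 0;
   struct u_memstream mem;
   std::string out;
   if (u_memstream_open(&mem, &data, &size)) {
      FILE* const memf = u_memstream_get(&mem);
      print(memf);
      fputc(0, memf); /* NUL-terminate so data is a valid C string */
      u_memstream_close(&mem); /* flushes and finalizes data/size */
      out = data;
      free(data);
   }
   return out;
}

int
main()
{
   std::string s = capture([](FILE* f) { fprintf(f, "hello %d\n", 42); });
   printf("%s", s.c_str());
}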


@ -39,12 +39,10 @@ struct aco_compiler_statistic_info {
};
extern const unsigned aco_num_statistics;
extern const struct aco_compiler_statistic_info *aco_statistic_infos;
extern const struct aco_compiler_statistic_info* aco_statistic_infos;
void aco_compile_shader(unsigned shader_count,
struct nir_shader *const *shaders,
struct radv_shader_binary** binary,
struct radv_shader_args *args);
void aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders,
struct radv_shader_binary** binary, struct radv_shader_args* args);
#ifdef __cplusplus
}


@ -32,39 +32,40 @@ namespace aco {
uint64_t debug_flags = 0;
static const struct debug_control aco_debug_options[] = {
{"validateir", DEBUG_VALIDATE_IR},
{"validatera", DEBUG_VALIDATE_RA},
{"perfwarn", DEBUG_PERFWARN},
{"force-waitcnt", DEBUG_FORCE_WAITCNT},
{"novn", DEBUG_NO_VN},
{"noopt", DEBUG_NO_OPT},
{"nosched", DEBUG_NO_SCHED},
{"perfinfo", DEBUG_PERF_INFO},
{"liveinfo", DEBUG_LIVE_INFO},
{NULL, 0}
};
static const struct debug_control aco_debug_options[] = {{"validateir", DEBUG_VALIDATE_IR},
{"validatera", DEBUG_VALIDATE_RA},
{"perfwarn", DEBUG_PERFWARN},
{"force-waitcnt", DEBUG_FORCE_WAITCNT},
{"novn", DEBUG_NO_VN},
{"noopt", DEBUG_NO_OPT},
{"nosched", DEBUG_NO_SCHED},
{"perfinfo", DEBUG_PERF_INFO},
{"liveinfo", DEBUG_LIVE_INFO},
{NULL, 0}};
static once_flag init_once_flag = ONCE_FLAG_INIT;
static void init_once()
static void
init_once()
{
debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);
#ifndef NDEBUG
/* enable some flags by default on debug builds */
debug_flags |= aco::DEBUG_VALIDATE_IR;
#endif
}
void init()
void
init()
{
call_once(&init_once_flag, init_once);
}
void init_program(Program *program, Stage stage, struct radv_shader_info *info,
enum chip_class chip_class, enum radeon_family family,
bool wgp_mode, ac_shader_config *config)
void
init_program(Program* program, Stage stage, struct radv_shader_info* info,
enum chip_class chip_class, enum radeon_family family, bool wgp_mode,
ac_shader_config* config)
{
program->stage = stage;
program->config = config;
@ -72,24 +73,12 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
program->chip_class = chip_class;
if (family == CHIP_UNKNOWN) {
switch (chip_class) {
case GFX6:
program->family = CHIP_TAHITI;
break;
case GFX7:
program->family = CHIP_BONAIRE;
break;
case GFX8:
program->family = CHIP_POLARIS10;
break;
case GFX9:
program->family = CHIP_VEGA10;
break;
case GFX10:
program->family = CHIP_NAVI10;
break;
default:
program->family = CHIP_UNKNOWN;
break;
case GFX6: program->family = CHIP_TAHITI; break;
case GFX7: program->family = CHIP_BONAIRE; break;
case GFX8: program->family = CHIP_POLARIS10; break;
case GFX9: program->family = CHIP_VEGA10; break;
case GFX10: program->family = CHIP_NAVI10; break;
default: program->family = CHIP_UNKNOWN; break;
}
} else {
program->family = family;
@ -98,7 +87,8 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
program->lane_mask = program->wave_size == 32 ? s1 : s2;
program->dev.lds_encoding_granule = chip_class >= GFX7 ? 512 : 256;
program->dev.lds_alloc_granule = chip_class >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
program->dev.lds_alloc_granule =
chip_class >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
program->dev.lds_limit = chip_class >= GFX7 ? 65536 : 32768;
/* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;
@ -111,7 +101,8 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */
program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
program->dev.sgpr_alloc_granule = 128;
program->dev.sgpr_limit = 108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
program->dev.sgpr_limit =
108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
if (chip_class >= GFX10_3)
program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
else
@ -145,18 +136,14 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
/* GFX9 APUS */
case CHIP_RAVEN:
case CHIP_RAVEN2:
case CHIP_RENOIR:
program->dev.xnack_enabled = true;
break;
default:
break;
case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
default: break;
}
program->dev.sram_ecc_enabled = program->family == CHIP_ARCTURUS;
/* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
program->dev.has_fast_fma32 = program->chip_class >= GFX9;
if (program->family == CHIP_TAHITI ||
program->family == CHIP_CARRIZO ||
if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
program->family == CHIP_HAWAII)
program->dev.has_fast_fma32 = true;
@ -176,29 +163,24 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
program->next_fp_mode.round32 = fp_round_ne;
}
memory_sync_info get_sync_info(const Instruction* instr)
memory_sync_info
get_sync_info(const Instruction* instr)
{
switch (instr->format) {
case Format::SMEM:
return instr->smem().sync;
case Format::MUBUF:
return instr->mubuf().sync;
case Format::MIMG:
return instr->mimg().sync;
case Format::MTBUF:
return instr->mtbuf().sync;
case Format::SMEM: return instr->smem().sync;
case Format::MUBUF: return instr->mubuf().sync;
case Format::MIMG: return instr->mimg().sync;
case Format::MTBUF: return instr->mtbuf().sync;
case Format::FLAT:
case Format::GLOBAL:
case Format::SCRATCH:
return instr->flatlike().sync;
case Format::DS:
return instr->ds().sync;
default:
return memory_sync_info();
case Format::SCRATCH: return instr->flatlike().sync;
case Format::DS: return instr->ds().sync;
default: return memory_sync_info();
}
}
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra)
bool
can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra)
{
if (!instr->isVALU())
return false;
@ -218,7 +200,7 @@ bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_r
if (vop3.omod && chip < GFX9)
return false;
//TODO: return true if we know we will use vcc
// TODO: return true if we know we will use vcc
if (!pre_ra && instr->definitions.size() >= 2)
return false;
@ -244,38 +226,36 @@ bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_r
return false;
}
bool is_mac = instr->opcode == aco_opcode::v_mac_f32 ||
instr->opcode == aco_opcode::v_mac_f16 ||
instr->opcode == aco_opcode::v_fmac_f32 ||
instr->opcode == aco_opcode::v_fmac_f16;
bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;
if (chip != GFX8 && is_mac)
return false;
//TODO: return true if we know we will use vcc
// TODO: return true if we know we will use vcc
if (!pre_ra && instr->isVOPC())
return false;
if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
return false;
return instr->opcode != aco_opcode::v_madmk_f32 &&
instr->opcode != aco_opcode::v_madak_f32 &&
instr->opcode != aco_opcode::v_madmk_f16 &&
instr->opcode != aco_opcode::v_madak_f16 &&
return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
instr->opcode != aco_opcode::v_readfirstlane_b32 &&
instr->opcode != aco_opcode::v_clrexcp &&
instr->opcode != aco_opcode::v_swap_b32;
instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
}
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr)
aco_ptr<Instruction>
convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr)
{
if (instr->isSDWA())
return NULL;
aco_ptr<Instruction> tmp = std::move(instr);
Format format = (Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA);
instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
Format format =
(Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA);
instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(),
tmp->definitions.size()));
std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());
@ -295,15 +275,9 @@ aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& inst
break;
switch (instr->operands[i].bytes()) {
case 1:
sdwa.sel[i] = sdwa_ubyte;
break;
case 2:
sdwa.sel[i] = sdwa_uword;
break;
case 4:
sdwa.sel[i] = sdwa_udword;
break;
case 1: sdwa.sel[i] = sdwa_ubyte; break;
case 2: sdwa.sel[i] = sdwa_uword; break;
case 4: sdwa.sel[i] = sdwa_udword; break;
}
}
switch (instr->definitions[0].bytes()) {
@ -315,9 +289,7 @@ aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& inst
sdwa.dst_sel = sdwa_uword;
sdwa.dst_preserve = true;
break;
case 4:
sdwa.dst_sel = sdwa_udword;
break;
case 4: sdwa.dst_sel = sdwa_udword; break;
}
if (instr->definitions[0].getTemp().type() == RegType::sgpr && chip == GFX8)
@ -330,7 +302,8 @@ aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& inst
return tmp;
}
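
The Format arithmetic in convert_to_SDWA treats VOP3 and SDWA as modifier bits OR'ed onto a base encoding, so the conversion clears one bit and sets the other while leaving the base intact. A self-contained model of that flag swap (bit positions invented, not ACO's real Format values):

#include <cassert>
#include <cstdint>

enum class Fmt : uint16_t {
   VOP2 = 1 << 0, /* base encoding */
   VOP3 = 1 << 8, /* modifier */
   SDWA = 1 << 9, /* modifier */
};

constexpr Fmt
to_sdwa(Fmt f)
{
   /* clear VOP3, set SDWA, keep the base bits */
   return (Fmt)(((uint16_t)f & ~(uint16_t)Fmt::VOP3) | (uint16_t)Fmt::SDWA);
}

int
main()
{
   Fmt vop3 = (Fmt)((uint16_t)Fmt::VOP2 | (uint16_t)Fmt::VOP3);
   assert(to_sdwa(vop3) == (Fmt)((uint16_t)Fmt::VOP2 | (uint16_t)Fmt::SDWA));
}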
bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high)
bool
can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high)
{
/* opsel is only GFX9+ */
if ((high || idx == -1) && chip < GFX9)
@ -362,21 +335,18 @@ bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high)
case aco_opcode::v_lshlrev_b16_e64:
case aco_opcode::v_lshrrev_b16_e64:
case aco_opcode::v_ashrrev_i16_e64:
case aco_opcode::v_mul_lo_u16_e64:
return true;
case aco_opcode::v_mul_lo_u16_e64: return true;
case aco_opcode::v_pack_b32_f16:
case aco_opcode::v_cvt_pknorm_i16_f16:
case aco_opcode::v_cvt_pknorm_u16_f16:
return idx != -1;
case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
case aco_opcode::v_mad_u32_u16:
case aco_opcode::v_mad_i32_i16:
return idx >= 0 && idx < 2;
default:
return false;
case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
default: return false;
}
}
uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
uint32_t
get_reduction_identity(ReduceOp op, unsigned idx)
{
switch (op) {
case iadd8:
@ -397,65 +367,44 @@ uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
case umax8:
case umax16:
case umax32:
case umax64:
return 0;
case umax64: return 0;
case imul8:
case imul16:
case imul32:
case imul64:
return idx ? 0 : 1;
case fmul16:
return 0x3c00u; /* 1.0 */
case fmul32:
return 0x3f800000u; /* 1.0 */
case fmul64:
return idx ? 0x3ff00000u : 0u; /* 1.0 */
case imin8:
return INT8_MAX;
case imin16:
return INT16_MAX;
case imin32:
return INT32_MAX;
case imin64:
return idx ? 0x7fffffffu : 0xffffffffu;
case imax8:
return INT8_MIN;
case imax16:
return INT16_MIN;
case imax32:
return INT32_MIN;
case imax64:
return idx ? 0x80000000u : 0;
case imul64: return idx ? 0 : 1;
case fmul16: return 0x3c00u; /* 1.0 */
case fmul32: return 0x3f800000u; /* 1.0 */
case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
case imin8: return INT8_MAX;
case imin16: return INT16_MAX;
case imin32: return INT32_MAX;
case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
case imax8: return INT8_MIN;
case imax16: return INT16_MIN;
case imax32: return INT32_MIN;
case imax64: return idx ? 0x80000000u : 0;
case umin8:
case umin16:
case iand8:
case iand16:
return 0xffffffffu;
case iand16: return 0xffffffffu;
case umin32:
case umin64:
case iand32:
case iand64:
return 0xffffffffu;
case fmin16:
return 0x7c00u; /* infinity */
case fmin32:
return 0x7f800000u; /* infinity */
case fmin64:
return idx ? 0x7ff00000u : 0u; /* infinity */
case fmax16:
return 0xfc00u; /* negative infinity */
case fmax32:
return 0xff800000u; /* negative infinity */
case fmax64:
return idx ? 0xfff00000u : 0u; /* negative infinity */
default:
unreachable("Invalid reduction operation");
break;
case iand64: return 0xffffffffu;
case fmin16: return 0x7c00u; /* infinity */
case fmin32: return 0x7f800000u; /* infinity */
case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
case fmax16: return 0xfc00u; /* negative infinity */
case fmax32: return 0xff800000u; /* negative infinity */
case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
default: unreachable("Invalid reduction operation"); break;
}
return 0;
}
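/* Worked example: 64-bit identities are returned one dword at a time, so for imin64
 * the identity INT64_MAX = 0x7fffffffffffffff is split into
 * get_reduction_identity(imin64, 0) == 0xffffffffu (low dword) and
 * get_reduction_identity(imin64, 1) == 0x7fffffffu (high dword); likewise fmin64
 * yields the two dwords of +infinity, 0x7ff0000000000000. */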
bool needs_exec_mask(const Instruction* instr) {
bool
needs_exec_mask(const Instruction* instr)
{
if (instr->isSALU() || instr->isBranch())
return instr->reads_exec();
if (instr->isSMEM())
@ -479,10 +428,8 @@ bool needs_exec_mask(const Instruction* instr) {
case aco_opcode::p_reload:
case aco_opcode::p_logical_start:
case aco_opcode::p_logical_end:
case aco_opcode::p_startpgm:
return false;
default:
break;
case aco_opcode::p_startpgm: return false;
default: break;
}
}
@ -495,10 +442,11 @@ bool needs_exec_mask(const Instruction* instr) {
return true;
}
wait_imm::wait_imm() :
vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) {}
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) :
vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {}
wait_imm::wait_imm() : vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter)
{}
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
: vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
{}
wait_imm::wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter)
{
@ -513,7 +461,8 @@ wait_imm::wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter)
lgkm |= (packed >> 8) & 0x30;
}
uint16_t wait_imm::pack(enum chip_class chip) const
uint16_t
wait_imm::pack(enum chip_class chip) const
{
uint16_t imm = 0;
assert(exp == unset_counter || exp <= 0x7);
@ -536,13 +485,16 @@ uint16_t wait_imm::pack(enum chip_class chip) const
break;
}
if (chip < GFX9 && vm == wait_imm::unset_counter)
imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the architecture when interpreting the immediate */
imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
architecture when interpreting the immediate */
if (chip < GFX10 && lgkm == wait_imm::unset_counter)
imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the architecture when interpreting the immediate */
imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
architecture when interpreting the immediate */
return imm;
}
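/* A small roundtrip sketch (assuming GFX9): for counters that were actually set,
 * the wait_imm(chip, packed) constructor inverts pack(), e.g.
 *
 *   wait_imm w(0, 7, 15, wait_imm::unset_counter); // vm, exp, lgkm, vs
 *   assert(wait_imm(GFX9, w.pack(GFX9)).lgkm == 15);
 */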
bool wait_imm::combine(const wait_imm& other)
bool
wait_imm::combine(const wait_imm& other)
{
bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
vm = std::min(vm, other.vm);
@ -552,17 +504,21 @@ bool wait_imm::combine(const wait_imm& other)
return changed;
}
bool wait_imm::empty() const
bool
wait_imm::empty() const
{
return vm == unset_counter && exp == unset_counter &&
lgkm == unset_counter && vs == unset_counter;
return vm == unset_counter && exp == unset_counter && lgkm == unset_counter &&
vs == unset_counter;
}
bool should_form_clause(const Instruction *a, const Instruction *b)
bool
should_form_clause(const Instruction* a, const Instruction* b)
{
/* Vertex attribute loads from the same binding likely load from similar addresses */
unsigned a_vtx_binding = a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0);
unsigned b_vtx_binding = b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0);
unsigned a_vtx_binding =
a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0);
unsigned b_vtx_binding =
b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0);
if (a_vtx_binding && a_vtx_binding == b_vtx_binding)
return true;
@ -584,4 +540,4 @@ bool should_form_clause(const Instruction *a, const Instruction *b)
return false;
}
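/* A schematic use in a scheduler loop (hypothetical caller): consecutive memory
 * instructions are grouped while the predicate holds, so the hardware can issue
 * them as one clause:
 *
 *   if (should_form_clause(prev_mem_instr, instr))
 *      clause.push_back(instr); // keep extending the current clause
 *   else
 *      clause.clear();          // addresses/resources diverged, start a new one
 */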
}
} // namespace aco

File diff suppressed because it is too large.

@ -24,13 +24,15 @@
*/
#include "aco_ir.h"
#include "util/u_math.h"
#include <set>
#include <vector>
namespace aco {
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr)
RegisterDemand
get_live_changes(aco_ptr<Instruction>& instr)
{
RegisterDemand changes;
for (const Definition& def : instr->definitions) {
@ -48,7 +50,8 @@ RegisterDemand get_live_changes(aco_ptr<Instruction>& instr)
return changes;
}
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr)
RegisterDemand
get_temp_registers(aco_ptr<Instruction>& instr)
{
RegisterDemand temp_registers;
@ -67,7 +70,9 @@ RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr)
return temp_registers;
}
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before)
RegisterDemand
get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
aco_ptr<Instruction>& instr_before)
{
demand -= get_live_changes(instr);
demand -= get_temp_registers(instr);
@ -77,8 +82,9 @@ RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& in
}
namespace {
void process_live_temps_per_block(Program *program, live& lives, Block* block,
std::set<unsigned>& worklist, std::vector<uint16_t>& phi_sgpr_ops)
void
process_live_temps_per_block(Program* program, live& lives, Block* block,
std::set<unsigned>& worklist, std::vector<uint16_t>& phi_sgpr_ops)
{
std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index];
RegisterDemand new_demand;
@ -94,8 +100,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
/* traverse the instructions backwards */
int idx;
for (idx = block->instructions.size() -1; idx >= 0; idx--) {
Instruction *insn = block->instructions[idx].get();
for (idx = block->instructions.size() - 1; idx >= 0; idx--) {
Instruction* insn = block->instructions[idx].get();
if (is_phi(insn))
break;
@ -131,8 +137,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
for (Operand& op : insn->operands)
op.setKill(false);
for (unsigned i = 0; i < insn->operands.size(); ++i)
{
for (unsigned i = 0; i < insn->operands.size(); ++i) {
Operand& operand = insn->operands[i];
if (!operand.isTemp())
continue;
@ -143,7 +148,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
if (inserted) {
operand.setFirstKill(true);
for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
if (insn->operands[j].isTemp() && insn->operands[j].tempId() == operand.tempId()) {
if (insn->operands[j].isTemp() &&
insn->operands[j].tempId() == operand.tempId()) {
insn->operands[j].setFirstKill(false);
insn->operands[j].setKill(true);
}
@ -167,7 +173,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
int phi_idx = idx;
while (phi_idx >= 0) {
register_demand[phi_idx] = new_demand;
Instruction *insn = block->instructions[phi_idx].get();
Instruction* insn = block->instructions[phi_idx].get();
assert(is_phi(insn) && insn->definitions.size() == 1);
if (!insn->definitions[0].isTemp()) {
@ -196,7 +202,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
#ifndef NDEBUG
if (preds.empty())
aco_err(program, "Temporary never defined or is defined after use: %%%d in BB%d", t, block->index);
aco_err(program, "Temporary never defined or is defined after use: %%%d in BB%d", t,
block->index);
#endif
for (unsigned pred_idx : preds) {
@ -209,14 +216,13 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
/* handle phi operands */
phi_idx = idx;
while (phi_idx >= 0) {
Instruction *insn = block->instructions[phi_idx].get();
Instruction* insn = block->instructions[phi_idx].get();
assert(is_phi(insn));
/* directly insert into the predecessors live-out set */
std::vector<unsigned>& preds = insn->opcode == aco_opcode::p_phi
? block->logical_preds
: block->linear_preds;
std::vector<unsigned>& preds =
insn->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
for (unsigned i = 0; i < preds.size(); ++i) {
Operand &operand = insn->operands[i];
Operand& operand = insn->operands[i];
if (!operand.isTemp())
continue;
if (operand.isFixed() && operand.physReg() == vcc)
@ -238,18 +244,19 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
assert(block->index != 0 || (new_demand == RegisterDemand() && live.empty()));
}
unsigned calc_waves_per_workgroup(Program *program)
unsigned
calc_waves_per_workgroup(Program* program)
{
/* When workgroup size is not known, just go with wave_size */
unsigned workgroup_size = program->workgroup_size == UINT_MAX
? program->wave_size
: program->workgroup_size;
unsigned workgroup_size =
program->workgroup_size == UINT_MAX ? program->wave_size : program->workgroup_size;
return align(workgroup_size, program->wave_size) / program->wave_size;
}
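/* Worked example, assuming wave64 and a 96-invocation workgroup:
 * align(96, 64) = 128, so the workgroup occupies 128 / 64 = 2 waves. */
static_assert(((96 + 63) / 64) * 64 / 64 == 2, "96 invocations need two wave64 waves");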
} /* end namespace */
uint16_t get_extra_sgprs(Program *program)
uint16_t
get_extra_sgprs(Program* program)
{
if (program->chip_class >= GFX10) {
assert(!program->needs_flat_scr);
@ -275,26 +282,30 @@ uint16_t get_extra_sgprs(Program *program)
}
}
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs)
uint16_t
get_sgpr_alloc(Program* program, uint16_t addressable_sgprs)
{
uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
uint16_t granule = program->dev.sgpr_alloc_granule;
return ALIGN_NPOT(std::max(sgprs, granule), granule);
}
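/* Worked example, assuming a hypothetical granule of 16: a shader using 90 SGPRs
 * (including the extra ones) allocates ALIGN_NPOT(90, 16) = 96, i.e. usage is
 * always rounded up to a whole allocation granule. */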
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs)
uint16_t
get_vgpr_alloc(Program* program, uint16_t addressable_vgprs)
{
assert(addressable_vgprs <= program->dev.vgpr_limit);
uint16_t granule = program->dev.vgpr_alloc_granule;
return align(std::max(addressable_vgprs, granule), granule);
}
unsigned round_down(unsigned a, unsigned b)
unsigned
round_down(unsigned a, unsigned b)
{
return a - (a % b);
}
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves)
uint16_t
get_addr_sgpr_from_waves(Program* program, uint16_t waves)
{
/* it's not possible to allocate more than 128 SGPRs */
uint16_t sgprs = std::min(program->dev.physical_sgprs / waves, 128);
@ -303,21 +314,24 @@ uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves)
return std::min(sgprs, program->dev.sgpr_limit);
}
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t waves)
uint16_t
get_addr_vgpr_from_waves(Program* program, uint16_t waves)
{
uint16_t vgprs = program->dev.physical_vgprs / waves & ~(program->dev.vgpr_alloc_granule - 1);
vgprs -= program->config->num_shared_vgprs / 2;
return std::min(vgprs, program->dev.vgpr_limit);
}
void calc_min_waves(Program* program)
void
calc_min_waves(Program* program)
{
unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
}
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
void
update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
{
unsigned max_waves_per_simd = program->dev.max_wave64_per_simd * (64 / program->wave_size);
unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
@ -333,8 +347,10 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
program->max_reg_demand = new_demand;
} else {
program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
uint16_t vgpr_demand = get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
program->num_waves = std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);
uint16_t vgpr_demand =
get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
program->num_waves =
std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);
program->max_waves = max_waves_per_simd;
/* adjust max_waves for workgroup and LDS limits */
@ -346,12 +362,15 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds);
}
if (waves_per_workgroup > 1 && program->chip_class < GFX10)
workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */
workgroups_per_cu_wgp = std::min(
workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */
/* in cases like waves_per_workgroup=3 or lds=65536 and
* waves_per_workgroup=1, we want the maximum possible number of waves per
* SIMD and not the minimum. so DIV_ROUND_UP is used */
program->max_waves = std::min<uint16_t>(program->max_waves, DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp));
program->max_waves = std::min<uint16_t>(
program->max_waves,
DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp));
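/* e.g. workgroups_per_cu_wgp = 5, waves_per_workgroup = 3, simd_per_cu_wgp = 4:
 * DIV_ROUND_UP(15, 4) = 4 waves per SIMD, whereas 15 / 4 would round down to 3
 * and needlessly cap occupancy */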
/* incorporate max_waves and calculate max_reg_demand */
program->num_waves = std::min<uint16_t>(program->num_waves, program->max_waves);
@ -360,7 +379,8 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
}
}
live live_var_analysis(Program* program)
live
live_var_analysis(Program* program)
{
live result;
result.live_out.resize(program->blocks.size());
@ -371,14 +391,16 @@ live live_var_analysis(Program* program)
program->needs_vcc = false;
/* this implementation assumes that the block idx corresponds to the block's position in program->blocks vector */
/* this implementation assumes that the block idx corresponds to the block's position in
* program->blocks vector */
for (Block& block : program->blocks)
worklist.insert(block.index);
while (!worklist.empty()) {
std::set<unsigned>::reverse_iterator b_it = worklist.rbegin();
unsigned block_idx = *b_it;
worklist.erase(block_idx);
process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, phi_sgpr_ops);
process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist,
phi_sgpr_ops);
new_demand.update(program->blocks[block_idx].register_demand);
}
@ -389,5 +411,4 @@ live live_var_analysis(Program* program)
return result;
}
}
} // namespace aco

@ -47,7 +47,8 @@ struct ssa_state {
std::vector<bool> visited;
};
Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool before_write)
Operand
get_ssa(Program* program, unsigned block_idx, ssa_state* state, bool before_write)
{
if (!before_write) {
auto it = state->writes.find(block_idx);
@ -79,7 +80,8 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool bef
Temp res = Temp(program->allocateTmp(program->lane_mask));
state->latest[block_idx] = Operand(res);
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)};
aco_ptr<Pseudo_instruction> phi{
create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)};
for (unsigned i = 0; i < pred; i++)
phi->operands[i] = get_ssa(program, block.linear_preds[i], state, false);
phi->definitions[0] = Definition(res);
@ -89,11 +91,11 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool bef
}
}
void insert_before_logical_end(Block *block, aco_ptr<Instruction> instr)
void
insert_before_logical_end(Block* block, aco_ptr<Instruction> instr)
{
auto IsLogicalEnd = [] (const aco_ptr<Instruction>& inst) -> bool {
return inst->opcode == aco_opcode::p_logical_end;
};
auto IsLogicalEnd = [](const aco_ptr<Instruction>& inst) -> bool
{ return inst->opcode == aco_opcode::p_logical_end; };
auto it = std::find_if(block->instructions.crbegin(), block->instructions.crend(), IsLogicalEnd);
if (it == block->instructions.crend()) {
@ -104,13 +106,13 @@ void insert_before_logical_end(Block *block, aco_ptr<Instruction> instr)
}
}
void build_merge_code(Program *program, Block *block, Definition dst, Operand prev, Operand cur)
void
build_merge_code(Program* program, Block* block, Definition dst, Operand prev, Operand cur)
{
Builder bld(program);
auto IsLogicalEnd = [] (const aco_ptr<Instruction>& instr) -> bool {
return instr->opcode == aco_opcode::p_logical_end;
};
auto IsLogicalEnd = [](const aco_ptr<Instruction>& instr) -> bool
{ return instr->opcode == aco_opcode::p_logical_end; };
auto it = std::find_if(block->instructions.rbegin(), block->instructions.rend(), IsLogicalEnd);
assert(it != block->instructions.rend());
bld.reset(&block->instructions, std::prev(it.base()));
@ -126,7 +128,8 @@ void build_merge_code(Program *program, Block *block, Definition dst, Operand pr
if (!prev_is_constant) {
if (!cur_is_constant) {
Temp tmp1 = bld.tmp(bld.lm), tmp2 = bld.tmp(bld.lm);
bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc), prev, Operand(exec, bld.lm));
bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc), prev,
Operand(exec, bld.lm));
bld.sop2(Builder::s_and, Definition(tmp2), bld.def(s1, scc), cur, Operand(exec, bld.lm));
bld.sop2(Builder::s_or, dst, bld.def(s1, scc), tmp1, tmp2);
} else if (cur.constantValue()) {
@ -151,7 +154,8 @@ void build_merge_code(Program *program, Block *block, Definition dst, Operand pr
}
}
void init_any_pred_defined(Program *program, ssa_state *state, Block *block, aco_ptr<Instruction>& phi)
void
init_any_pred_defined(Program* program, ssa_state* state, Block* block, aco_ptr<Instruction>& phi)
{
std::fill(state->any_pred_defined.begin(), state->any_pred_defined.end(), false);
for (unsigned i = 0; i < block->logical_preds.size(); i++) {
@ -178,7 +182,9 @@ void init_any_pred_defined(Program *program, ssa_state *state, Block *block, aco
}
}
void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block, aco_ptr<Instruction>& phi)
void
lower_divergent_bool_phi(Program* program, ssa_state* state, Block* block,
aco_ptr<Instruction>& phi)
{
Builder bld(program);
@ -186,7 +192,8 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block,
state->all_preds_uniform = !(block->kind & block_kind_merge) &&
block->linear_preds.size() == block->logical_preds.size();
for (unsigned pred : block->logical_preds)
state->all_preds_uniform = state->all_preds_uniform && (program->blocks[pred].kind & block_kind_uniform);
state->all_preds_uniform =
state->all_preds_uniform && (program->blocks[pred].kind & block_kind_uniform);
state->checked_preds_for_uniform = true;
}
@ -230,7 +237,7 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block,
bool uniform_merge = block->kind & block_kind_loop_header;
for (unsigned i = 0; i < phi->operands.size(); i++) {
Block *pred = &program->blocks[block->logical_preds[i]];
Block* pred = &program->blocks[block->logical_preds[i]];
bool need_get_ssa = !uniform_merge;
if (block->kind & block_kind_loop_header && !(pred->kind & block_kind_uniform))
@ -254,7 +261,8 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block,
unsigned num_preds = block->linear_preds.size();
if (phi->operands.size() != num_preds) {
Pseudo_instruction* new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, num_preds, 1)};
Pseudo_instruction* new_phi{create_instruction<Pseudo_instruction>(
aco_opcode::p_linear_phi, Format::PSEUDO, num_preds, 1)};
new_phi->definitions[0] = phi->definitions[0];
phi.reset(new_phi);
} else {
@ -268,7 +276,8 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block,
return;
}
void lower_subdword_phis(Program *program, Block *block, aco_ptr<Instruction>& phi)
void
lower_subdword_phis(Program* program, Block* block, aco_ptr<Instruction>& phi)
{
Builder bld(program);
for (unsigned i = 0; i < phi->operands.size(); i++) {
@ -278,21 +287,24 @@ void lower_subdword_phis(Program *program, Block *block, aco_ptr<Instruction>& p
continue;
assert(phi->operands[i].isTemp());
Block *pred = &program->blocks[block->logical_preds[i]];
Block* pred = &program->blocks[block->logical_preds[i]];
Temp phi_src = phi->operands[i].getTemp();
assert(phi_src.regClass().type() == RegType::sgpr);
Temp tmp = bld.tmp(RegClass(RegType::vgpr, phi_src.size()));
insert_before_logical_end(pred, bld.copy(Definition(tmp), phi_src).get_ptr());
Temp new_phi_src = bld.tmp(phi->definitions[0].regClass());
insert_before_logical_end(pred, bld.pseudo(aco_opcode::p_extract_vector, Definition(new_phi_src), tmp, Operand(0u)).get_ptr());
insert_before_logical_end(
pred, bld.pseudo(aco_opcode::p_extract_vector, Definition(new_phi_src), tmp, Operand(0u))
.get_ptr());
phi->operands[i].setTemp(new_phi_src);
}
return;
}
void lower_phis(Program* program)
void
lower_phis(Program* program)
{
ssa_state state;
@ -301,7 +313,8 @@ void lower_phis(Program* program)
state.needs_init = true;
for (aco_ptr<Instruction>& phi : block.instructions) {
if (phi->opcode == aco_opcode::p_phi) {
assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1 : phi->definitions[0].regClass() != s2);
assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1
: phi->definitions[0].regClass() != s2);
if (phi->definitions[0].regClass() == program->lane_mask)
lower_divergent_bool_phi(program, &state, &block, phi);
else if (phi->definitions[0].regClass().is_subdword())
@ -313,4 +326,4 @@ void lower_phis(Program* program)
}
}
}
} // namespace aco

@ -53,32 +53,32 @@ struct copy {
struct merge_node {
Operand value = Operand(); /* original value: can be an SSA-def or constant value */
uint32_t index = -1u; /* index into the vector of merge sets */
uint32_t index = -1u; /* index into the vector of merge sets */
uint32_t defined_at = -1u; /* defining block */
/* we also remember two dominating defs with the same value: */
Temp equal_anc_in = Temp(); /* within the same merge set */
Temp equal_anc_in = Temp(); /* within the same merge set */
Temp equal_anc_out = Temp(); /* from a different set */
};
struct cssa_ctx {
Program* program;
std::vector<IDSet>& live_out; /* live-out sets per block */
std::vector<IDSet>& live_out; /* live-out sets per block */
std::vector<std::vector<copy>> parallelcopies; /* copies per block */
std::vector<merge_set> merge_sets; /* each vector is one (ordered) merge set */
std::vector<merge_set> merge_sets; /* each vector is one (ordered) merge set */
std::unordered_map<uint32_t, merge_node> merge_node_table; /* tempid -> merge node */
};
/* create (virtual) parallelcopies for each phi instruction and
* already merge copy-definitions with phi-defs into merge sets */
void collect_parallelcopies(cssa_ctx& ctx)
void
collect_parallelcopies(cssa_ctx& ctx)
{
ctx.parallelcopies.resize(ctx.program->blocks.size());
Builder bld(ctx.program);
for (Block& block : ctx.program->blocks) {
for (aco_ptr<Instruction>& phi : block.instructions) {
if (phi->opcode != aco_opcode::p_phi &&
phi->opcode != aco_opcode::p_linear_phi)
if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
break;
const Definition& def = phi->definitions[0];
@ -89,9 +89,8 @@ void collect_parallelcopies(cssa_ctx& ctx)
if (!def.isTemp())
continue;
std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ?
block.logical_preds :
block.linear_preds;
std::vector<unsigned>& preds =
phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
uint32_t index = ctx.merge_sets.size();
merge_set set;
@ -151,8 +150,8 @@ void collect_parallelcopies(cssa_ctx& ctx)
}
/* check whether the definition of a comes after b. */
inline
bool defined_after(cssa_ctx& ctx, Temp a, Temp b)
inline bool
defined_after(cssa_ctx& ctx, Temp a, Temp b)
{
merge_node& node_a = ctx.merge_node_table[a.id()];
merge_node& node_b = ctx.merge_node_table[b.id()];
@ -163,25 +162,24 @@ bool defined_after(cssa_ctx& ctx, Temp a, Temp b)
}
/* check whether a dominates b where b is defined after a */
inline
bool dominates(cssa_ctx& ctx, Temp a, Temp b)
inline bool
dominates(cssa_ctx& ctx, Temp a, Temp b)
{
assert(defined_after(ctx, b, a));
merge_node& node_a = ctx.merge_node_table[a.id()];
merge_node& node_b = ctx.merge_node_table[b.id()];
unsigned idom = node_b.defined_at;
while (idom > node_a.defined_at)
idom = b.regClass().type() == RegType::vgpr ?
ctx.program->blocks[idom].logical_idom :
ctx.program->blocks[idom].linear_idom;
idom = b.regClass().type() == RegType::vgpr ? ctx.program->blocks[idom].logical_idom
: ctx.program->blocks[idom].linear_idom;
return idom == node_a.defined_at;
}
/* check intersection between var and parent:
* We already know that parent dominates var. */
inline
bool intersects(cssa_ctx& ctx, Temp var, Temp parent)
inline bool
intersects(cssa_ctx& ctx, Temp var, Temp parent)
{
merge_node& node_var = ctx.merge_node_table[var.id()];
merge_node& node_parent = ctx.merge_node_table[parent.id()];
@ -196,9 +194,9 @@ bool intersects(cssa_ctx& ctx, Temp var, Temp parent)
/* parent is defined in a different block than var */
if (node_parent.defined_at < node_var.defined_at) {
/* if the parent is not live-in, they don't interfere */
std::vector<uint32_t>& preds = var.type() == RegType::vgpr ?
ctx.program->blocks[block_idx].logical_preds :
ctx.program->blocks[block_idx].linear_preds;
std::vector<uint32_t>& preds = var.type() == RegType::vgpr
? ctx.program->blocks[block_idx].logical_preds
: ctx.program->blocks[block_idx].linear_preds;
for (uint32_t pred : preds) {
if (!ctx.live_out[pred].count(parent.id()))
return false;
@ -246,8 +244,8 @@ bool intersects(cssa_ctx& ctx, Temp var, Temp parent)
/* check interference between var and parent:
* i.e. they have different values and intersect.
* If parent and var share the same value, also updates the equal ancestor. */
inline
bool interference(cssa_ctx& ctx, Temp var, Temp parent)
inline bool
interference(cssa_ctx& ctx, Temp var, Temp parent)
{
assert(var != parent);
merge_node& node_var = ctx.merge_node_table[var.id()];
@ -281,13 +279,14 @@ bool interference(cssa_ctx& ctx, Temp var, Temp parent)
/* tries to merge set_b into set_a of given temporary and
* drops that temporary as it is being coalesced */
bool try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)
bool
try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)
{
auto def_node_it = ctx.merge_node_table.find(dst.id());
uint32_t index = def_node_it->second.index;
merge_set& set_a = ctx.merge_sets[index];
std::vector<Temp> dom; /* stack of the traversal */
merge_set union_set; /* the new merged merge-set */
merge_set union_set; /* the new merged merge-set */
uint32_t i_a = 0;
uint32_t i_b = 0;
@ -335,7 +334,8 @@ bool try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)
}
/* returns true if the copy can safely be omitted */
bool try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)
bool
try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)
{
/* we can only coalesce temporaries */
if (!copy.op.isTemp())
@ -348,11 +348,9 @@ bool try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)
uint32_t pred = block_idx;
do {
block_idx = pred;
pred = copy.op.regClass().type() == RegType::vgpr ?
ctx.program->blocks[pred].logical_idom :
ctx.program->blocks[pred].linear_idom;
} while (block_idx != pred &&
ctx.live_out[pred].count(copy.op.tempId()));
pred = copy.op.regClass().type() == RegType::vgpr ? ctx.program->blocks[pred].logical_idom
: ctx.program->blocks[pred].linear_idom;
} while (block_idx != pred && ctx.live_out[pred].count(copy.op.tempId()));
op_node.defined_at = block_idx;
op_node.value = copy.op;
}
@ -385,7 +383,8 @@ struct ltg_node {
/* emit the copies in an order that does not
* create interferences within a merge-set */
void emit_copies_block(Builder bld, std::map<uint32_t, ltg_node>& ltg, RegType type)
void
emit_copies_block(Builder bld, std::map<uint32_t, ltg_node>& ltg, RegType type)
{
auto&& it = ltg.begin();
while (it != ltg.end()) {
@ -410,16 +409,16 @@ void emit_copies_block(Builder bld, std::map<uint32_t, ltg_node>& ltg, RegType t
}
/* count the number of remaining circular dependencies */
unsigned num = std::count_if(ltg.begin(), ltg.end(), [&] (auto& n){
return n.second.cp.def.regClass().type() == type;
});
unsigned num = std::count_if(ltg.begin(), ltg.end(),
[&](auto& n) { return n.second.cp.def.regClass().type() == type; });
/* if there are circular dependencies, we just emit them as single parallelcopy */
if (num) {
// TODO: this should be restricted to a feasible number of registers
// and otherwise use a temporary to avoid having to reload more (spilled)
// variables than we have registers.
aco_ptr<Pseudo_instruction> copy{create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)};
aco_ptr<Pseudo_instruction> copy{create_instruction<Pseudo_instruction>(
aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)};
it = ltg.begin();
for (unsigned i = 0; i < num; i++) {
while (it->second.cp.def.regClass().type() != type)
@ -435,7 +434,8 @@ void emit_copies_block(Builder bld, std::map<uint32_t, ltg_node>& ltg, RegType t
/* either emits or coalesces all parallelcopies and
* renames the phi-operands accordingly. */
void emit_parallelcopies(cssa_ctx& ctx)
void
emit_parallelcopies(cssa_ctx& ctx)
{
std::unordered_map<uint32_t, Operand> renames;
@ -476,9 +476,8 @@ void emit_parallelcopies(cssa_ctx& ctx)
Block& block = ctx.program->blocks[i];
/* emit VGPR copies */
auto IsLogicalEnd = [] (const aco_ptr<Instruction>& inst) -> bool {
return inst->opcode == aco_opcode::p_logical_end;
};
auto IsLogicalEnd = [](const aco_ptr<Instruction>& inst) -> bool
{ return inst->opcode == aco_opcode::p_logical_end; };
auto it = std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd);
bld.reset(&block.instructions, std::prev(it.base()));
emit_copies_block(bld, ltg, RegType::vgpr);
@ -494,8 +493,7 @@ void emit_parallelcopies(cssa_ctx& ctx)
/* finally, rename coalesced phi operands */
for (Block& block : ctx.program->blocks) {
for (aco_ptr<Instruction>& phi : block.instructions) {
if (phi->opcode != aco_opcode::p_phi &&
phi->opcode != aco_opcode::p_linear_phi)
if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
break;
for (Operand& op : phi->operands) {
@ -514,8 +512,8 @@ void emit_parallelcopies(cssa_ctx& ctx)
} /* end namespace */
void lower_to_cssa(Program* program, live& live_vars)
void
lower_to_cssa(Program* program, live& live_vars)
{
reindex_ssa(program, live_vars.live_out);
cssa_ctx ctx = {program, live_vars.live_out};
@ -525,5 +523,4 @@ void lower_to_cssa(Program* program, live& live_vars)
/* update live variable information */
live_vars = live_var_analysis(program);
}
}
} // namespace aco

File diff suppressed because it is too large.

@ -36,8 +36,9 @@
namespace aco {
namespace {
inline
uint32_t murmur_32_scramble(uint32_t h, uint32_t k) {
inline uint32_t
murmur_32_scramble(uint32_t h, uint32_t k)
{
k *= 0xcc9e2d51;
k = (k << 15) | (k >> 17);
h ^= k * 0x1b873593;
@ -46,8 +47,9 @@ uint32_t murmur_32_scramble(uint32_t h, uint32_t k) {
return h;
}
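/* This is (a variant of) the MurmurHash3 mixing step: each 32-bit word is combined
 * with the constants 0xcc9e2d51/0x1b873593 and a 15-bit rotate, then folded into
 * the running hash, e.g.
 *
 *   uint32_t h = 0;
 *   h = murmur_32_scramble(h, first_word);
 *   h = murmur_32_scramble(h, second_word); // order-sensitive, as intended
 */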
template<typename T>
uint32_t hash_murmur_32(Instruction* instr)
template <typename T>
uint32_t
hash_murmur_32(Instruction* instr)
{
uint32_t hash = uint32_t(instr->format) << 16 | uint32_t(instr->opcode);
@ -58,7 +60,7 @@ uint32_t hash_murmur_32(Instruction* instr)
for (unsigned i = 2; i < (sizeof(T) >> 2); i++) {
uint32_t u;
/* Accesses it through a byte array, so doesn't violate the strict aliasing rule */
memcpy(&u, reinterpret_cast<uint8_t *>(instr) + i * 4, 4);
memcpy(&u, reinterpret_cast<uint8_t*>(instr) + i * 4, 4);
hash = murmur_32_scramble(hash, u);
}
@ -92,32 +94,19 @@ struct InstrHash {
return hash_murmur_32<SDWA_instruction>(instr);
switch (instr->format) {
case Format::SMEM:
return hash_murmur_32<SMEM_instruction>(instr);
case Format::VINTRP:
return hash_murmur_32<Interp_instruction>(instr);
case Format::DS:
return hash_murmur_32<DS_instruction>(instr);
case Format::SOPP:
return hash_murmur_32<SOPP_instruction>(instr);
case Format::SOPK:
return hash_murmur_32<SOPK_instruction>(instr);
case Format::EXP:
return hash_murmur_32<Export_instruction>(instr);
case Format::MUBUF:
return hash_murmur_32<MUBUF_instruction>(instr);
case Format::MIMG:
return hash_murmur_32<MIMG_instruction>(instr);
case Format::MTBUF:
return hash_murmur_32<MTBUF_instruction>(instr);
case Format::FLAT:
return hash_murmur_32<FLAT_instruction>(instr);
case Format::PSEUDO_BRANCH:
return hash_murmur_32<Pseudo_branch_instruction>(instr);
case Format::PSEUDO_REDUCTION:
return hash_murmur_32<Pseudo_reduction_instruction>(instr);
default:
return hash_murmur_32<Instruction>(instr);
case Format::SMEM: return hash_murmur_32<SMEM_instruction>(instr);
case Format::VINTRP: return hash_murmur_32<Interp_instruction>(instr);
case Format::DS: return hash_murmur_32<DS_instruction>(instr);
case Format::SOPP: return hash_murmur_32<SOPP_instruction>(instr);
case Format::SOPK: return hash_murmur_32<SOPK_instruction>(instr);
case Format::EXP: return hash_murmur_32<Export_instruction>(instr);
case Format::MUBUF: return hash_murmur_32<MUBUF_instruction>(instr);
case Format::MIMG: return hash_murmur_32<MIMG_instruction>(instr);
case Format::MTBUF: return hash_murmur_32<MTBUF_instruction>(instr);
case Format::FLAT: return hash_murmur_32<FLAT_instruction>(instr);
case Format::PSEUDO_BRANCH: return hash_murmur_32<Pseudo_branch_instruction>(instr);
case Format::PSEUDO_REDUCTION: return hash_murmur_32<Pseudo_reduction_instruction>(instr);
default: return hash_murmur_32<Instruction>(instr);
}
}
};
@ -129,7 +118,8 @@ struct InstrPred {
return false;
if (a->opcode != b->opcode)
return false;
if (a->operands.size() != b->operands.size() || a->definitions.size() != b->definitions.size())
if (a->operands.size() != b->operands.size() ||
a->definitions.size() != b->definitions.size())
return false; /* possible with pseudo-instructions */
for (unsigned i = 0; i < a->operands.size(); i++) {
if (a->operands[i].isConstant()) {
@ -137,14 +127,12 @@ struct InstrPred {
return false;
if (a->operands[i].constantValue() != b->operands[i].constantValue())
return false;
}
else if (a->operands[i].isTemp()) {
} else if (a->operands[i].isTemp()) {
if (!b->operands[i].isTemp())
return false;
if (a->operands[i].tempId() != b->operands[i].tempId())
return false;
}
else if (a->operands[i].isUndefined() ^ b->operands[i].isUndefined())
} else if (a->operands[i].isUndefined() ^ b->operands[i].isUndefined())
return false;
if (a->operands[i].isFixed()) {
if (!b->operands[i].isFixed())
@ -179,154 +167,110 @@ struct InstrPred {
VOP3_instruction& a3 = a->vop3();
VOP3_instruction& b3 = b->vop3();
for (unsigned i = 0; i < 3; i++) {
if (a3.abs[i] != b3.abs[i] ||
a3.neg[i] != b3.neg[i])
if (a3.abs[i] != b3.abs[i] || a3.neg[i] != b3.neg[i])
return false;
}
return a3.clamp == b3.clamp &&
a3.omod == b3.omod &&
a3.opsel == b3.opsel;
return a3.clamp == b3.clamp && a3.omod == b3.omod && a3.opsel == b3.opsel;
}
if (a->isDPP()) {
DPP_instruction& aDPP = a->dpp();
DPP_instruction& bDPP = b->dpp();
return aDPP.pass_flags == bDPP.pass_flags &&
aDPP.dpp_ctrl == bDPP.dpp_ctrl &&
aDPP.bank_mask == bDPP.bank_mask &&
aDPP.row_mask == bDPP.row_mask &&
aDPP.bound_ctrl == bDPP.bound_ctrl &&
aDPP.abs[0] == bDPP.abs[0] &&
aDPP.abs[1] == bDPP.abs[1] &&
aDPP.neg[0] == bDPP.neg[0] &&
return aDPP.pass_flags == bDPP.pass_flags && aDPP.dpp_ctrl == bDPP.dpp_ctrl &&
aDPP.bank_mask == bDPP.bank_mask && aDPP.row_mask == bDPP.row_mask &&
aDPP.bound_ctrl == bDPP.bound_ctrl && aDPP.abs[0] == bDPP.abs[0] &&
aDPP.abs[1] == bDPP.abs[1] && aDPP.neg[0] == bDPP.neg[0] &&
aDPP.neg[1] == bDPP.neg[1];
}
if (a->isSDWA()) {
SDWA_instruction& aSDWA = a->sdwa();
SDWA_instruction& bSDWA = b->sdwa();
return aSDWA.sel[0] == bSDWA.sel[0] &&
aSDWA.sel[1] == bSDWA.sel[1] &&
aSDWA.dst_sel == bSDWA.dst_sel &&
aSDWA.abs[0] == bSDWA.abs[0] &&
aSDWA.abs[1] == bSDWA.abs[1] &&
aSDWA.neg[0] == bSDWA.neg[0] &&
aSDWA.neg[1] == bSDWA.neg[1] &&
aSDWA.dst_preserve == bSDWA.dst_preserve &&
aSDWA.clamp == bSDWA.clamp &&
aSDWA.omod == bSDWA.omod;
return aSDWA.sel[0] == bSDWA.sel[0] && aSDWA.sel[1] == bSDWA.sel[1] &&
aSDWA.dst_sel == bSDWA.dst_sel && aSDWA.abs[0] == bSDWA.abs[0] &&
aSDWA.abs[1] == bSDWA.abs[1] && aSDWA.neg[0] == bSDWA.neg[0] &&
aSDWA.neg[1] == bSDWA.neg[1] && aSDWA.dst_preserve == bSDWA.dst_preserve &&
aSDWA.clamp == bSDWA.clamp && aSDWA.omod == bSDWA.omod;
}
switch (a->format) {
case Format::SOPK: {
if (a->opcode == aco_opcode::s_getreg_b32)
case Format::SOPK: {
if (a->opcode == aco_opcode::s_getreg_b32)
return false;
SOPK_instruction& aK = a->sopk();
SOPK_instruction& bK = b->sopk();
return aK.imm == bK.imm;
}
case Format::SMEM: {
SMEM_instruction& aS = a->smem();
SMEM_instruction& bS = b->smem();
/* isel shouldn't be creating situations where this assertion fails */
assert(aS.prevent_overflow == bS.prevent_overflow);
return aS.sync == bS.sync && aS.glc == bS.glc && aS.dlc == bS.dlc && aS.nv == bS.nv &&
aS.disable_wqm == bS.disable_wqm && aS.prevent_overflow == bS.prevent_overflow;
}
case Format::VINTRP: {
Interp_instruction& aI = a->vintrp();
Interp_instruction& bI = b->vintrp();
if (aI.attribute != bI.attribute)
return false;
if (aI.component != bI.component)
return false;
return true;
}
case Format::VOP3P: {
VOP3P_instruction& a3P = a->vop3p();
VOP3P_instruction& b3P = b->vop3p();
for (unsigned i = 0; i < 3; i++) {
if (a3P.neg_lo[i] != b3P.neg_lo[i] || a3P.neg_hi[i] != b3P.neg_hi[i])
return false;
SOPK_instruction& aK = a->sopk();
SOPK_instruction& bK = b->sopk();
return aK.imm == bK.imm;
}
case Format::SMEM: {
SMEM_instruction& aS = a->smem();
SMEM_instruction& bS = b->smem();
/* isel shouldn't be creating situations where this assertion fails */
assert(aS.prevent_overflow == bS.prevent_overflow);
return aS.sync == bS.sync && aS.glc == bS.glc && aS.dlc == bS.dlc &&
aS.nv == bS.nv && aS.disable_wqm == bS.disable_wqm &&
aS.prevent_overflow == bS.prevent_overflow;
}
case Format::VINTRP: {
Interp_instruction& aI = a->vintrp();
Interp_instruction& bI = b->vintrp();
if (aI.attribute != bI.attribute)
return false;
if (aI.component != bI.component)
return false;
return true;
}
case Format::VOP3P: {
VOP3P_instruction& a3P = a->vop3p();
VOP3P_instruction& b3P = b->vop3p();
for (unsigned i = 0; i < 3; i++) {
if (a3P.neg_lo[i] != b3P.neg_lo[i] ||
a3P.neg_hi[i] != b3P.neg_hi[i])
return false;
}
return a3P.opsel_lo == b3P.opsel_lo &&
a3P.opsel_hi == b3P.opsel_hi &&
a3P.clamp == b3P.clamp;
}
case Format::PSEUDO_REDUCTION: {
Pseudo_reduction_instruction& aR = a->reduction();
Pseudo_reduction_instruction& bR = b->reduction();
return aR.pass_flags == bR.pass_flags &&
aR.reduce_op == bR.reduce_op &&
aR.cluster_size == bR.cluster_size;
}
case Format::DS: {
assert(a->opcode == aco_opcode::ds_bpermute_b32 ||
a->opcode == aco_opcode::ds_permute_b32 ||
a->opcode == aco_opcode::ds_swizzle_b32);
DS_instruction& aD = a->ds();
DS_instruction& bD = b->ds();
return aD.sync == bD.sync &&
aD.pass_flags == bD.pass_flags &&
aD.gds == bD.gds &&
aD.offset0 == bD.offset0 &&
aD.offset1 == bD.offset1;
}
case Format::MTBUF: {
MTBUF_instruction& aM = a->mtbuf();
MTBUF_instruction& bM = b->mtbuf();
return aM.sync == bM.sync &&
aM.dfmt == bM.dfmt &&
aM.nfmt == bM.nfmt &&
aM.offset == bM.offset &&
aM.offen == bM.offen &&
aM.idxen == bM.idxen &&
aM.glc == bM.glc &&
aM.dlc == bM.dlc &&
aM.slc == bM.slc &&
aM.tfe == bM.tfe &&
aM.disable_wqm == bM.disable_wqm;
}
case Format::MUBUF: {
MUBUF_instruction& aM = a->mubuf();
MUBUF_instruction& bM = b->mubuf();
return aM.sync == bM.sync &&
aM.offset == bM.offset &&
aM.offen == bM.offen &&
aM.idxen == bM.idxen &&
aM.glc == bM.glc &&
aM.dlc == bM.dlc &&
aM.slc == bM.slc &&
aM.tfe == bM.tfe &&
aM.lds == bM.lds &&
aM.disable_wqm == bM.disable_wqm;
}
case Format::MIMG: {
MIMG_instruction& aM = a->mimg();
MIMG_instruction& bM = b->mimg();
return aM.sync == bM.sync &&
aM.dmask == bM.dmask &&
aM.unrm == bM.unrm &&
aM.glc == bM.glc &&
aM.slc == bM.slc &&
aM.tfe == bM.tfe &&
aM.da == bM.da &&
aM.lwe == bM.lwe &&
aM.r128 == bM.r128 &&
aM.a16 == bM.a16 &&
aM.d16 == bM.d16 &&
aM.disable_wqm == bM.disable_wqm;
}
case Format::FLAT:
case Format::GLOBAL:
case Format::SCRATCH:
case Format::EXP:
case Format::SOPP:
case Format::PSEUDO_BRANCH:
case Format::PSEUDO_BARRIER:
assert(false);
default:
return true;
return a3P.opsel_lo == b3P.opsel_lo && a3P.opsel_hi == b3P.opsel_hi &&
a3P.clamp == b3P.clamp;
}
case Format::PSEUDO_REDUCTION: {
Pseudo_reduction_instruction& aR = a->reduction();
Pseudo_reduction_instruction& bR = b->reduction();
return aR.pass_flags == bR.pass_flags && aR.reduce_op == bR.reduce_op &&
aR.cluster_size == bR.cluster_size;
}
case Format::DS: {
assert(a->opcode == aco_opcode::ds_bpermute_b32 ||
a->opcode == aco_opcode::ds_permute_b32 || a->opcode == aco_opcode::ds_swizzle_b32);
DS_instruction& aD = a->ds();
DS_instruction& bD = b->ds();
return aD.sync == bD.sync && aD.pass_flags == bD.pass_flags && aD.gds == bD.gds &&
aD.offset0 == bD.offset0 && aD.offset1 == bD.offset1;
}
case Format::MTBUF: {
MTBUF_instruction& aM = a->mtbuf();
MTBUF_instruction& bM = b->mtbuf();
return aM.sync == bM.sync && aM.dfmt == bM.dfmt && aM.nfmt == bM.nfmt &&
aM.offset == bM.offset && aM.offen == bM.offen && aM.idxen == bM.idxen &&
aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc && aM.tfe == bM.tfe &&
aM.disable_wqm == bM.disable_wqm;
}
case Format::MUBUF: {
MUBUF_instruction& aM = a->mubuf();
MUBUF_instruction& bM = b->mubuf();
return aM.sync == bM.sync && aM.offset == bM.offset && aM.offen == bM.offen &&
aM.idxen == bM.idxen && aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc &&
aM.tfe == bM.tfe && aM.lds == bM.lds && aM.disable_wqm == bM.disable_wqm;
}
case Format::MIMG: {
MIMG_instruction& aM = a->mimg();
MIMG_instruction& bM = b->mimg();
return aM.sync == bM.sync && aM.dmask == bM.dmask && aM.unrm == bM.unrm &&
aM.glc == bM.glc && aM.slc == bM.slc && aM.tfe == bM.tfe && aM.da == bM.da &&
aM.lwe == bM.lwe && aM.r128 == bM.r128 && aM.a16 == bM.a16 && aM.d16 == bM.d16 &&
aM.disable_wqm == bM.disable_wqm;
}
case Format::FLAT:
case Format::GLOBAL:
case Format::SCRATCH:
case Format::EXP:
case Format::SOPP:
case Format::PSEUDO_BRANCH:
case Format::PSEUDO_BARRIER: assert(false);
default: return true;
}
}
};
@ -345,7 +289,8 @@ struct vn_ctx {
*/
uint32_t exec_id = 1;
vn_ctx(Program* program_) : program(program_) {
vn_ctx(Program* program_) : program(program_)
{
static_assert(sizeof(Temp) == 4, "Temp must fit in 32bits");
unsigned size = 0;
for (Block& block : program->blocks)
@ -354,11 +299,11 @@ struct vn_ctx {
}
};
/* dominates() returns true if the parent block dominates the child block and
* if the parent block is part of the same loop or has a smaller loop nest depth.
*/
bool dominates(vn_ctx& ctx, uint32_t parent, uint32_t child)
bool
dominates(vn_ctx& ctx, uint32_t parent, uint32_t child)
{
unsigned parent_loop_nest_depth = ctx.program->blocks[parent].loop_nest_depth;
while (parent < child && parent_loop_nest_depth <= ctx.program->blocks[child].loop_nest_depth)
@ -375,42 +320,40 @@ bool dominates(vn_ctx& ctx, uint32_t parent, uint32_t child)
* Note that expr_set must not be used with instructions
* which cannot be eliminated.
*/
bool can_eliminate(aco_ptr<Instruction>& instr)
bool
can_eliminate(aco_ptr<Instruction>& instr)
{
switch (instr->format) {
case Format::FLAT:
case Format::GLOBAL:
case Format::SCRATCH:
case Format::EXP:
case Format::SOPP:
case Format::PSEUDO_BRANCH:
case Format::PSEUDO_BARRIER:
case Format::FLAT:
case Format::GLOBAL:
case Format::SCRATCH:
case Format::EXP:
case Format::SOPP:
case Format::PSEUDO_BRANCH:
case Format::PSEUDO_BARRIER: return false;
case Format::DS:
return instr->opcode == aco_opcode::ds_bpermute_b32 ||
instr->opcode == aco_opcode::ds_permute_b32 ||
instr->opcode == aco_opcode::ds_swizzle_b32;
case Format::SMEM:
case Format::MUBUF:
case Format::MIMG:
case Format::MTBUF:
if (!get_sync_info(instr.get()).can_reorder())
return false;
case Format::DS:
return instr->opcode == aco_opcode::ds_bpermute_b32 ||
instr->opcode == aco_opcode::ds_permute_b32 ||
instr->opcode == aco_opcode::ds_swizzle_b32;
case Format::SMEM:
case Format::MUBUF:
case Format::MIMG:
case Format::MTBUF:
if (!get_sync_info(instr.get()).can_reorder())
return false;
break;
default:
break;
break;
default: break;
}
if (instr->definitions.empty() ||
instr->opcode == aco_opcode::p_phi ||
instr->opcode == aco_opcode::p_linear_phi ||
instr->definitions[0].isNoCSE())
if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi ||
instr->opcode == aco_opcode::p_linear_phi || instr->definitions[0].isNoCSE())
return false;
return true;
}
void process_block(vn_ctx& ctx, Block& block)
void
process_block(vn_ctx& ctx, Block& block)
{
std::vector<aco_ptr<Instruction>> new_instructions;
new_instructions.reserve(block.instructions.size());
@ -435,8 +378,9 @@ void process_block(vn_ctx& ctx, Block& block)
}
/* simple copy-propagation through renaming */
bool copy_instr = instr->opcode == aco_opcode::p_parallelcopy ||
(instr->opcode == aco_opcode::p_create_vector && instr->operands.size() == 1);
bool copy_instr =
instr->opcode == aco_opcode::p_parallelcopy ||
(instr->opcode == aco_opcode::p_create_vector && instr->operands.size() == 1);
if (copy_instr && !instr->definitions[0].isFixed() && instr->operands[0].isTemp() &&
instr->operands[0].regClass() == instr->definitions[0].regClass()) {
ctx.renames[instr->definitions[0].tempId()] = instr->operands[0].getTemp();
@ -479,7 +423,8 @@ void process_block(vn_ctx& ctx, Block& block)
block.instructions = std::move(new_instructions);
}
void rename_phi_operands(Block& block, std::map<uint32_t, Temp>& renames)
void
rename_phi_operands(Block& block, std::map<uint32_t, Temp>& renames)
{
for (aco_ptr<Instruction>& phi : block.instructions) {
if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
@ -496,8 +441,8 @@ void rename_phi_operands(Block& block, std::map<uint32_t, Temp>& renames)
}
} /* end namespace */
void value_numbering(Program* program)
void
value_numbering(Program* program)
{
vn_ctx ctx(program);
std::vector<unsigned> loop_headers;
@ -521,10 +466,8 @@ void value_numbering(Program* program)
rename_phi_operands(block, ctx.renames);
/* increment exec_id when entering nested control flow */
if (block.kind & block_kind_branch ||
block.kind & block_kind_loop_preheader ||
block.kind & block_kind_break ||
block.kind & block_kind_continue ||
if (block.kind & block_kind_branch || block.kind & block_kind_loop_preheader ||
block.kind & block_kind_break || block.kind & block_kind_continue ||
block.kind & block_kind_discard)
ctx.exec_id++;
else if (block.kind & block_kind_continue_or_break)
@ -538,4 +481,4 @@ void value_numbering(Program* program)
}
}
}
} // namespace aco

File diff suppressed because it is too large.

@ -24,9 +24,9 @@
#include "aco_ir.h"
#include <bitset>
#include <algorithm>
#include <array>
#include <bitset>
#include <vector>
namespace aco {
@ -41,15 +41,14 @@ enum {
written_by_multiple_instrs = -4,
};
struct pr_opt_ctx
{
Program *program;
Block *current_block;
struct pr_opt_ctx {
Program* program;
Block* current_block;
int current_instr_idx;
std::vector<uint16_t> uses;
std::array<int, max_reg_cnt * 4u> instr_idx_by_regs;
void reset_block(Block *block)
void reset_block(Block* block)
{
current_block = block;
current_instr_idx = -1;
@ -57,9 +56,10 @@ struct pr_opt_ctx
}
};
void save_reg_writes(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
void
save_reg_writes(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
for (const Definition &def : instr->definitions) {
for (const Definition& def : instr->definitions) {
assert(def.regClass().type() != RegType::sgpr || def.physReg().reg() <= 255);
assert(def.regClass().type() != RegType::vgpr || def.physReg().reg() >= 256);
@ -75,20 +75,21 @@ void save_reg_writes(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
}
}
int last_writer_idx(pr_opt_ctx &ctx, PhysReg physReg, RegClass rc)
int
last_writer_idx(pr_opt_ctx& ctx, PhysReg physReg, RegClass rc)
{
/* Verify that all of the operand's registers are written by the same instruction. */
int instr_idx = ctx.instr_idx_by_regs[physReg.reg()];
unsigned dw_size = DIV_ROUND_UP(rc.bytes(), 4u);
unsigned r = physReg.reg();
bool all_same = std::all_of(
&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size],
[instr_idx](int i) { return i == instr_idx; });
bool all_same = std::all_of(&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size],
[instr_idx](int i) { return i == instr_idx; });
return all_same ? instr_idx : written_by_multiple_instrs;
}
int last_writer_idx(pr_opt_ctx &ctx, const Operand &op)
int
last_writer_idx(pr_opt_ctx& ctx, const Operand& op)
{
if (op.isConstant() || op.isUndefined())
return const_or_undef;
@ -104,7 +105,8 @@ int last_writer_idx(pr_opt_ctx &ctx, const Operand &op)
return instr_idx;
}
void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
void
try_apply_branch_vcc(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
/* We are looking for the following pattern:
*
@ -123,8 +125,7 @@ void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
if (ctx.program->chip_class < GFX8)
return;
if (instr->format != Format::PSEUDO_BRANCH ||
instr->operands.size() == 0 ||
if (instr->format != Format::PSEUDO_BRANCH || instr->operands.size() == 0 ||
instr->operands[0].physReg() != scc)
return;
@ -141,13 +142,12 @@ void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
last_exec_wr_idx > last_vcc_wr_idx || last_exec_wr_idx < not_written_in_block)
return;
aco_ptr<Instruction> &op0_instr = ctx.current_block->instructions[op0_instr_idx];
aco_ptr<Instruction> &last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx];
aco_ptr<Instruction>& op0_instr = ctx.current_block->instructions[op0_instr_idx];
aco_ptr<Instruction>& last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx];
if ((op0_instr->opcode != aco_opcode::s_and_b64 /* wave64 */ &&
op0_instr->opcode != aco_opcode::s_and_b32 /* wave32 */) ||
op0_instr->operands[0].physReg() != vcc ||
op0_instr->operands[1].physReg() != exec ||
op0_instr->operands[0].physReg() != vcc || op0_instr->operands[1].physReg() != exec ||
!last_vcc_wr->isVOPC())
return;
@ -159,7 +159,8 @@ void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
instr->operands[0] = op0_instr->operands[0];
}
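/* Schematic before/after of the pattern above (wave64):
 *
 *   v_cmp_lt_f32  vcc, v0, v1
 *   s_and_b64     s[0:1], vcc, exec   ; also writes scc
 *   p_cbranch_z   scc
 *
 * becomes
 *
 *   v_cmp_lt_f32  vcc, v0, v1
 *   p_cbranch_z   vcc                 ; lowered to s_cbranch_vccz
 *
 * assuming neither exec nor vcc is clobbered in between, which is exactly what
 * the writer-index checks above verify. */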
void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
void
try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
/* We are looking for the following pattern:
*
@ -180,8 +181,7 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
if (instr->isSOPC() &&
(instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 ||
instr->opcode == aco_opcode::s_cmp_lg_u32 || instr->opcode == aco_opcode::s_cmp_lg_i32 ||
instr->opcode == aco_opcode::s_cmp_eq_u64 ||
instr->opcode == aco_opcode::s_cmp_lg_u64) &&
instr->opcode == aco_opcode::s_cmp_eq_u64 || instr->opcode == aco_opcode::s_cmp_lg_u64) &&
(instr->operands[0].constantEquals(0) || instr->operands[1].constantEquals(0)) &&
(instr->operands[0].isTemp() || instr->operands[1].isTemp())) {
/* Make sure the constant is always in operand 1 */
@ -197,8 +197,9 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
if (wr_idx < 0 || wr_idx != sccwr_idx)
return;
aco_ptr<Instruction> &wr_instr = ctx.current_block->instructions[wr_idx];
if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 || wr_instr->definitions[1].physReg() != scc)
aco_ptr<Instruction>& wr_instr = ctx.current_block->instructions[wr_idx];
if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 ||
wr_instr->definitions[1].physReg() != scc)
return;
/* Look for instructions which set SCC := (D != 0) */
@ -232,10 +233,8 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
case aco_opcode::s_ashr_i32:
case aco_opcode::s_ashr_i64:
case aco_opcode::s_abs_i32:
case aco_opcode::s_absdiff_i32:
break;
default:
return;
case aco_opcode::s_absdiff_i32: break;
default: return;
}
/* Use the SCC def from wr_instr */
@ -245,13 +244,12 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
/* Set the opcode and operand to 32-bit */
instr->operands[1] = Operand(0u);
instr->opcode = (instr->opcode == aco_opcode::s_cmp_eq_u32 ||
instr->opcode == aco_opcode::s_cmp_eq_i32 ||
instr->opcode == aco_opcode::s_cmp_eq_u64)
? aco_opcode::s_cmp_eq_u32
: aco_opcode::s_cmp_lg_u32;
} else if ((instr->format == Format::PSEUDO_BRANCH &&
instr->operands.size() == 1 &&
instr->opcode =
(instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 ||
instr->opcode == aco_opcode::s_cmp_eq_u64)
? aco_opcode::s_cmp_eq_u32
: aco_opcode::s_cmp_lg_u32;
} else if ((instr->format == Format::PSEUDO_BRANCH && instr->operands.size() == 1 &&
instr->operands[0].physReg() == scc) ||
instr->opcode == aco_opcode::s_cselect_b32) {
@ -265,10 +263,11 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
if (wr_idx < 0)
return;
aco_ptr<Instruction> &wr_instr = ctx.current_block->instructions[wr_idx];
aco_ptr<Instruction>& wr_instr = ctx.current_block->instructions[wr_idx];
/* Check if we found the pattern above. */
if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 && wr_instr->opcode != aco_opcode::s_cmp_lg_u32)
if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 &&
wr_instr->opcode != aco_opcode::s_cmp_lg_u32)
return;
if (wr_instr->operands[0].physReg() != scc)
return;
@ -282,11 +281,13 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
if (wr_instr->opcode == aco_opcode::s_cmp_eq_u32) {
/* Flip the meaning of the instruction to correctly use the SCC. */
if (instr->format == Format::PSEUDO_BRANCH)
instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz : aco_opcode::p_cbranch_z;
instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz
: aco_opcode::p_cbranch_z;
else if (instr->opcode == aco_opcode::s_cselect_b32)
std::swap(instr->operands[0], instr->operands[1]);
else
unreachable("scc_nocompare optimization is only implemented for p_cbranch and s_cselect");
unreachable(
"scc_nocompare optimization is only implemented for p_cbranch and s_cselect");
}
/* Use the SCC def from the original instruction, not the comparison */
@ -295,7 +296,8 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
}
}
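/* Schematic before/after of the first pattern (many SALU ops already set
 * scc = (result != 0), making the comparison redundant):
 *
 *   s_and_b32     s0, s1, s2          ; writes scc = (s0 != 0)
 *   s_cmp_lg_u32  s0, 0
 *   p_cbranch_z   scc
 *
 * becomes
 *
 *   s_and_b32     s0, s1, s2
 *   p_cbranch_z   scc                 ; reuses the s_and's scc definition
 */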
void process_instruction(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
void
process_instruction(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
ctx.current_instr_idx++;
@ -307,9 +309,10 @@ void process_instruction(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
save_reg_writes(ctx, instr);
}
} /* End of empty namespace */
} // namespace
void optimize_postRA(Program* program)
void
optimize_postRA(Program* program)
{
pr_opt_ctx ctx;
ctx.program = program;
@ -319,10 +322,10 @@ void optimize_postRA(Program* program)
* Goes through each instruction exactly once, and can transform
* instructions or adjust the use counts of temps.
*/
for (auto &block : program->blocks) {
for (auto& block : program->blocks) {
ctx.reset_block(&block);
for (aco_ptr<Instruction> &instr : block.instructions)
for (aco_ptr<Instruction>& instr : block.instructions)
process_instruction(ctx, instr);
}
@ -330,13 +333,12 @@ void optimize_postRA(Program* program)
* Gets rid of instructions which are manually deleted or
* no longer have any uses.
*/
for (auto &block : program->blocks) {
auto new_end = std::remove_if(
block.instructions.begin(), block.instructions.end(),
[&ctx](const aco_ptr<Instruction> &instr) { return !instr || is_dead(ctx.uses, instr.get()); });
for (auto& block : program->blocks) {
auto new_end = std::remove_if(block.instructions.begin(), block.instructions.end(),
[&ctx](const aco_ptr<Instruction>& instr)
{ return !instr || is_dead(ctx.uses, instr.get()); });
block.instructions.resize(new_end - block.instructions.begin());
}
}
} /* End of aco namespace */
} // namespace aco

View File

@ -39,17 +39,17 @@ namespace {
/* LLVM disassembler only supports GFX8+, try to disassemble with CLRXdisasm
* for GFX6-GFX7 if found on the system, this is better than nothing.
*/
bool print_asm_gfx6_gfx7(Program *program, std::vector<uint32_t>& binary,
FILE *output)
*/
bool
print_asm_gfx6_gfx7(Program* program, std::vector<uint32_t>& binary, FILE* output)
{
#ifdef _WIN32
return true;
#else
char path[] = "/tmp/fileXXXXXX";
char line[2048], command[128];
const char *gpu_type;
FILE *p;
const char* gpu_type;
FILE* p;
int fd;
/* Dump the binary into a temporary file. */
@ -57,8 +57,7 @@ bool print_asm_gfx6_gfx7(Program *program, std::vector<uint32_t>& binary,
if (fd < 0)
return true;
for (uint32_t w : binary)
{
for (uint32_t w : binary) {
if (write(fd, &w, sizeof(w)) == -1)
goto fail;
}
@ -69,30 +68,16 @@ bool print_asm_gfx6_gfx7(Program *program, std::vector<uint32_t>& binary,
switch (program->chip_class) {
case GFX6:
switch (program->family) {
case CHIP_TAHITI:
gpu_type = "tahiti";
break;
case CHIP_PITCAIRN:
gpu_type = "pitcairn";
break;
case CHIP_VERDE:
gpu_type = "capeverde";
break;
case CHIP_OLAND:
gpu_type = "oland";
break;
case CHIP_HAINAN:
gpu_type = "hainan";
break;
default:
unreachable("Invalid GFX6 family!");
case CHIP_TAHITI: gpu_type = "tahiti"; break;
case CHIP_PITCAIRN: gpu_type = "pitcairn"; break;
case CHIP_VERDE: gpu_type = "capeverde"; break;
case CHIP_OLAND: gpu_type = "oland"; break;
case CHIP_HAINAN: gpu_type = "hainan"; break;
default: unreachable("Invalid GFX6 family!");
}
break;
case GFX7:
gpu_type = "gfx700";
break;
default:
unreachable("Invalid chip class!");
case GFX7: gpu_type = "gfx700"; break;
default: unreachable("Invalid chip class!");
}
sprintf(command, "clrxdisasm --gpuType=%s -r %s", gpu_type, path);
@ -121,22 +106,21 @@ fail:
#endif
}
std::pair<bool, size_t> disasm_instr(chip_class chip, LLVMDisasmContextRef disasm,
uint32_t *binary, unsigned exec_size, size_t pos,
char *outline, unsigned outline_size)
std::pair<bool, size_t>
disasm_instr(chip_class chip, LLVMDisasmContextRef disasm, uint32_t* binary, unsigned exec_size,
size_t pos, char* outline, unsigned outline_size)
{
/* mask out src2 on v_writelane_b32 */
if (((chip == GFX8 || chip == GFX9) && (binary[pos] & 0xffff8000) == 0xd28a0000) ||
(chip >= GFX10 && (binary[pos] & 0xffff8000) == 0xd7610000)) {
binary[pos+1] = binary[pos+1] & 0xF803FFFF;
binary[pos + 1] = binary[pos + 1] & 0xF803FFFF;
}
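   /* Illustrative note (not from this commit): 0xF803FFFF clears bits [26:18]
    * of the second VOP3 dword, which hold src2, so the disassembler is not
    * fed the stale lane-select source. */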
size_t l = LLVMDisasmInstruction(disasm, (uint8_t *) &binary[pos],
(exec_size - pos) * sizeof(uint32_t), pos * 4,
outline, outline_size);
size_t l =
LLVMDisasmInstruction(disasm, (uint8_t*)&binary[pos], (exec_size - pos) * sizeof(uint32_t),
pos * 4, outline, outline_size);
if (chip >= GFX10 && l == 8 &&
((binary[pos] & 0xffff0000) == 0xd7610000) &&
if (chip >= GFX10 && l == 8 && ((binary[pos] & 0xffff0000) == 0xd7610000) &&
((binary[pos + 1] & 0x1ff) == 0xff)) {
/* v_writelane with literal uses 3 dwords but llvm consumes only 2 */
l += 4;
@ -145,14 +129,14 @@ std::pair<bool, size_t> disasm_instr(chip_class chip, LLVMDisasmContextRef disas
bool invalid = false;
size_t size;
if (!l &&
((chip >= GFX9 && (binary[pos] & 0xffff8000) == 0xd1348000) || /* v_add_u32_e64 + clamp */
((chip >= GFX9 && (binary[pos] & 0xffff8000) == 0xd1348000) || /* v_add_u32_e64 + clamp */
(chip >= GFX10 && (binary[pos] & 0xffff8000) == 0xd7038000) || /* v_add_u16_e64 + clamp */
(chip <= GFX9 && (binary[pos] & 0xffff8000) == 0xd1268000) || /* v_add_u16_e64 + clamp */
(chip <= GFX9 && (binary[pos] & 0xffff8000) == 0xd1268000) || /* v_add_u16_e64 + clamp */
(chip >= GFX10 && (binary[pos] & 0xffff8000) == 0xd76d8000) || /* v_add3_u32 + clamp */
(chip == GFX9 && (binary[pos] & 0xffff8000) == 0xd1ff8000)) /* v_add3_u32 + clamp */) {
strcpy(outline, "\tinteger addition + clamp");
bool has_literal = chip >= GFX10 &&
(((binary[pos+1] & 0x1ff) == 0xff) || (((binary[pos+1] >> 9) & 0x1ff) == 0xff));
bool has_literal = chip >= GFX10 && (((binary[pos + 1] & 0x1ff) == 0xff) ||
(((binary[pos + 1] >> 9) & 0x1ff) == 0xff));
size = 2 + has_literal;
} else if (chip >= GFX10 && l == 4 && ((binary[pos] & 0xfe0001ff) == 0x020000f9)) {
strcpy(outline, "\tv_cndmask_b32 + sdwa");
@ -170,8 +154,8 @@ std::pair<bool, size_t> disasm_instr(chip_class chip, LLVMDisasmContextRef disas
}
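/* Illustrative note (not from this commit): the 0x1ff masks above test 9-bit
 * VOP3 source fields; the value 255 in such a field is the literal-constant
 * encoding, which is why those instructions consume one extra dword. */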
} /* end namespace */
bool print_asm(Program *program, std::vector<uint32_t>& binary,
unsigned exec_size, FILE *output)
bool
print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output)
{
if (program->chip_class <= GFX7) {
/* Do not abort if clrxdisasm isn't found. */
@ -187,7 +171,7 @@ bool print_asm(Program *program, std::vector<uint32_t>& binary,
}
std::vector<llvm::SymbolInfoTy> symbols;
std::vector<std::array<char,16>> block_names;
std::vector<std::array<char, 16>> block_names;
block_names.reserve(program->blocks.size());
for (Block& block : program->blocks) {
if (!referenced_blocks[block.index])
@ -195,18 +179,18 @@ bool print_asm(Program *program, std::vector<uint32_t>& binary,
std::array<char, 16> name;
sprintf(name.data(), "BB%u", block.index);
block_names.push_back(name);
symbols.emplace_back(block.offset * 4, llvm::StringRef(block_names[block_names.size() - 1].data()), 0);
symbols.emplace_back(block.offset * 4,
llvm::StringRef(block_names[block_names.size() - 1].data()), 0);
}
const char *features = "";
const char* features = "";
if (program->chip_class >= GFX10 && program->wave_size == 64) {
features = "+wavefrontsize64";
}
LLVMDisasmContextRef disasm = LLVMCreateDisasmCPUFeatures("amdgcn-mesa-mesa3d",
ac_get_llvm_processor_name(program->family),
features,
&symbols, 0, NULL, NULL);
LLVMDisasmContextRef disasm =
LLVMCreateDisasmCPUFeatures("amdgcn-mesa-mesa3d", ac_get_llvm_processor_name(program->family),
features, &symbols, 0, NULL, NULL);
size_t pos = 0;
bool invalid = false;
@ -216,7 +200,8 @@ bool print_asm(Program *program, std::vector<uint32_t>& binary,
unsigned prev_pos = 0;
unsigned repeat_count = 0;
while (pos < exec_size) {
bool new_block = next_block < program->blocks.size() && pos == program->blocks[next_block].offset;
bool new_block =
next_block < program->blocks.size() && pos == program->blocks[next_block].offset;
if (pos + prev_size <= exec_size && prev_pos != pos && !new_block &&
memcmp(&binary[prev_pos], &binary[pos], prev_size * 4) == 0) {
repeat_count++;
@ -235,8 +220,8 @@ bool print_asm(Program *program, std::vector<uint32_t>& binary,
}
char outline[1024];
std::pair<bool, size_t> res = disasm_instr(
program->chip_class, disasm, binary.data(), exec_size, pos, outline, sizeof(outline));
std::pair<bool, size_t> res = disasm_instr(program->chip_class, disasm, binary.data(),
exec_size, pos, outline, sizeof(outline));
invalid |= res.first;
fprintf(output, "%-60s ;", outline);
@ -271,4 +256,4 @@ bool print_asm(Program *program, std::vector<uint32_t>& binary,
return invalid;
}
}
} // namespace aco

View File

@ -86,36 +86,38 @@ const std::array<const char*, num_reduce_ops> reduce_ops = []()
return ret;
}();
static void print_reg_class(const RegClass rc, FILE *output)
static void
print_reg_class(const RegClass rc, FILE* output)
{
switch (rc) {
case RegClass::s1: fprintf(output, " s1: "); return;
case RegClass::s2: fprintf(output, " s2: "); return;
case RegClass::s3: fprintf(output, " s3: "); return;
case RegClass::s4: fprintf(output, " s4: "); return;
case RegClass::s6: fprintf(output, " s6: "); return;
case RegClass::s8: fprintf(output, " s8: "); return;
case RegClass::s16: fprintf(output, "s16: "); return;
case RegClass::v1: fprintf(output, " v1: "); return;
case RegClass::v2: fprintf(output, " v2: "); return;
case RegClass::v3: fprintf(output, " v3: "); return;
case RegClass::v4: fprintf(output, " v4: "); return;
case RegClass::v5: fprintf(output, " v5: "); return;
case RegClass::v6: fprintf(output, " v6: "); return;
case RegClass::v7: fprintf(output, " v7: "); return;
case RegClass::v8: fprintf(output, " v8: "); return;
case RegClass::v1b: fprintf(output, " v1b: "); return;
case RegClass::v2b: fprintf(output, " v2b: "); return;
case RegClass::v3b: fprintf(output, " v3b: "); return;
case RegClass::v4b: fprintf(output, " v4b: "); return;
case RegClass::v6b: fprintf(output, " v6b: "); return;
case RegClass::v8b: fprintf(output, " v8b: "); return;
case RegClass::v1_linear: fprintf(output, " v1: "); return;
case RegClass::v2_linear: fprintf(output, " v2: "); return;
case RegClass::s1: fprintf(output, " s1: "); return;
case RegClass::s2: fprintf(output, " s2: "); return;
case RegClass::s3: fprintf(output, " s3: "); return;
case RegClass::s4: fprintf(output, " s4: "); return;
case RegClass::s6: fprintf(output, " s6: "); return;
case RegClass::s8: fprintf(output, " s8: "); return;
case RegClass::s16: fprintf(output, "s16: "); return;
case RegClass::v1: fprintf(output, " v1: "); return;
case RegClass::v2: fprintf(output, " v2: "); return;
case RegClass::v3: fprintf(output, " v3: "); return;
case RegClass::v4: fprintf(output, " v4: "); return;
case RegClass::v5: fprintf(output, " v5: "); return;
case RegClass::v6: fprintf(output, " v6: "); return;
case RegClass::v7: fprintf(output, " v7: "); return;
case RegClass::v8: fprintf(output, " v8: "); return;
case RegClass::v1b: fprintf(output, " v1b: "); return;
case RegClass::v2b: fprintf(output, " v2b: "); return;
case RegClass::v3b: fprintf(output, " v3b: "); return;
case RegClass::v4b: fprintf(output, " v4b: "); return;
case RegClass::v6b: fprintf(output, " v6b: "); return;
case RegClass::v8b: fprintf(output, " v8b: "); return;
case RegClass::v1_linear: fprintf(output, " v1: "); return;
case RegClass::v2_linear: fprintf(output, " v2: "); return;
}
}
void print_physReg(PhysReg reg, unsigned bytes, FILE *output, unsigned flags)
void
print_physReg(PhysReg reg, unsigned bytes, FILE* output, unsigned flags)
{
if (reg == 124) {
fprintf(output, "m0");
@ -134,16 +136,17 @@ void print_physReg(PhysReg reg, unsigned bytes, FILE *output, unsigned flags)
} else {
fprintf(output, "%c[%d", is_vgpr ? 'v' : 's', r);
if (size > 1)
fprintf(output, "-%d]", r + size -1);
fprintf(output, "-%d]", r + size - 1);
else
fprintf(output, "]");
}
if (reg.byte() || bytes % 4)
fprintf(output, "[%d:%d]", reg.byte()*8, (reg.byte()+bytes) * 8);
fprintf(output, "[%d:%d]", reg.byte() * 8, (reg.byte() + bytes) * 8);
}
}
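/* Illustrative sketch (not from this commit), assuming `size` above is the
 * dword-rounded register count: a 2-byte value at byte offset 2 of v5 prints
 * as "v[5][16:32]", the suffix being reg.byte()*8 : (reg.byte()+bytes)*8. */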
static void print_constant(uint8_t reg, FILE *output)
static void
print_constant(uint8_t reg, FILE* output)
{
if (reg >= 128 && reg <= 192) {
fprintf(output, "%d", reg - 128);
@ -154,37 +157,20 @@ static void print_constant(uint8_t reg, FILE *output)
}
switch (reg) {
case 240:
fprintf(output, "0.5");
break;
case 241:
fprintf(output, "-0.5");
break;
case 242:
fprintf(output, "1.0");
break;
case 243:
fprintf(output, "-1.0");
break;
case 244:
fprintf(output, "2.0");
break;
case 245:
fprintf(output, "-2.0");
break;
case 246:
fprintf(output, "4.0");
break;
case 247:
fprintf(output, "-4.0");
break;
case 248:
fprintf(output, "1/(2*PI)");
break;
case 240: fprintf(output, "0.5"); break;
case 241: fprintf(output, "-0.5"); break;
case 242: fprintf(output, "1.0"); break;
case 243: fprintf(output, "-1.0"); break;
case 244: fprintf(output, "2.0"); break;
case 245: fprintf(output, "-2.0"); break;
case 246: fprintf(output, "4.0"); break;
case 247: fprintf(output, "-4.0"); break;
case 248: fprintf(output, "1/(2*PI)"); break;
}
}
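/* Illustrative usage (not from this commit), based on the cases above: */
// print_constant(130, stdout); /* 130 - 128       -> prints "2"   */
// print_constant(242, stdout); /* inline constant -> prints "1.0" */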
void aco_print_operand(const Operand *operand, FILE *output, unsigned flags)
void
aco_print_operand(const Operand* operand, FILE* output, unsigned flags)
{
if (operand->isLiteral() || (operand->isConstant() && operand->bytes() == 1)) {
if (operand->bytes() == 1)
@ -216,7 +202,8 @@ void aco_print_operand(const Operand *operand, FILE *output, unsigned flags)
}
}
static void print_definition(const Definition *definition, FILE *output, unsigned flags)
static void
print_definition(const Definition* definition, FILE* output, unsigned flags)
{
if (!(flags & print_no_ssa))
print_reg_class(definition->regClass(), output);
@ -235,7 +222,8 @@ static void print_definition(const Definition *definition, FILE *output, unsigne
print_physReg(definition->physReg(), definition->bytes(), output, flags);
}
static void print_storage(storage_class storage, FILE *output)
static void
print_storage(storage_class storage, FILE* output)
{
fprintf(output, " storage:");
int printed = 0;
@ -255,7 +243,8 @@ static void print_storage(storage_class storage, FILE *output)
printed += fprintf(output, "%svgpr_spill", printed ? "," : "");
}
static void print_semantics(memory_semantics sem, FILE *output)
static void
print_semantics(memory_semantics sem, FILE* output)
{
fprintf(output, " semantics:");
int printed = 0;
@ -275,36 +264,29 @@ static void print_semantics(memory_semantics sem, FILE *output)
printed += fprintf(output, "%srmw", printed ? "," : "");
}
static void print_scope(sync_scope scope, FILE *output, const char *prefix="scope")
static void
print_scope(sync_scope scope, FILE* output, const char* prefix = "scope")
{
fprintf(output, " %s:", prefix);
switch (scope) {
case scope_invocation:
fprintf(output, "invocation");
break;
case scope_subgroup:
fprintf(output, "subgroup");
break;
case scope_workgroup:
fprintf(output, "workgroup");
break;
case scope_queuefamily:
fprintf(output, "queuefamily");
break;
case scope_device:
fprintf(output, "device");
break;
case scope_invocation: fprintf(output, "invocation"); break;
case scope_subgroup: fprintf(output, "subgroup"); break;
case scope_workgroup: fprintf(output, "workgroup"); break;
case scope_queuefamily: fprintf(output, "queuefamily"); break;
case scope_device: fprintf(output, "device"); break;
}
}
static void print_sync(memory_sync_info sync, FILE *output)
static void
print_sync(memory_sync_info sync, FILE* output)
{
print_storage(sync.storage, output);
print_semantics(sync.semantics, output);
print_scope(sync.scope, output);
}
static void print_instr_format_specific(const Instruction *instr, FILE *output)
static void
print_instr_format_specific(const Instruction* instr, FILE* output)
{
switch (instr->format) {
case Format::SOPK: {
@ -319,9 +301,12 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
/* we usually should check the chip class for vmcnt/lgkm, but
* insert_waitcnt() should fill it in regardless. */
unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10);
if (vmcnt != 63) fprintf(output, " vmcnt(%d)", vmcnt);
if (((imm >> 4) & 0x7) < 0x7) fprintf(output, " expcnt(%d)", (imm >> 4) & 0x7);
if (((imm >> 8) & 0x3F) < 0x3F) fprintf(output, " lgkmcnt(%d)", (imm >> 8) & 0x3F);
if (vmcnt != 63)
fprintf(output, " vmcnt(%d)", vmcnt);
if (((imm >> 4) & 0x7) < 0x7)
fprintf(output, " expcnt(%d)", (imm >> 4) & 0x7);
if (((imm >> 8) & 0x3F) < 0x3F)
fprintf(output, " lgkmcnt(%d)", (imm >> 8) & 0x3F);
break;
}
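         /* Illustrative sketch (not from this commit), assuming the bit
          * layout used above: vmcnt spans imm[3:0] and imm[15:14], expcnt is
          * imm[6:4], lgkmcnt is imm[13:8]. E.g. imm = 0xC00F gives
          * vmcnt = 0xF | (0xC000 >> 10) = 63, the "nothing outstanding"
          * value, so no vmcnt() is printed. */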
case aco_opcode::s_endpgm:
@ -337,35 +322,21 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
case aco_opcode::s_sendmsg: {
unsigned id = imm & sendmsg_id_mask;
switch (id) {
case sendmsg_none:
fprintf(output, " sendmsg(MSG_NONE)");
break;
case sendmsg_none: fprintf(output, " sendmsg(MSG_NONE)"); break;
case _sendmsg_gs:
fprintf(output, " sendmsg(gs%s%s, %u)",
imm & 0x10 ? ", cut" : "", imm & 0x20 ? ", emit" : "", imm >> 8);
fprintf(output, " sendmsg(gs%s%s, %u)", imm & 0x10 ? ", cut" : "",
imm & 0x20 ? ", emit" : "", imm >> 8);
break;
case _sendmsg_gs_done:
fprintf(output, " sendmsg(gs_done%s%s, %u)",
imm & 0x10 ? ", cut" : "", imm & 0x20 ? ", emit" : "", imm >> 8);
break;
case sendmsg_save_wave:
fprintf(output, " sendmsg(save_wave)");
break;
case sendmsg_stall_wave_gen:
fprintf(output, " sendmsg(stall_wave_gen)");
break;
case sendmsg_halt_waves:
fprintf(output, " sendmsg(halt_waves)");
break;
case sendmsg_ordered_ps_done:
fprintf(output, " sendmsg(ordered_ps_done)");
break;
case sendmsg_early_prim_dealloc:
fprintf(output, " sendmsg(early_prim_dealloc)");
break;
case sendmsg_gs_alloc_req:
fprintf(output, " sendmsg(gs_alloc_req)");
fprintf(output, " sendmsg(gs_done%s%s, %u)", imm & 0x10 ? ", cut" : "",
imm & 0x20 ? ", emit" : "", imm >> 8);
break;
case sendmsg_save_wave: fprintf(output, " sendmsg(save_wave)"); break;
case sendmsg_stall_wave_gen: fprintf(output, " sendmsg(stall_wave_gen)"); break;
case sendmsg_halt_waves: fprintf(output, " sendmsg(halt_waves)"); break;
case sendmsg_ordered_ps_done: fprintf(output, " sendmsg(ordered_ps_done)"); break;
case sendmsg_early_prim_dealloc: fprintf(output, " sendmsg(early_prim_dealloc)"); break;
case sendmsg_gs_alloc_req: fprintf(output, " sendmsg(gs_alloc_req)"); break;
}
break;
}
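      /* Illustrative example (not from this commit) of the decoding above:
       * imm = _sendmsg_gs | 0x20 | (1 << 8) prints " sendmsg(gs, emit, 1)". */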
@ -433,40 +404,21 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
}
case Format::MIMG: {
const MIMG_instruction& mimg = instr->mimg();
unsigned identity_dmask = !instr->definitions.empty() ?
(1 << instr->definitions[0].size()) - 1 :
0xf;
unsigned identity_dmask =
!instr->definitions.empty() ? (1 << instr->definitions[0].size()) - 1 : 0xf;
if ((mimg.dmask & identity_dmask) != identity_dmask)
fprintf(output, " dmask:%s%s%s%s",
mimg.dmask & 0x1 ? "x" : "",
mimg.dmask & 0x2 ? "y" : "",
mimg.dmask & 0x4 ? "z" : "",
fprintf(output, " dmask:%s%s%s%s", mimg.dmask & 0x1 ? "x" : "",
mimg.dmask & 0x2 ? "y" : "", mimg.dmask & 0x4 ? "z" : "",
mimg.dmask & 0x8 ? "w" : "");
switch (mimg.dim) {
case ac_image_1d:
fprintf(output, " 1d");
break;
case ac_image_2d:
fprintf(output, " 2d");
break;
case ac_image_3d:
fprintf(output, " 3d");
break;
case ac_image_cube:
fprintf(output, " cube");
break;
case ac_image_1darray:
fprintf(output, " 1darray");
break;
case ac_image_2darray:
fprintf(output, " 2darray");
break;
case ac_image_2dmsaa:
fprintf(output, " 2dmsaa");
break;
case ac_image_2darraymsaa:
fprintf(output, " 2darraymsaa");
break;
case ac_image_1d: fprintf(output, " 1d"); break;
case ac_image_2d: fprintf(output, " 2d"); break;
case ac_image_3d: fprintf(output, " 3d"); break;
case ac_image_cube: fprintf(output, " cube"); break;
case ac_image_1darray: fprintf(output, " 1darray"); break;
case ac_image_2darray: fprintf(output, " 2darray"); break;
case ac_image_2dmsaa: fprintf(output, " 2dmsaa"); break;
case ac_image_2darraymsaa: fprintf(output, " 2darraymsaa"); break;
}
if (mimg.unrm)
fprintf(output, " unrm");
@ -495,10 +447,8 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
const Export_instruction& exp = instr->exp();
unsigned identity_mask = exp.compressed ? 0x5 : 0xf;
if ((exp.enabled_mask & identity_mask) != identity_mask)
fprintf(output, " en:%c%c%c%c",
exp.enabled_mask & 0x1 ? 'r' : '*',
exp.enabled_mask & 0x2 ? 'g' : '*',
exp.enabled_mask & 0x4 ? 'b' : '*',
fprintf(output, " en:%c%c%c%c", exp.enabled_mask & 0x1 ? 'r' : '*',
exp.enabled_mask & 0x2 ? 'g' : '*', exp.enabled_mask & 0x4 ? 'b' : '*',
exp.enabled_mask & 0x8 ? 'a' : '*');
if (exp.compressed)
fprintf(output, " compr");
@ -624,15 +574,9 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
if (instr->isVOP3()) {
const VOP3_instruction& vop3 = instr->vop3();
switch (vop3.omod) {
case 1:
fprintf(output, " *2");
break;
case 2:
fprintf(output, " *4");
break;
case 3:
fprintf(output, " *0.5");
break;
case 1: fprintf(output, " *2"); break;
case 2: fprintf(output, " *4"); break;
case 3: fprintf(output, " *0.5"); break;
}
if (vop3.clamp)
fprintf(output, " clamp");
@ -641,8 +585,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
} else if (instr->isDPP()) {
const DPP_instruction& dpp = instr->dpp();
if (dpp.dpp_ctrl <= 0xff) {
fprintf(output, " quad_perm:[%d,%d,%d,%d]",
dpp.dpp_ctrl & 0x3, (dpp.dpp_ctrl >> 2) & 0x3,
fprintf(output, " quad_perm:[%d,%d,%d,%d]", dpp.dpp_ctrl & 0x3, (dpp.dpp_ctrl >> 2) & 0x3,
(dpp.dpp_ctrl >> 4) & 0x3, (dpp.dpp_ctrl >> 6) & 0x3);
} else if (dpp.dpp_ctrl >= 0x101 && dpp.dpp_ctrl <= 0x10f) {
fprintf(output, " row_shl:%d", dpp.dpp_ctrl & 0xf);
@ -678,21 +621,14 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
} else if (instr->isSDWA()) {
const SDWA_instruction& sdwa = instr->sdwa();
switch (sdwa.omod) {
case 1:
fprintf(output, " *2");
break;
case 2:
fprintf(output, " *4");
break;
case 3:
fprintf(output, " *0.5");
break;
case 1: fprintf(output, " *2"); break;
case 2: fprintf(output, " *4"); break;
case 3: fprintf(output, " *0.5"); break;
}
if (sdwa.clamp)
fprintf(output, " clamp");
switch (sdwa.dst_sel & sdwa_asuint) {
case sdwa_udword:
break;
case sdwa_udword: break;
case sdwa_ubyte0:
case sdwa_ubyte1:
case sdwa_ubyte2:
@ -711,7 +647,8 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
}
}
void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags)
void
aco_print_instr(const Instruction* instr, FILE* output, unsigned flags)
{
if (!instr->definitions.empty()) {
for (unsigned i = 0; i < instr->definitions.size(); ++i) {
@ -723,10 +660,10 @@ void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags)
}
fprintf(output, "%s", instr_info.name[(int)instr->opcode]);
if (instr->operands.size()) {
bool *const abs = (bool *)alloca(instr->operands.size() * sizeof(bool));
bool *const neg = (bool *)alloca(instr->operands.size() * sizeof(bool));
bool *const opsel = (bool *)alloca(instr->operands.size() * sizeof(bool));
uint8_t *const sel = (uint8_t *)alloca(instr->operands.size() * sizeof(uint8_t));
bool* const abs = (bool*)alloca(instr->operands.size() * sizeof(bool));
bool* const neg = (bool*)alloca(instr->operands.size() * sizeof(bool));
bool* const opsel = (bool*)alloca(instr->operands.size() * sizeof(bool));
uint8_t* const sel = (uint8_t*)alloca(instr->operands.size() * sizeof(uint8_t));
for (unsigned i = 0; i < instr->operands.size(); ++i) {
abs[i] = false;
neg[i] = false;
@ -792,8 +729,7 @@ void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags)
if (instr->isVOP3P()) {
const VOP3P_instruction& vop3 = instr->vop3p();
if ((vop3.opsel_lo & (1 << i)) || !(vop3.opsel_hi & (1 << i))) {
fprintf(output, ".%c%c",
vop3.opsel_lo & (1 << i) ? 'y' : 'x',
fprintf(output, ".%c%c", vop3.opsel_lo & (1 << i) ? 'y' : 'x',
vop3.opsel_hi & (1 << i) ? 'y' : 'x');
}
if (vop3.neg_lo[i] && vop3.neg_hi[i])
@ -808,7 +744,8 @@ void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags)
print_instr_format_specific(instr, output);
}
static void print_block_kind(uint16_t kind, FILE *output)
static void
print_block_kind(uint16_t kind, FILE* output)
{
if (kind & block_kind_uniform)
fprintf(output, "uniform, ");
@ -844,7 +781,8 @@ static void print_block_kind(uint16_t kind, FILE *output)
fprintf(output, "export_end, ");
}
static void print_stage(Stage stage, FILE *output)
static void
print_stage(Stage stage, FILE* output)
{
fprintf(output, "ACO shader stage: ");
@ -888,7 +826,8 @@ static void print_stage(Stage stage, FILE *output)
fprintf(output, "\n");
}
void aco_print_block(const Block* block, FILE *output, unsigned flags, const live& live_vars)
void
aco_print_block(const Block* block, FILE* output, unsigned flags, const live& live_vars)
{
fprintf(output, "BB%d\n", block->index);
fprintf(output, "/* logical preds: ");
@ -927,19 +866,16 @@ void aco_print_block(const Block* block, FILE *output, unsigned flags, const liv
}
}
void aco_print_program(const Program *program, FILE *output, const live& live_vars, unsigned flags)
void
aco_print_program(const Program* program, FILE* output, const live& live_vars, unsigned flags)
{
switch (program->progress) {
case CompilationProgress::after_isel:
fprintf(output, "After Instruction Selection:\n");
break;
case CompilationProgress::after_isel: fprintf(output, "After Instruction Selection:\n"); break;
case CompilationProgress::after_spilling:
fprintf(output, "After Spilling:\n");
flags |= print_kill;
break;
case CompilationProgress::after_ra:
fprintf(output, "After RA:\n");
break;
case CompilationProgress::after_ra: fprintf(output, "After RA:\n"); break;
}
print_stage(program->stage, output);
@ -965,9 +901,10 @@ void aco_print_program(const Program *program, FILE *output, const live& live_va
fprintf(output, "\n");
}
void aco_print_program(const Program *program, FILE *output, unsigned flags)
void
aco_print_program(const Program* program, FILE* output, unsigned flags)
{
aco_print_program(program, output, live(), flags);
}
}
} // namespace aco

View File

@ -36,7 +36,8 @@
namespace aco {
void setup_reduce_temp(Program* program)
void
setup_reduce_temp(Program* program)
{
unsigned last_top_level_block_idx = 0;
unsigned maxSize = 0;
@ -69,7 +70,8 @@ void setup_reduce_temp(Program* program)
if (reduceTmp_in_loop && block.loop_nest_depth == 0) {
assert(inserted_at == (int)last_top_level_block_idx);
aco_ptr<Instruction> end{create_instruction<Instruction>(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_in_loop ? 2 : 1, 0)};
aco_ptr<Instruction> end{create_instruction<Instruction>(
aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_in_loop ? 2 : 1, 0)};
end->operands[0] = Operand(reduceTmp);
if (vtmp_in_loop)
end->operands[1] = Operand(vtmp);
@ -89,7 +91,7 @@ void setup_reduce_temp(Program* program)
std::vector<aco_ptr<Instruction>>::iterator it;
for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
Instruction *instr = (*it).get();
Instruction* instr = (*it).get();
if (instr->format != Format::PSEUDO_REDUCTION)
continue;
@ -98,7 +100,8 @@ void setup_reduce_temp(Program* program)
if ((int)last_top_level_block_idx != inserted_at) {
reduceTmp = program->allocateTmp(reduceTmp.regClass());
aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(
aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
create->definitions[0] = Definition(reduceTmp);
/* find the right place to insert this definition */
if (last_top_level_block_idx == block.index) {
@ -110,18 +113,19 @@ void setup_reduce_temp(Program* program)
} else {
assert(last_top_level_block_idx < block.index);
/* insert before the branch at last top level block */
std::vector<aco_ptr<Instruction>>& instructions = program->blocks[last_top_level_block_idx].instructions;
instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create));
std::vector<aco_ptr<Instruction>>& instructions =
program->blocks[last_top_level_block_idx].instructions;
instructions.insert(std::next(instructions.begin(), instructions.size() - 1),
std::move(create));
inserted_at = last_top_level_block_idx;
}
}
/* same as before, except for the vector temporary instead of the reduce temporary */
unsigned cluster_size = instr->reduction().cluster_size;
bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 ||
op == fmin64 || op == fmax64 || op == umin64 ||
op == umax64 || op == imin64 || op == imax64 ||
op == imul64;
bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 ||
op == fmax64 || op == umin64 || op == umax64 || op == imin64 ||
op == imax64 || op == imul64;
bool gfx10_need_vtmp = op == imul8 || op == imax8 || op == imin8 || op == umin8 ||
op == imul16 || op == imax16 || op == imin16 || op == umin16 ||
op == iadd64;
@ -138,15 +142,18 @@ void setup_reduce_temp(Program* program)
vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0;
if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
vtmp = program->allocateTmp(vtmp.regClass());
aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(
aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
create->definitions[0] = Definition(vtmp);
if (last_top_level_block_idx == block.index) {
it = block.instructions.insert(it, std::move(create));
it++;
} else {
assert(last_top_level_block_idx < block.index);
std::vector<aco_ptr<Instruction>>& instructions = program->blocks[last_top_level_block_idx].instructions;
instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create));
std::vector<aco_ptr<Instruction>>& instructions =
program->blocks[last_top_level_block_idx].instructions;
instructions.insert(std::next(instructions.begin(), instructions.size() - 1),
std::move(create));
vtmp_inserted_at = last_top_level_block_idx;
}
}
@ -158,5 +165,4 @@ void setup_reduce_temp(Program* program)
}
}
};
}; // namespace aco

File diff suppressed because it is too large

View File

@ -34,8 +34,8 @@ struct idx_ctx {
std::vector<uint32_t> renames;
};
inline
void reindex_defs(idx_ctx& ctx, aco_ptr<Instruction>& instr)
inline void
reindex_defs(idx_ctx& ctx, aco_ptr<Instruction>& instr)
{
for (Definition& def : instr->definitions) {
if (!def.isTemp())
@ -48,8 +48,8 @@ void reindex_defs(idx_ctx& ctx, aco_ptr<Instruction>& instr)
}
}
inline
void reindex_ops(idx_ctx& ctx, aco_ptr<Instruction>& instr)
inline void
reindex_ops(idx_ctx& ctx, aco_ptr<Instruction>& instr)
{
for (Operand& op : instr->operands) {
if (!op.isTemp())
@ -60,7 +60,8 @@ void reindex_ops(idx_ctx& ctx, aco_ptr<Instruction>& instr)
}
}
void reindex_program(idx_ctx& ctx, Program* program)
void
reindex_program(idx_ctx& ctx, Program* program)
{
ctx.renames.resize(program->peekAllocationId());
@ -88,12 +89,13 @@ void reindex_program(idx_ctx& ctx, Program* program)
/* update program members */
program->private_segment_buffer = Temp(ctx.renames[program->private_segment_buffer.id()],
program->private_segment_buffer.regClass());
program->scratch_offset = Temp(ctx.renames[program->scratch_offset.id()],
program->scratch_offset.regClass());
program->scratch_offset =
Temp(ctx.renames[program->scratch_offset.id()], program->scratch_offset.regClass());
program->temp_rc = ctx.temp_rc;
}
void update_live_out(idx_ctx& ctx, std::vector<IDSet>& live_out)
void
update_live_out(idx_ctx& ctx, std::vector<IDSet>& live_out)
{
for (IDSet& set : live_out) {
IDSet new_set;
@ -105,7 +107,8 @@ void update_live_out(idx_ctx& ctx, std::vector<IDSet>& live_out)
} /* end namespace */
void reindex_ssa(Program* program)
void
reindex_ssa(Program* program)
{
idx_ctx ctx;
reindex_program(ctx, program);
@ -113,7 +116,8 @@ void reindex_ssa(Program* program)
program->allocationID = program->temp_rc.size();
}
void reindex_ssa(Program* program, std::vector<IDSet>& live_out)
void
reindex_ssa(Program* program, std::vector<IDSet>& live_out)
{
idx_ctx ctx;
reindex_program(ctx, program);
@ -122,4 +126,4 @@ void reindex_ssa(Program* program, std::vector<IDSet>& live_out)
program->allocationID = program->temp_rc.size();
}
}
} // namespace aco

View File

@ -34,11 +34,11 @@
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16)
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16)
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 8)
#define POS_EXP_MAX_MOVES 512
#define POS_EXP_MAX_MOVES 512
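/* Illustrative numbers (not from this commit): at num_waves == 8 the macros
 * above evaluate to SMEM_WINDOW_SIZE == 70, VMEM_WINDOW_SIZE == 512,
 * SMEM_MAX_MOVES == 32, VMEM_MAX_MOVES == 128 and
 * VMEM_CLAUSE_MAX_GRAB_DIST == 64. */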
namespace aco {
@ -54,7 +54,7 @@ enum MoveResult {
* or below a group of instruction that hardware can execute as a clause.
*/
struct DownwardsCursor {
int source_idx; /* Current instruction to consider for moving */
int source_idx; /* Current instruction to consider for moving */
int insert_idx_clause; /* First clause instruction */
int insert_idx; /* First instruction *after* the clause */
@ -66,11 +66,9 @@ struct DownwardsCursor {
RegisterDemand total_demand;
DownwardsCursor(int current_idx, RegisterDemand initial_clause_demand)
: source_idx(current_idx - 1),
insert_idx_clause(current_idx),
insert_idx(current_idx + 1),
clause_demand(initial_clause_demand) {
}
: source_idx(current_idx - 1), insert_idx_clause(current_idx), insert_idx(current_idx + 1),
clause_demand(initial_clause_demand)
{}
void verify_invariants(const RegisterDemand* register_demand);
};
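/* Illustrative sketch (not from this commit): a cursor constructed with
 * current_idx == i starts out as
 *   source_idx == i - 1  <  insert_idx_clause == i  <  insert_idx == i + 1,
 * which is exactly the ordering verify_invariants() asserts. */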
@ -91,18 +89,16 @@ struct UpwardsCursor {
insert_idx = -1; /* to be initialized later */
}
bool has_insert_idx() const {
return insert_idx != -1;
}
bool has_insert_idx() const { return insert_idx != -1; }
void verify_invariants(const RegisterDemand* register_demand);
};
struct MoveState {
RegisterDemand max_registers;
Block *block;
Instruction *current;
RegisterDemand *register_demand; /* demand per instruction */
Block* block;
Instruction* current;
RegisterDemand* register_demand; /* demand per instruction */
bool improved_rar;
std::vector<bool> depends_on;
@ -143,19 +139,22 @@ struct sched_ctx {
*/
template <typename T>
void move_element(T begin_it, size_t idx, size_t before) {
if (idx < before) {
auto begin = std::next(begin_it, idx);
auto end = std::next(begin_it, before);
std::rotate(begin, begin + 1, end);
} else if (idx > before) {
auto begin = std::next(begin_it, before);
auto end = std::next(begin_it, idx + 1);
std::rotate(begin, end - 1, end);
}
void
move_element(T begin_it, size_t idx, size_t before)
{
if (idx < before) {
auto begin = std::next(begin_it, idx);
auto end = std::next(begin_it, before);
std::rotate(begin, begin + 1, end);
} else if (idx > before) {
auto begin = std::next(begin_it, before);
auto end = std::next(begin_it, idx + 1);
std::rotate(begin, end - 1, end);
}
}
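/* Illustrative usage (not from this commit) of the rotate above: */
// std::vector<int> v {10, 11, 12, 13};
// move_element(v.begin(), 3, 1); /* element 3 moves before index 1 -> {10, 13, 11, 12} */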
void DownwardsCursor::verify_invariants(const RegisterDemand* register_demand)
void
DownwardsCursor::verify_invariants(const RegisterDemand* register_demand)
{
assert(source_idx < insert_idx_clause);
assert(insert_idx_clause < insert_idx);
@ -175,7 +174,8 @@ void DownwardsCursor::verify_invariants(const RegisterDemand* register_demand)
#endif
}
DownwardsCursor MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
DownwardsCursor
MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
{
improved_rar = improved_rar_;
@ -202,7 +202,8 @@ DownwardsCursor MoveState::downwards_init(int current_idx, bool improved_rar_, b
/* If add_to_clause is true, the current clause is extended by moving the
* instruction at source_idx in front of the clause. Otherwise, the instruction
* is moved past the end of the clause without extending it */
MoveResult MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause)
MoveResult
MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause)
{
aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
@ -211,7 +212,8 @@ MoveResult MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause
return move_fail_ssa;
/* check if one of candidate's operands is killed by depending instruction */
std::vector<bool>& RAR_deps = improved_rar ? (add_to_clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
std::vector<bool>& RAR_deps =
improved_rar ? (add_to_clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
for (const Operand& op : instr->operands) {
if (op.isTemp() && RAR_deps[op.tempId()]) {
// FIXME: account for difference in register pressure
@ -274,7 +276,8 @@ MoveResult MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause
return move_success;
}
void MoveState::downwards_skip(DownwardsCursor& cursor)
void
MoveState::downwards_skip(DownwardsCursor& cursor)
{
aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
@ -292,7 +295,9 @@ void MoveState::downwards_skip(DownwardsCursor& cursor)
cursor.verify_invariants(register_demand);
}
void UpwardsCursor::verify_invariants(const RegisterDemand* register_demand) {
void
UpwardsCursor::verify_invariants(const RegisterDemand* register_demand)
{
#ifndef NDEBUG
if (!has_insert_idx()) {
return;
@ -308,7 +313,8 @@ void UpwardsCursor::verify_invariants(const RegisterDemand* register_demand) {
#endif
}
UpwardsCursor MoveState::upwards_init(int source_idx, bool improved_rar_)
UpwardsCursor
MoveState::upwards_init(int source_idx, bool improved_rar_)
{
improved_rar = improved_rar_;
@ -323,7 +329,8 @@ UpwardsCursor MoveState::upwards_init(int source_idx, bool improved_rar_)
return UpwardsCursor(source_idx);
}
bool MoveState::upwards_check_deps(UpwardsCursor& cursor)
bool
MoveState::upwards_check_deps(UpwardsCursor& cursor)
{
aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
for (const Operand& op : instr->operands) {
@ -333,13 +340,15 @@ bool MoveState::upwards_check_deps(UpwardsCursor& cursor)
return true;
}
void MoveState::upwards_update_insert_idx(UpwardsCursor& cursor)
void
MoveState::upwards_update_insert_idx(UpwardsCursor& cursor)
{
cursor.insert_idx = cursor.source_idx;
cursor.total_demand = register_demand[cursor.insert_idx];
}
MoveResult MoveState::upwards_move(UpwardsCursor& cursor)
MoveResult
MoveState::upwards_move(UpwardsCursor& cursor)
{
assert(cursor.has_insert_idx());
@ -355,13 +364,15 @@ MoveResult MoveState::upwards_move(UpwardsCursor& cursor)
return move_fail_rar;
}
/* check if register pressure is low enough: the diff is negative if register pressure is decreased */
/* check if register pressure is low enough: the diff is negative if register pressure is
* decreased */
const RegisterDemand candidate_diff = get_live_changes(instr);
const RegisterDemand temp = get_temp_registers(instr);
if (RegisterDemand(cursor.total_demand + candidate_diff).exceeds(max_registers))
return move_fail_pressure;
const RegisterDemand temp2 = get_temp_registers(block->instructions[cursor.insert_idx - 1]);
const RegisterDemand new_demand = register_demand[cursor.insert_idx - 1] - temp2 + candidate_diff + temp;
const RegisterDemand new_demand =
register_demand[cursor.insert_idx - 1] - temp2 + candidate_diff + temp;
if (new_demand.exceeds(max_registers))
return move_fail_pressure;
@ -385,7 +396,8 @@ MoveResult MoveState::upwards_move(UpwardsCursor& cursor)
return move_success;
}
void MoveState::upwards_skip(UpwardsCursor& cursor)
void
MoveState::upwards_skip(UpwardsCursor& cursor)
{
if (cursor.has_insert_idx()) {
aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
@ -405,30 +417,33 @@ void MoveState::upwards_skip(UpwardsCursor& cursor)
cursor.verify_invariants(register_demand);
}
bool is_gs_or_done_sendmsg(const Instruction *instr)
bool
is_gs_or_done_sendmsg(const Instruction* instr)
{
if (instr->opcode == aco_opcode::s_sendmsg) {
uint16_t imm = instr->sopp().imm;
return (imm & sendmsg_id_mask) == _sendmsg_gs ||
(imm & sendmsg_id_mask) == _sendmsg_gs_done;
return (imm & sendmsg_id_mask) == _sendmsg_gs || (imm & sendmsg_id_mask) == _sendmsg_gs_done;
}
return false;
}
bool is_done_sendmsg(const Instruction *instr)
bool
is_done_sendmsg(const Instruction* instr)
{
if (instr->opcode == aco_opcode::s_sendmsg)
return (instr->sopp().imm & sendmsg_id_mask) == _sendmsg_gs_done;
return false;
}
memory_sync_info get_sync_info_with_hack(const Instruction* instr)
memory_sync_info
get_sync_info_with_hack(const Instruction* instr)
{
memory_sync_info sync = get_sync_info(instr);
if (instr->isSMEM() && !instr->operands.empty() && instr->operands[0].bytes() == 16) {
// FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works
sync.storage = (storage_class)(sync.storage | storage_buffer);
sync.semantics = (memory_semantics)((sync.semantics | semantic_private) & ~semantic_can_reorder);
sync.semantics =
(memory_semantics)((sync.semantics | semantic_private) & ~semantic_can_reorder);
}
return sync;
}
@ -451,11 +466,13 @@ struct hazard_query {
bool contains_sendmsg;
bool uses_exec;
memory_event_set mem_events;
unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
};
void init_hazard_query(hazard_query *query) {
void
init_hazard_query(hazard_query* query)
{
query->contains_spill = false;
query->contains_sendmsg = false;
query->uses_exec = false;
@ -464,7 +481,8 @@ void init_hazard_query(hazard_query *query) {
query->aliasing_storage_smem = 0;
}
void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync)
void
add_memory_event(memory_event_set* set, Instruction* instr, memory_sync_info* sync)
{
set->has_control_barrier |= is_done_sendmsg(instr);
if (instr->opcode == aco_opcode::p_barrier) {
@ -494,7 +512,8 @@ void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_inf
}
}
void add_to_hazard_query(hazard_query *query, Instruction *instr)
void
add_to_hazard_query(hazard_query* query, Instruction* instr)
{
if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
query->contains_spill = true;
@ -507,7 +526,8 @@ void add_to_hazard_query(hazard_query *query, Instruction *instr)
if (!(sync.semantics & semantic_can_reorder)) {
unsigned storage = sync.storage;
/* images and buffer/global memory can alias */ //TODO: more precisely, buffer images and buffer/global memory can alias
/* images and buffer/global memory can alias */ // TODO: more precisely, buffer images and
// buffer/global memory can alias
if (storage & (storage_buffer | storage_image))
storage |= storage_buffer | storage_image;
if (instr->isSMEM())
@ -531,7 +551,8 @@ enum HazardResult {
hazard_fail_unreorderable,
};
HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards)
HazardResult
perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards)
{
/* don't schedule discards downwards */
if (!upwards && instr->opcode == aco_opcode::p_exit_early_if)
@ -549,10 +570,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
return hazard_fail_export;
/* don't move non-reorderable instructions */
if (instr->opcode == aco_opcode::s_memtime ||
instr->opcode == aco_opcode::s_memrealtime ||
instr->opcode == aco_opcode::s_setprio ||
instr->opcode == aco_opcode::s_getreg_b32)
if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime ||
instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32)
return hazard_fail_unreorderable;
memory_event_set instr_set;
@ -560,8 +579,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
memory_sync_info sync = get_sync_info_with_hack(instr);
add_memory_event(&instr_set, instr, &sync);
memory_event_set *first = &instr_set;
memory_event_set *second = &query->mem_events;
memory_event_set* first = &instr_set;
memory_event_set* second = &query->mem_events;
if (upwards)
std::swap(first, second);
@ -571,7 +590,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire)
return hazard_fail_barrier;
if (((first->access_acquire || first->bar_acquire) && second->bar_classes) ||
((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic)))
((first->access_acquire | first->bar_acquire) &
(second->access_relaxed | second->access_atomic)))
return hazard_fail_barrier;
/* everything before barrier(release) happens before the atomics/control_barriers after *
@ -580,7 +600,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
if (first->bar_release && (second->has_control_barrier || second->access_atomic))
return hazard_fail_barrier;
if ((first->bar_classes && (second->bar_release || second->access_release)) ||
((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release)))
((first->access_relaxed | first->access_atomic) &
(second->bar_release | second->access_release)))
return hazard_fail_barrier;
/* don't move memory barriers around other memory barriers */
@ -589,14 +610,15 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
/* Don't move memory accesses to before control barriers. I don't think
* this is necessary for the Vulkan memory model, but it might be for GLSL450. */
unsigned control_classes = storage_buffer | storage_atomic_counter | storage_image | storage_shared;
if (first->has_control_barrier && ((second->access_atomic | second->access_relaxed) & control_classes))
unsigned control_classes =
storage_buffer | storage_atomic_counter | storage_image | storage_shared;
if (first->has_control_barrier &&
((second->access_atomic | second->access_relaxed) & control_classes))
return hazard_fail_barrier;
/* don't move memory loads/stores past potentially aliasing loads/stores */
unsigned aliasing_storage = instr->isSMEM() ?
query->aliasing_storage_smem :
query->aliasing_storage;
unsigned aliasing_storage =
instr->isSMEM() ? query->aliasing_storage_smem : query->aliasing_storage;
if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) {
unsigned intersect = sync.storage & aliasing_storage;
if (intersect & storage_shared)
@ -614,9 +636,9 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
return hazard_success;
}
void schedule_SMEM(sched_ctx& ctx, Block* block,
std::vector<RegisterDemand>& register_demand,
Instruction* current, int idx)
void
schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
Instruction* current, int idx)
{
assert(idx != 0);
int window_size = SMEM_WINDOW_SIZE;
@ -634,30 +656,37 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
DownwardsCursor cursor = ctx.mv.downwards_init(idx, false, false);
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
candidate_idx--) {
assert(candidate_idx >= 0);
assert(candidate_idx == cursor.source_idx);
aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
/* break if we'd make the previous SMEM instruction stall */
bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
bool can_stall_prev_smem =
idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
break;
/* break when encountering another MEM instruction, logical_start or barriers */
if (candidate->opcode == aco_opcode::p_logical_start)
break;
/* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves to help create more vmem clauses */
if (candidate->isVMEM() && (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) || current->operands[0].size() == 4))
/* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves
* to help create more vmem clauses */
if (candidate->isVMEM() && (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
current->operands[0].size() == 4))
break;
/* don't move descriptor loads below buffer loads */
if (candidate->format == Format::SMEM && current->operands[0].size() == 4 && candidate->operands[0].size() == 2)
if (candidate->format == Format::SMEM && current->operands[0].size() == 4 &&
candidate->operands[0].size() == 2)
break;
bool can_move_down = true;
HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export)
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
haz == hazard_fail_export)
can_move_down = false;
else if (haz != hazard_success)
break;
@ -689,9 +718,10 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
bool found_dependency = false;
/* second, check if we have instructions after current to move up */
for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size;
candidate_idx++) {
assert(candidate_idx == up_cursor.source_idx);
assert(candidate_idx < (int) block->instructions.size());
assert(candidate_idx < (int)block->instructions.size());
aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
if (candidate->opcode == aco_opcode::p_logical_end)
@ -748,9 +778,9 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}
void schedule_VMEM(sched_ctx& ctx, Block* block,
std::vector<RegisterDemand>& register_demand,
Instruction* current, int idx)
void
schedule_VMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
Instruction* current, int idx)
{
assert(idx != 0);
int window_size = VMEM_WINDOW_SIZE;
@ -767,7 +797,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true);
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
candidate_idx--) {
assert(candidate_idx == cursor.source_idx);
assert(candidate_idx >= 0);
aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
@ -778,7 +809,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
break;
/* break if we'd make the previous SMEM instruction stall */
bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
bool can_stall_prev_smem =
idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
break;
@ -787,14 +819,15 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
int grab_dist = cursor.insert_idx_clause - candidate_idx;
/* We can't easily tell how much this will decrease the def-to-use
* distances, so just use how far it will be moved as a heuristic. */
part_of_clause = grab_dist < clause_max_grab_dist &&
should_form_clause(current, candidate.get());
part_of_clause =
grab_dist < clause_max_grab_dist && should_form_clause(current, candidate.get());
}
/* if current depends on candidate, add additional dependencies and continue */
bool can_move_down = !is_vmem || part_of_clause;
HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
HazardResult haz =
perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
haz == hazard_fail_export)
@ -809,7 +842,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
continue;
}
Instruction *candidate_ptr = candidate.get();
Instruction* candidate_ptr = candidate.get();
MoveResult res = ctx.mv.downwards_move(cursor, part_of_clause);
if (res == move_fail_ssa || res == move_fail_rar) {
add_to_hazard_query(&indep_hq, candidate.get());
@ -832,9 +865,10 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
bool found_dependency = false;
/* second, check if we have instructions after current to move up */
for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size;
candidate_idx++) {
assert(candidate_idx == up_cursor.source_idx);
assert(candidate_idx < (int) block->instructions.size());
assert(candidate_idx < (int)block->instructions.size());
aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
bool is_vmem = candidate->isVMEM() || candidate->isFlatLike();
@ -889,9 +923,9 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
}
}
void schedule_position_export(sched_ctx& ctx, Block* block,
std::vector<RegisterDemand>& register_demand,
Instruction* current, int idx)
void
schedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
Instruction* current, int idx)
{
assert(idx != 0);
int window_size = POS_EXP_WINDOW_SIZE;
@ -904,7 +938,8 @@ void schedule_position_export(sched_ctx& ctx, Block* block,
init_hazard_query(&hq);
add_to_hazard_query(&hq, current);
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
candidate_idx--) {
assert(candidate_idx >= 0);
aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
@ -935,7 +970,8 @@ void schedule_position_export(sched_ctx& ctx, Block* block,
}
}
void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
void
schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
{
ctx.last_SMEM_dep_idx = 0;
ctx.last_SMEM_stall = INT16_MIN;
@ -950,7 +986,8 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v
unsigned target = current->exp().dest;
if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) {
ctx.mv.current = current;
schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
schedule_position_export(ctx, block, live_vars.register_demand[block->index], current,
idx);
}
}
@ -975,8 +1012,8 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v
}
}
void schedule_program(Program *program, live& live_vars)
void
schedule_program(Program* program, live& live_vars)
{
/* don't use program->max_reg_demand because that is affected by max_waves_per_simd */
RegisterDemand demand;
@ -991,7 +1028,7 @@ void schedule_program(Program *program, live& live_vars)
/* Allowing the scheduler to reduce the number of waves to as low as 5
* improves performance of Thrones of Britannia significantly and doesn't
* seem to hurt anything else. */
//TODO: account for possible uneven num_waves on GFX10+
// TODO: account for possible uneven num_waves on GFX10+
unsigned wave_fac = program->dev.physical_vgprs / 256;
if (program->num_waves <= 5 * wave_fac)
ctx.num_waves = program->num_waves;
@ -1008,8 +1045,8 @@ void schedule_program(Program *program, live& live_vars)
ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1);
assert(ctx.num_waves > 0);
ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};
ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};
for (Block& block : program->blocks)
schedule_block(ctx, program, &block, live_vars);
@ -1021,8 +1058,8 @@ void schedule_program(Program *program, live& live_vars)
}
update_vgpr_sgpr_demand(program, new_demand);
/* if enabled, this code asserts that register_demand is updated correctly */
#if 0
/* if enabled, this code asserts that register_demand is updated correctly */
#if 0
int prev_num_waves = program->num_waves;
const RegisterDemand prev_max_demand = program->max_reg_demand;
@ -1042,7 +1079,7 @@ void schedule_program(Program *program, live& live_vars)
assert(program->max_reg_demand == prev_max_demand);
assert(program->num_waves == prev_num_waves);
#endif
#endif
}
}
} // namespace aco

File diff suppressed because it is too large

View File

@ -37,7 +37,8 @@ struct phi_info_item {
};
struct ssa_elimination_ctx {
/* The outer vectors should be indexed by block index. The inner vectors store phi information for each block. */
/* The outer vectors should be indexed by block index. The inner vectors store phi information
* for each block. */
std::vector<std::vector<phi_info_item>> logical_phi_info;
std::vector<std::vector<phi_info_item>> linear_phi_info;
std::vector<bool> empty_blocks;
@ -45,14 +46,14 @@ struct ssa_elimination_ctx {
Program* program;
ssa_elimination_ctx(Program* program_)
: logical_phi_info(program_->blocks.size())
, linear_phi_info(program_->blocks.size())
, empty_blocks(program_->blocks.size(), true)
, blocks_incoming_exec_used(program_->blocks.size(), true)
, program(program_) {}
: logical_phi_info(program_->blocks.size()), linear_phi_info(program_->blocks.size()),
empty_blocks(program_->blocks.size(), true),
blocks_incoming_exec_used(program_->blocks.size(), true), program(program_)
{}
};
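/* Illustrative note (not from this commit): logical_phi_info[p] collects, for
 * predecessor block p, the (definition, operand) pairs that the parallelcopy
 * inserted at the end of p must write to materialize its successors' phis. */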
void collect_phi_info(ssa_elimination_ctx& ctx)
void
collect_phi_info(ssa_elimination_ctx& ctx)
{
for (Block& block : ctx.program->blocks) {
for (aco_ptr<Instruction>& phi : block.instructions) {
@ -67,9 +68,11 @@ void collect_phi_info(ssa_elimination_ctx& ctx)
assert(phi->definitions[0].size() == phi->operands[i].size());
std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
std::vector<unsigned>& preds =
phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
uint32_t pred_idx = preds[i];
auto& info_vec = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info[pred_idx] : ctx.linear_phi_info[pred_idx];
auto& info_vec = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info[pred_idx]
: ctx.linear_phi_info[pred_idx];
info_vec.push_back({phi->definitions[0], phi->operands[i]});
ctx.empty_blocks[pred_idx] = false;
}
@ -77,11 +80,12 @@ void collect_phi_info(ssa_elimination_ctx& ctx)
}
}
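/* Worked example (hypothetical IR, for illustration): given
 *    BB3: %a = p_phi %x (from logical pred BB1), %y (from logical pred BB2)
 * the loop above appends {def: %a, op: %x} to ctx.logical_phi_info[1] and
 * {def: %a, op: %y} to ctx.logical_phi_info[2], and clears
 * ctx.empty_blocks[1] and ctx.empty_blocks[2]; insert_parallelcopies() below
 * then materializes each recorded entry as a p_parallelcopy in that
 * predecessor. */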
void insert_parallelcopies(ssa_elimination_ctx& ctx)
void
insert_parallelcopies(ssa_elimination_ctx& ctx)
{
/* insert the parallelcopies from logical phis before p_logical_end */
for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {
auto &logical_phi_info = ctx.logical_phi_info[block_idx];
auto& logical_phi_info = ctx.logical_phi_info[block_idx];
if (logical_phi_info.empty())
continue;
@ -93,10 +97,11 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx)
}
std::vector<aco_ptr<Instruction>>::iterator it = std::next(block.instructions.begin(), idx);
aco_ptr<Pseudo_instruction> pc{create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, logical_phi_info.size(), logical_phi_info.size())};
aco_ptr<Pseudo_instruction> pc{
create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO,
logical_phi_info.size(), logical_phi_info.size())};
unsigned i = 0;
for (auto& phi_info : logical_phi_info)
{
for (auto& phi_info : logical_phi_info) {
pc->definitions[i] = phi_info.def;
pc->operands[i] = phi_info.op;
i++;
@ -108,7 +113,7 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx)
/* insert parallelcopies for the linear phis at the end of blocks just before the branch */
for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {
auto &linear_phi_info = ctx.linear_phi_info[block_idx];
auto& linear_phi_info = ctx.linear_phi_info[block_idx];
if (linear_phi_info.empty())
continue;
@ -116,10 +121,11 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx)
std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();
--it;
assert((*it)->isBranch());
aco_ptr<Pseudo_instruction> pc{create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, linear_phi_info.size(), linear_phi_info.size())};
aco_ptr<Pseudo_instruction> pc{
create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO,
linear_phi_info.size(), linear_phi_info.size())};
unsigned i = 0;
for (auto& phi_info : linear_phi_info)
{
for (auto& phi_info : linear_phi_info) {
pc->definitions[i] = phi_info.def;
pc->operands[i] = phi_info.op;
i++;
@ -130,38 +136,38 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx)
}
}
bool is_empty_block(Block* block, bool ignore_exec_writes)
bool
is_empty_block(Block* block, bool ignore_exec_writes)
{
/* check if this block is empty and the exec mask is not needed */
for (aco_ptr<Instruction>& instr : block->instructions) {
switch (instr->opcode) {
case aco_opcode::p_linear_phi:
case aco_opcode::p_phi:
case aco_opcode::p_logical_start:
case aco_opcode::p_logical_end:
case aco_opcode::p_branch:
case aco_opcode::p_linear_phi:
case aco_opcode::p_phi:
case aco_opcode::p_logical_start:
case aco_opcode::p_logical_end:
case aco_opcode::p_branch: break;
case aco_opcode::p_parallelcopy:
for (unsigned i = 0; i < instr->definitions.size(); i++) {
if (ignore_exec_writes && instr->definitions[i].physReg() == exec)
continue;
if (instr->definitions[i].physReg() != instr->operands[i].physReg())
return false;
}
break;
case aco_opcode::s_andn2_b64:
case aco_opcode::s_andn2_b32:
if (ignore_exec_writes && instr->definitions[0].physReg() == exec)
break;
case aco_opcode::p_parallelcopy:
for (unsigned i = 0; i < instr->definitions.size(); i++) {
if (ignore_exec_writes && instr->definitions[i].physReg() == exec)
continue;
if (instr->definitions[i].physReg() != instr->operands[i].physReg())
return false;
}
break;
case aco_opcode::s_andn2_b64:
case aco_opcode::s_andn2_b32:
if (ignore_exec_writes && instr->definitions[0].physReg() == exec)
break;
return false;
default:
return false;
return false;
default: return false;
}
}
return true;
}
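/* Example (assumed IR): with ignore_exec_writes=true, a block containing only
 *    p_logical_start ; p_logical_end ;
 *    s_andn2_b64 exec, ... ; p_parallelcopy v0, v0 ; p_branch
 * counts as empty: phis, logical markers, branches, register self-copies and
 * exec-only writes have no observable effect once the block is bypassed. Any
 * other instruction, or a parallelcopy that actually moves a register, makes
 * the block non-removable. */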
void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block)
void
try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block)
{
/* check if the successor is another merge block which restores exec */
// TODO: divergent loops also restore exec
@ -179,7 +185,8 @@ void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block)
block->instructions.emplace_back(std::move(branch));
}
void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
void
try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
{
assert(block->linear_succs.size() == 2);
/* only remove this block if the successor got removed as well */
@ -193,7 +200,7 @@ void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
unsigned succ_idx = block->linear_succs[0];
assert(block->linear_preds.size() == 2);
for (unsigned i = 0; i < 2; i++) {
Block *pred = &ctx.program->blocks[block->linear_preds[i]];
Block* pred = &ctx.program->blocks[block->linear_preds[i]];
pred->linear_succs[0] = succ_idx;
ctx.program->blocks[succ_idx].linear_preds[i] = pred->index;
@ -208,7 +215,8 @@ void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
block->linear_succs.clear();
}
void try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)
void
try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)
{
if (!is_empty_block(block, false))
return;
@ -277,7 +285,8 @@ void try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)
block->linear_succs.clear();
}
bool instr_writes_exec(Instruction* instr)
bool
instr_writes_exec(Instruction* instr)
{
for (Definition& def : instr->definitions)
if (def.physReg() == exec || def.physReg() == exec_hi)
@ -286,7 +295,8 @@ bool instr_writes_exec(Instruction* instr)
return false;
}
void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block)
void
eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block)
{
/* Check if any successor needs the outgoing exec mask from the current block. */
@ -309,8 +319,9 @@ void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& blo
exec_write_used = false;
else
/* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */
exec_write_used = std::any_of(block.linear_succs.begin(), block.linear_succs.end(),
[&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; });
exec_write_used =
std::any_of(block.linear_succs.begin(), block.linear_succs.end(),
[&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; });
}
/* Go through all instructions and eliminate useless exec writes. */
@ -318,7 +329,8 @@ void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& blo
for (int i = block.instructions.size() - 1; i >= 0; --i) {
aco_ptr<Instruction>& instr = block.instructions[i];
/* We already take information from phis into account before the loop, so let's just break on phis. */
/* We already take information from phis into account before the loop, so let's just break on
* phis. */
if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi)
break;
@ -341,16 +353,15 @@ void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& blo
}
/* Remember if the current block needs an incoming exec mask from its predecessors. */
ctx.blocks_incoming_exec_used[block.index] = exec_write_used;
/* Cleanup: remove deleted instructions from the vector. */
auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr);
block.instructions.resize(new_end - block.instructions.begin());
}
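/* The loop above is dead-store elimination over a single location (exec),
 * scanning backwards. A self-contained sketch of the same idea on a
 * simplified instruction type (hypothetical, not ACO's classes): */
#include <vector>

struct SimpleInstr {
   bool reads_exec;
   bool writes_exec;
   bool dead = false;
};

/* Returns whether the block still needs its incoming exec value. */
static bool
sketch_eliminate_exec_writes(std::vector<SimpleInstr>& instrs, bool succs_use_exec)
{
   bool exec_write_used = succs_use_exec;
   for (int i = (int)instrs.size() - 1; i >= 0; --i) {
      SimpleInstr& instr = instrs[i];
      if (instr.writes_exec) {
         if (!exec_write_used)
            instr.dead = true; /* no read below: this definition is useless */
         else
            exec_write_used = false; /* all later reads are satisfied here */
      }
      if (instr.reads_exec && !instr.dead)
         exec_write_used = true; /* some earlier definition must provide exec */
   }
   return exec_write_used;
}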
void jump_threading(ssa_elimination_ctx& ctx)
void
jump_threading(ssa_elimination_ctx& ctx)
{
for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {
Block* block = &ctx.program->blocks[i];
@ -367,8 +378,7 @@ void jump_threading(ssa_elimination_ctx& ctx)
if (block->linear_succs.size() > 1)
continue;
if (block->kind & block_kind_merge ||
block->kind & block_kind_loop_exit)
if (block->kind & block_kind_merge || block->kind & block_kind_loop_exit)
try_remove_merge_block(ctx, block);
if (block->linear_preds.size() == 1)
@ -378,8 +388,8 @@ void jump_threading(ssa_elimination_ctx& ctx)
} /* end namespace */
void ssa_elimination(Program* program)
void
ssa_elimination(Program* program)
{
ssa_elimination_ctx ctx(program);
@ -391,6 +401,5 @@ void ssa_elimination(Program* program)
/* insert parallelcopies from SSA elimination */
insert_parallelcopies(ctx);
}
}
} // namespace aco


@ -23,6 +23,7 @@
*/
#include "aco_ir.h"
#include "util/crc32.h"
#include <algorithm>
@ -33,7 +34,8 @@
namespace aco {
/* sgpr_presched/vgpr_presched */
void collect_presched_stats(Program *program)
void
collect_presched_stats(Program* program)
{
RegisterDemand presched_demand;
for (Block& block : program->blocks)
@ -56,9 +58,9 @@ public:
resource_count,
};
BlockCycleEstimator(Program *program_) : program(program_) {}
BlockCycleEstimator(Program* program_) : program(program_) {}
Program *program;
Program* program;
int32_t cur_cycle = 0;
int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0};
@ -72,6 +74,7 @@ public:
unsigned predict_cost(aco_ptr<Instruction>& instr);
void add(aco_ptr<Instruction>& instr);
void join(const BlockCycleEstimator& other);
private:
unsigned get_waitcnt_cost(wait_imm imm);
unsigned get_dependency_cost(aco_ptr<Instruction>& instr);
@ -81,8 +84,9 @@ private:
};
struct wait_counter_info {
wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_) :
vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {}
wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_)
: vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
{}
unsigned vm;
unsigned exp;
@ -100,107 +104,83 @@ struct perf_info {
unsigned cost1;
};
static perf_info get_perf_info(Program *program, aco_ptr<Instruction>& instr)
static perf_info
get_perf_info(Program* program, aco_ptr<Instruction>& instr)
{
instr_class cls = instr_info.classes[(int)instr->opcode];
#define WAIT(res) BlockCycleEstimator::res, 0
#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt
#define WAIT(res) BlockCycleEstimator::res, 0
#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt
if (program->chip_class >= GFX10) {
/* fp64 might be incorrect */
switch (cls) {
case instr_class::valu32:
case instr_class::valu_convert32:
case instr_class::valu_fma:
return {5, WAIT_USE(valu, 1)};
case instr_class::valu64:
return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};
case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)};
case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};
case instr_class::valu_quarter_rate32:
return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)};
case instr_class::valu_transcendental32:
return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)};
case instr_class::valu_double:
return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
case instr_class::valu_double_add:
return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
case instr_class::valu_double_convert:
return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
case instr_class::valu_double_transcendental:
return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
case instr_class::salu:
return {2, WAIT_USE(scalar, 1)};
case instr_class::smem:
return {0, WAIT_USE(scalar, 1)};
case instr_class::salu: return {2, WAIT_USE(scalar, 1)};
case instr_class::smem: return {0, WAIT_USE(scalar, 1)};
case instr_class::branch:
case instr_class::sendmsg:
return {0, WAIT_USE(branch_sendmsg, 1)};
case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 1)};
case instr_class::ds:
return instr->ds().gds ?
perf_info{0, WAIT_USE(export_gds, 1)} :
perf_info{0, WAIT_USE(lds, 1)};
case instr_class::exp:
return {0, WAIT_USE(export_gds, 1)};
case instr_class::vmem:
return {0, WAIT_USE(vmem, 1)};
return instr->ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)}
: perf_info{0, WAIT_USE(lds, 1)};
case instr_class::exp: return {0, WAIT_USE(export_gds, 1)};
case instr_class::vmem: return {0, WAIT_USE(vmem, 1)};
case instr_class::barrier:
case instr_class::waitcnt:
case instr_class::other:
default:
return {0};
default: return {0};
}
} else {
switch (cls) {
case instr_class::valu32:
return {4, WAIT_USE(valu, 4)};
case instr_class::valu_convert32:
return {16, WAIT_USE(valu, 16)};
case instr_class::valu64:
return {8, WAIT_USE(valu, 8)};
case instr_class::valu_quarter_rate32:
return {16, WAIT_USE(valu, 16)};
case instr_class::valu32: return {4, WAIT_USE(valu, 4)};
case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)};
case instr_class::valu64: return {8, WAIT_USE(valu, 8)};
case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)};
case instr_class::valu_fma:
return program->dev.has_fast_fma32 ?
perf_info{4, WAIT_USE(valu, 4)} :
perf_info{16, WAIT_USE(valu, 16)};
case instr_class::valu_transcendental32:
return {16, WAIT_USE(valu, 16)};
case instr_class::valu_double:
return {64, WAIT_USE(valu, 64)};
case instr_class::valu_double_add:
return {32, WAIT_USE(valu, 32)};
case instr_class::valu_double_convert:
return {16, WAIT_USE(valu, 16)};
case instr_class::valu_double_transcendental:
return {64, WAIT_USE(valu, 64)};
case instr_class::salu:
return {4, WAIT_USE(scalar, 4)};
case instr_class::smem:
return {4, WAIT_USE(scalar, 4)};
return program->dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)}
: perf_info{16, WAIT_USE(valu, 16)};
case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)};
case instr_class::valu_double: return {64, WAIT_USE(valu, 64)};
case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)};
case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)};
case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)};
case instr_class::salu: return {4, WAIT_USE(scalar, 4)};
case instr_class::smem: return {4, WAIT_USE(scalar, 4)};
case instr_class::branch:
return {8, WAIT_USE(branch_sendmsg, 8)};
return {4, WAIT_USE(branch_sendmsg, 4)};
case instr_class::ds:
return instr->ds().gds ?
perf_info{4, WAIT_USE(export_gds, 4)} :
perf_info{4, WAIT_USE(lds, 4)};
case instr_class::exp:
return {16, WAIT_USE(export_gds, 16)};
case instr_class::vmem:
return {4, WAIT_USE(vmem, 4)};
return instr->ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)}
: perf_info{4, WAIT_USE(lds, 4)};
case instr_class::exp: return {16, WAIT_USE(export_gds, 16)};
case instr_class::vmem: return {4, WAIT_USE(vmem, 4)};
case instr_class::barrier:
case instr_class::waitcnt:
case instr_class::other:
default:
return {4};
default: return {4};
}
}
#undef WAIT_USE
#undef WAIT
#undef WAIT_USE
#undef WAIT
}
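/* Reading the table above (GFX10 branch, and assuming v_rcp_f32 carries the
 * usual valu_transcendental32 classification): get_perf_info() returns
 * {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)}, i.e. the result has
 * ~10 cycles of latency, and while the plain VALU issue slot is held for
 * only 1 cycle, the complex pipe stays busy for 4, limiting back-to-back
 * transcendentals to one every 4 cycles. */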
void BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
void
BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
{
perf_info perf = get_perf_info(program, instr);
@ -215,7 +195,8 @@ void BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
}
}
int32_t BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
int32_t
BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
{
perf_info perf = get_perf_info(program, instr);
@ -228,7 +209,8 @@ int32_t BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& in
return cost;
}
static wait_counter_info get_wait_counter_info(aco_ptr<Instruction>& instr)
static wait_counter_info
get_wait_counter_info(aco_ptr<Instruction>& instr)
{
/* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance
* depends a lot on the situation. */
@ -252,8 +234,8 @@ static wait_counter_info get_wait_counter_info(aco_ptr<Instruction>& instr)
bool likely_desc_load = instr->operands[0].size() == 2;
bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
bool const_offset = instr->operands[1].isConstant() &&
(!soe || instr->operands.back().isConstant());
bool const_offset =
instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant());
if (likely_desc_load || const_offset)
return wait_counter_info(0, 0, 30, 0); /* likely to hit L0 cache */
@ -273,7 +255,8 @@ static wait_counter_info get_wait_counter_info(aco_ptr<Instruction>& instr)
return wait_counter_info(0, 0, 0, 0);
}
static wait_imm get_wait_imm(Program *program, aco_ptr<Instruction>& instr)
static wait_imm
get_wait_imm(Program* program, aco_ptr<Instruction>& instr)
{
if (instr->opcode == aco_opcode::s_endpgm) {
return wait_imm(0, 0, 0, 0);
@ -297,7 +280,8 @@ static wait_imm get_wait_imm(Program *program, aco_ptr<Instruction>& instr)
}
}
unsigned BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
unsigned
BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
{
int deps_available = cur_cycle;
@ -337,13 +321,15 @@ unsigned BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
return deps_available - cur_cycle;
}
unsigned BlockCycleEstimator::predict_cost(aco_ptr<Instruction>& instr)
unsigned
BlockCycleEstimator::predict_cost(aco_ptr<Instruction>& instr)
{
int32_t dep = get_dependency_cost(instr);
return dep + std::max(cycles_until_res_available(instr) - dep, 0);
}
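/* Worked example (made-up cycle counts): if an instruction's operands become
 * ready in dep = 6 cycles but its issue resource frees up after only 4, the
 * resource wait is hidden: 6 + max(4 - 6, 0) = 6. With dep = 2 and the
 * resource busy for 9 more cycles, the cost is 2 + max(9 - 2, 0) = 9;
 * whichever limit binds determines the predicted stall. */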
static bool is_vector(aco_opcode op)
static bool
is_vector(aco_opcode op)
{
switch (instr_info.classes[(int)op]) {
case instr_class::valu32:
@ -358,14 +344,13 @@ static bool is_vector(aco_opcode op)
case instr_class::exp:
case instr_class::valu64:
case instr_class::valu_quarter_rate32:
case instr_class::valu_transcendental32:
return true;
default:
return false;
case instr_class::valu_transcendental32: return true;
default: return false;
}
}
void BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
void
BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
{
perf_info perf = get_perf_info(program, instr);
@ -411,13 +396,14 @@ void BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
int32_t result_available = start + MAX2(perf.latency, latency);
for (Definition& def : instr->definitions) {
int32_t *available = &reg_available[def.physReg().reg()];
int32_t* available = &reg_available[def.physReg().reg()];
for (unsigned i = 0; i < def.size(); i++)
available[i] = MAX2(available[i], result_available);
}
}
static void join_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pred, int cycle_diff)
static void
join_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pred, int cycle_diff)
{
for (unsigned i = 0; i < MIN2(queue.size(), pred.size()); i++)
queue.rbegin()[i] = MAX2(queue.rbegin()[i], pred.rbegin()[i] + cycle_diff);
@ -425,7 +411,8 @@ static void join_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pr
queue.push_front(pred[i] + cycle_diff);
}
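/* Example (hypothetical queues of completion times, oldest at the front):
 * joining queue = [4, 14] with pred = [9, 12] and cycle_diff = -2 rebases the
 * predecessor's entries to this block's clock and keeps the worst case per
 * position, newest first: the back becomes max(14, 12 - 2) = 14 and the
 * front max(4, 9 - 2) = 7, giving [7, 14]; any surplus predecessor entries
 * would be prepended on the old end. */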
void BlockCycleEstimator::join(const BlockCycleEstimator& pred)
void
BlockCycleEstimator::join(const BlockCycleEstimator& pred)
{
assert(cur_cycle == 0);
@ -435,8 +422,7 @@ void BlockCycleEstimator::join(const BlockCycleEstimator& pred)
}
for (unsigned i = 0; i < 512; i++)
reg_available[i] = MAX2(reg_available[i],
pred.reg_available[i] - pred.cur_cycle + cur_cycle);
reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle);
join_queue(lgkm, pred.lgkm, -pred.cur_cycle);
join_queue(exp, pred.exp, -pred.cur_cycle);
@ -445,11 +431,12 @@ void BlockCycleEstimator::join(const BlockCycleEstimator& pred)
}
/* instructions/branches/vmem_clauses/smem_clauses/cycles */
void collect_preasm_stats(Program *program)
void
collect_preasm_stats(Program* program)
{
for (Block& block : program->blocks) {
std::set<Instruction *> vmem_clause;
std::set<Instruction *> smem_clause;
std::set<Instruction*> vmem_clause;
std::set<Instruction*> smem_clause;
program->statistics[statistic_instructions] += block.instructions.size();
@ -462,7 +449,8 @@ void collect_preasm_stats(Program *program)
if (instr->isVMEM() && !instr->operands.empty()) {
if (std::none_of(vmem_clause.begin(), vmem_clause.end(),
[&](Instruction *other) {return should_form_clause(instr.get(), other);}))
[&](Instruction* other)
{ return should_form_clause(instr.get(), other); }))
program->statistics[statistic_vmem_clauses]++;
vmem_clause.insert(instr.get());
} else {
@ -471,12 +459,13 @@ void collect_preasm_stats(Program *program)
if (instr->isSMEM() && !instr->operands.empty()) {
if (std::none_of(smem_clause.begin(), smem_clause.end(),
[&](Instruction *other) {return should_form_clause(instr.get(), other);}))
[&](Instruction* other)
{ return should_form_clause(instr.get(), other); }))
program->statistics[statistic_smem_clauses]++;
smem_clause.insert(instr.get());
} else {
smem_clause.clear();
}
}
}
}
@ -514,8 +503,10 @@ void collect_preasm_stats(Program *program)
iter *= pow(0.5, block.uniform_if_depth);
iter *= pow(0.75, block.divergent_if_logical_depth);
bool divergent_if_linear_else = block.logical_preds.empty() && block.linear_preds.size() == 1 && block.linear_succs.size() == 1 &&
program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);
bool divergent_if_linear_else =
block.logical_preds.empty() && block.linear_preds.size() == 1 &&
block.linear_succs.size() == 1 &&
program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);
if (divergent_if_linear_else)
iter *= 0.25;
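/* Example (made-up nesting): a block at uniform_if_depth = 1 and
 * divergent_if_logical_depth = 2 is weighted 0.5^1 * 0.75^2 = 0.28125
 * expected executions; a linear-only else block of a divergent if is scaled
 * by a further 0.25. */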
@ -540,7 +531,8 @@ void collect_preasm_stats(Program *program)
double max_utilization = 1.0;
if (program->workgroup_size != UINT_MAX)
max_utilization = program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
max_utilization =
program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
wave64_per_cycle *= max_utilization;
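/* Example: a 48-invocation workgroup on wave64 hardware occupies
 * align(48, 64) = 64 lanes, so max_utilization = 48 / 64 = 0.75; a
 * 96-invocation workgroup on wave32 gives 96 / align(96, 32) = 1.0 and
 * leaves wave64_per_cycle unchanged. */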
program->statistics[statistic_latency] = round(latency);
@ -551,7 +543,8 @@ void collect_preasm_stats(Program *program)
fprintf(stderr, "num_waves: %u\n", program->num_waves);
fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]);
fprintf(stderr, "branch_sendmsg_usage: %f\n", usage[(int)BlockCycleEstimator::branch_sendmsg]);
fprintf(stderr, "branch_sendmsg_usage: %f\n",
usage[(int)BlockCycleEstimator::branch_sendmsg]);
fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]);
fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]);
fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]);
@ -565,9 +558,10 @@ void collect_preasm_stats(Program *program)
}
}
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code)
void
collect_postasm_stats(Program* program, const std::vector<uint32_t>& code)
{
program->statistics[aco::statistic_hash] = util_hash_crc32(code.data(), code.size() * 4);
}
}
} // namespace aco


@ -35,207 +35,198 @@
namespace aco {
/*! \brief Definition of a span object
*
* \details A "span" is an "array view" type for holding a view of contiguous
* data. The "span" object does not own the data itself.
*/
template <typename T>
class span {
*
* \details A "span" is an "array view" type for holding a view of contiguous
* data. The "span" object does not own the data itself.
*/
template <typename T> class span {
public:
using value_type = T;
using pointer = value_type*;
using const_pointer = const value_type*;
using reference = value_type&;
using const_reference = const value_type&;
using iterator = pointer;
using const_iterator = const_pointer;
using reverse_iterator = std::reverse_iterator<iterator>;
using value_type = T;
using pointer = value_type*;
using const_pointer = const value_type*;
using reference = value_type&;
using const_reference = const value_type&;
using iterator = pointer;
using const_iterator = const_pointer;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
using size_type = uint16_t;
using difference_type = ptrdiff_t;
using size_type = uint16_t;
using difference_type = ptrdiff_t;
/*! \brief Compiler generated default constructor
*/
*/
constexpr span() = default;
/*! \brief Constructor taking a pointer and the length of the span
* \param[in] data Pointer to the underlying data array
* \param[in] length The size of the span
*/
constexpr span(uint16_t offset_, const size_type length_)
: offset{ offset_ } , length{ length_ } {}
* \param[in] offset_ Byte offset from the span object to the underlying data
* \param[in] length_ The size of the span
*/
constexpr span(uint16_t offset_, const size_type length_) : offset{offset_}, length{length_} {}
/*! \brief Returns an iterator to the begin of the span
* \return data
*/
constexpr iterator begin() noexcept {
return (pointer)((uintptr_t)this + offset);
}
* \return data
*/
constexpr iterator begin() noexcept { return (pointer)((uintptr_t)this + offset); }
/*! \brief Returns a const_iterator to the begin of the span
* \return data
*/
constexpr const_iterator begin() const noexcept {
* \return data
*/
constexpr const_iterator begin() const noexcept
{
return (const_pointer)((uintptr_t)this + offset);
}
/*! \brief Returns an iterator to the end of the span
* \return data + length
*/
constexpr iterator end() noexcept {
return std::next(begin(), length);
}
* \return data + length
*/
constexpr iterator end() noexcept { return std::next(begin(), length); }
/*! \brief Returns a const_iterator to the end of the span
* \return data + length
*/
constexpr const_iterator end() const noexcept {
return std::next(begin(), length);
}
* \return data + length
*/
constexpr const_iterator end() const noexcept { return std::next(begin(), length); }
/*! \brief Returns a const_iterator to the begin of the span
* \return data
*/
constexpr const_iterator cbegin() const noexcept {
return begin();
}
* \return data
*/
constexpr const_iterator cbegin() const noexcept { return begin(); }
/*! \brief Returns a const_iterator to the end of the span
* \return data + length
*/
constexpr const_iterator cend() const noexcept {
return std::next(begin(), length);
}
* \return data + length
*/
constexpr const_iterator cend() const noexcept { return std::next(begin(), length); }
/*! \brief Returns a reverse_iterator to the end of the span
* \return reverse_iterator(end())
*/
constexpr reverse_iterator rbegin() noexcept {
return reverse_iterator(end());
}
* \return reverse_iterator(end())
*/
constexpr reverse_iterator rbegin() noexcept { return reverse_iterator(end()); }
/*! \brief Returns a const_reverse_iterator to the end of the span
* \return reverse_iterator(end())
*/
constexpr const_reverse_iterator rbegin() const noexcept {
* \return reverse_iterator(end())
*/
constexpr const_reverse_iterator rbegin() const noexcept
{
return const_reverse_iterator(end());
}
/*! \brief Returns a reverse_iterator to the begin of the span
* \return reverse_iterator(begin())
*/
constexpr reverse_iterator rend() noexcept {
return reverse_iterator(begin());
}
* \return reverse_iterator(begin())
*/
constexpr reverse_iterator rend() noexcept { return reverse_iterator(begin()); }
/*! \brief Returns a const_reverse_iterator to the begin of the span
* \return reverse_iterator(begin())
*/
constexpr const_reverse_iterator rend() const noexcept {
* \return reverse_iterator(begin())
*/
constexpr const_reverse_iterator rend() const noexcept
{
return const_reverse_iterator(begin());
}
/*! \brief Returns a const_reverse_iterator to the end of the span
* \return rbegin()
*/
constexpr const_reverse_iterator crbegin() const noexcept {
* \return rbegin()
*/
constexpr const_reverse_iterator crbegin() const noexcept
{
return const_reverse_iterator(cend());
}
/*! \brief Returns a const_reverse_iterator to the begin of the span
* \return rend()
*/
constexpr const_reverse_iterator crend() const noexcept {
* \return rend()
*/
constexpr const_reverse_iterator crend() const noexcept
{
return const_reverse_iterator(cbegin());
}
/*! \brief Unchecked access operator
* \param[in] index Index of the element we want to access
* \return *(std::next(data, index))
*/
constexpr reference operator[](const size_type index) noexcept {
* \param[in] index Index of the element we want to access
* \return *(std::next(data, index))
*/
constexpr reference operator[](const size_type index) noexcept
{
assert(length > index);
return *(std::next(begin(), index));
}
/*! \brief Unchecked const access operator
* \param[in] index Index of the element we want to access
* \return *(std::next(data, index))
*/
constexpr const_reference operator[](const size_type index) const noexcept {
* \param[in] index Index of the element we want to access
* \return *(std::next(data, index))
*/
constexpr const_reference operator[](const size_type index) const noexcept
{
assert(length > index);
return *(std::next(begin(), index));
}
/*! \brief Returns a reference to the last element of the span
* \return *(std::next(data, length - 1))
*/
constexpr reference back() noexcept {
* \return *(std::next(data, length - 1))
*/
constexpr reference back() noexcept
{
assert(length > 0);
return *(std::next(begin(), length - 1));
}
/*! \brief Returns a const_reference to the last element of the span
* \return *(std::next(data, length - 1))
*/
constexpr const_reference back() const noexcept {
* \return *(std::next(data, length - 1))
*/
constexpr const_reference back() const noexcept
{
assert(length > 0);
return *(std::next(begin(), length - 1));
}
/*! \brief Returns a reference to the first element of the span
* \return *begin()
*/
constexpr reference front() noexcept {
* \return *begin()
*/
constexpr reference front() noexcept
{
assert(length > 0);
return *begin();
}
/*! \brief Returns a const_reference to the first element of the span
* \return *cbegin()
*/
constexpr const_reference front() const noexcept {
* \return *cbegin()
*/
constexpr const_reference front() const noexcept
{
assert(length > 0);
return *cbegin();
}
/*! \brief Returns true if the span is empty
* \return length == 0
*/
constexpr bool empty() const noexcept {
return length == 0;
}
* \return length == 0
*/
constexpr bool empty() const noexcept { return length == 0; }
/*! \brief Returns the size of the span
* \return length == 0
*/
constexpr size_type size() const noexcept {
return length;
}
* \return length
*/
constexpr size_type size() const noexcept { return length; }
/*! \brief Decreases the size of the span by 1
*/
constexpr void pop_back() noexcept {
*/
constexpr void pop_back() noexcept
{
assert(length > 0);
--length;
}
/*! \brief Adds an element to the end of the span
*/
constexpr void push_back(const_reference val) noexcept {
*std::next(begin(), length++) = val;
}
*/
constexpr void push_back(const_reference val) noexcept { *std::next(begin(), length++) = val; }
/*! \brief Clears the span
*/
constexpr void clear() noexcept {
*/
constexpr void clear() noexcept
{
offset = 0;
length = 0;
}
private:
uint16_t offset{ 0 }; //!> Byte offset from span to data
size_type length{ 0 }; //!> Size of the span
uint16_t offset{0}; //!> Byte offset from span to data
size_type length{0}; //!> Size of the span
};
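/* Usage sketch (hypothetical struct, assuming only the interface above):
 * because begin() adds `offset` to `this`, a span is only meaningful inside
 * the same allocation as the elements it views, and copying the span object
 * to a new address silently retargets it relative to that address; hence it
 * suits intrusive, fixed-layout containers. E.g.: */
#include <cstdint>

struct PackedList {
   span<uint32_t> ids; /* views `storage` below */
   uint32_t storage[4];

   PackedList() : ids(uint16_t((uintptr_t)storage - (uintptr_t)&ids), 4) {}
};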
/*
@ -250,30 +241,32 @@ private:
*/
struct IDSet {
struct Iterator {
const IDSet *set;
const IDSet* set;
union {
struct {
uint32_t bit:6;
uint32_t word:26;
uint32_t bit : 6;
uint32_t word : 26;
};
uint32_t id;
};
Iterator& operator ++();
Iterator& operator++();
bool operator != (const Iterator& other) const;
bool operator!=(const Iterator& other) const;
uint32_t operator * () const;
uint32_t operator*() const;
};
size_t count(uint32_t id) const {
size_t count(uint32_t id) const
{
if (id >= words.size() * 64)
return 0;
return words[id / 64u] & (1ull << (id % 64u)) ? 1 : 0;
}
Iterator find(uint32_t id) const {
Iterator find(uint32_t id) const
{
if (!count(id))
return end();
@ -284,7 +277,8 @@ struct IDSet {
return it;
}
std::pair<Iterator, bool> insert(uint32_t id) {
std::pair<Iterator, bool> insert(uint32_t id)
{
if (words.size() * 64u <= id)
words.resize(id / 64u + 1);
@ -302,7 +296,8 @@ struct IDSet {
return std::make_pair(it, true);
}
size_t erase(uint32_t id) {
size_t erase(uint32_t id)
{
if (!count(id))
return 0;
@ -311,7 +306,8 @@ struct IDSet {
return 1;
}
Iterator cbegin() const {
Iterator cbegin() const
{
Iterator it;
it.set = this;
for (size_t i = 0; i < words.size(); i++) {
@ -324,7 +320,8 @@ struct IDSet {
return end();
}
Iterator cend() const {
Iterator cend() const
{
Iterator it;
it.set = this;
it.word = words.size();
@ -332,27 +329,21 @@ struct IDSet {
return it;
}
Iterator begin() const {
return cbegin();
}
Iterator begin() const { return cbegin(); }
Iterator end() const {
return cend();
}
Iterator end() const { return cend(); }
bool empty() const {
return bits_set == 0;
}
bool empty() const { return bits_set == 0; }
size_t size() const {
return bits_set;
}
size_t size() const { return bits_set; }
std::vector<uint64_t> words;
uint32_t bits_set = 0;
};
inline IDSet::Iterator& IDSet::Iterator::operator ++() {
inline IDSet::Iterator&
IDSet::Iterator::operator++()
{
uint64_t m = set->words[word];
m &= ~((2ull << bit) - 1ull);
if (!m) {
@ -374,12 +365,16 @@ inline IDSet::Iterator& IDSet::Iterator::operator ++() {
return *this;
}
inline bool IDSet::Iterator::operator != (const IDSet::Iterator& other) const {
inline bool
IDSet::Iterator::operator!=(const IDSet::Iterator& other) const
{
assert(set == other.set);
return id != other.id;
}
inline uint32_t IDSet::Iterator::operator * () const {
inline uint32_t
IDSet::Iterator::operator*() const
{
return (word << 6) | bit;
}
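/* Usage sketch (assumed values): ids are split as id = word * 64 + bit, one
 * bit per SSA id. E.g.:
 *
 *    IDSet live;
 *    live.insert(70);            // sets words[1] bit 6
 *    live.insert(3);             // sets words[0] bit 3
 *    assert(live.count(70) && live.size() == 2);
 *    for (uint32_t id : live)    // visits 3, then 70, in increasing order
 *       ;                        // ... use id ...
 *    live.erase(3);              // size() drops back to 1
 */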


@ -23,6 +23,7 @@
*/
#include "aco_ir.h"
#include "util/memstream.h"
#include <array>
@ -32,11 +33,11 @@
namespace aco {
static void aco_log(Program *program, enum radv_compiler_debug_level level,
const char *prefix, const char *file, unsigned line,
const char *fmt, va_list args)
static void
aco_log(Program* program, enum radv_compiler_debug_level level, const char* prefix,
const char* file, unsigned line, const char* fmt, va_list args)
{
char *msg;
char* msg;
if (program->debug.shorten_messages) {
msg = ralloc_vasprintf(NULL, fmt, args);
@ -55,38 +56,39 @@ static void aco_log(Program *program, enum radv_compiler_debug_level level,
ralloc_free(msg);
}
void _aco_perfwarn(Program *program, const char *file, unsigned line,
const char *fmt, ...)
void
_aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...)
{
va_list args;
va_start(args, fmt);
aco_log(program, RADV_COMPILER_DEBUG_LEVEL_PERFWARN,
"ACO PERFWARN:\n", file, line, fmt, args);
aco_log(program, RADV_COMPILER_DEBUG_LEVEL_PERFWARN, "ACO PERFWARN:\n", file, line, fmt, args);
va_end(args);
}
void _aco_err(Program *program, const char *file, unsigned line,
const char *fmt, ...)
void
_aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...)
{
va_list args;
va_start(args, fmt);
aco_log(program, RADV_COMPILER_DEBUG_LEVEL_ERROR,
"ACO ERROR:\n", file, line, fmt, args);
aco_log(program, RADV_COMPILER_DEBUG_LEVEL_ERROR, "ACO ERROR:\n", file, line, fmt, args);
va_end(args);
}
bool validate_ir(Program* program)
bool
validate_ir(Program* program)
{
bool is_valid = true;
auto check = [&program, &is_valid](bool success, const char * msg, aco::Instruction * instr) -> void {
auto check = [&program, &is_valid](bool success, const char* msg,
aco::Instruction* instr) -> void
{
if (!success) {
char *out;
char* out;
size_t outsize;
struct u_memstream mem;
u_memstream_open(&mem, &out, &outsize);
FILE *const memf = u_memstream_get(&mem);
FILE* const memf = u_memstream_get(&mem);
fprintf(memf, "%s: ", msg);
aco_print_instr(instr, memf);
@ -99,7 +101,9 @@ bool validate_ir(Program* program)
}
};
auto check_block = [&program, &is_valid](bool success, const char * msg, aco::Block * block) -> void {
auto check_block = [&program, &is_valid](bool success, const char* msg,
aco::Block* block) -> void
{
if (!success) {
aco_err(program, "%s: BB%u", msg, block->index);
is_valid = false;
@ -132,32 +136,32 @@ bool validate_ir(Program* program)
base_format = Format::VINTRP;
}
}
check(base_format == instr_info.format[(int)instr->opcode], "Wrong base format for instruction", instr.get());
check(base_format == instr_info.format[(int)instr->opcode],
"Wrong base format for instruction", instr.get());
/* check VOP3 modifiers */
if (instr->isVOP3() && instr->format != Format::VOP3) {
check(base_format == Format::VOP2 ||
base_format == Format::VOP1 ||
base_format == Format::VOPC ||
base_format == Format::VINTRP,
check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
base_format == Format::VOPC || base_format == Format::VINTRP,
"Format cannot have VOP3/VOP3B applied", instr.get());
}
/* check SDWA */
if (instr->isSDWA()) {
check(base_format == Format::VOP2 ||
base_format == Format::VOP1 ||
base_format == Format::VOPC,
check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
base_format == Format::VOPC,
"Format cannot have SDWA applied", instr.get());
check(program->chip_class >= GFX8, "SDWA is GFX8+ only", instr.get());
SDWA_instruction& sdwa = instr->sdwa();
check(sdwa.omod == 0 || program->chip_class >= GFX9, "SDWA omod only supported on GFX9+", instr.get());
check(sdwa.omod == 0 || program->chip_class >= GFX9,
"SDWA omod only supported on GFX9+", instr.get());
if (base_format == Format::VOPC) {
check(sdwa.clamp == false || program->chip_class == GFX8, "SDWA VOPC clamp only supported on GFX8", instr.get());
check(sdwa.clamp == false || program->chip_class == GFX8,
"SDWA VOPC clamp only supported on GFX8", instr.get());
check((instr->definitions[0].isFixed() && instr->definitions[0].physReg() == vcc) ||
program->chip_class >= GFX9,
program->chip_class >= GFX9,
"SDWA+VOPC definition must be fixed to vcc on GFX8", instr.get());
}
@ -171,8 +175,7 @@ bool validate_ir(Program* program)
}
const bool sdwa_opcodes =
instr->opcode != aco_opcode::v_fmac_f32 &&
instr->opcode != aco_opcode::v_fmac_f16 &&
instr->opcode != aco_opcode::v_fmac_f32 && instr->opcode != aco_opcode::v_fmac_f16 &&
instr->opcode != aco_opcode::v_fmamk_f32 &&
instr->opcode != aco_opcode::v_fmaak_f32 &&
instr->opcode != aco_opcode::v_fmamk_f16 &&
@ -186,67 +189,75 @@ bool validate_ir(Program* program)
const bool feature_mac =
program->chip_class == GFX8 &&
(instr->opcode == aco_opcode::v_mac_f32 &&
instr->opcode == aco_opcode::v_mac_f16);
(instr->opcode == aco_opcode::v_mac_f32 && instr->opcode == aco_opcode::v_mac_f16);
check(sdwa_opcodes || feature_mac, "SDWA can't be used with this opcode", instr.get());
if (instr->definitions[0].regClass().is_subdword())
check((sdwa.dst_sel & sdwa_asuint) == (sdwa_isra | instr->definitions[0].bytes()), "Unexpected SDWA sel for sub-dword definition", instr.get());
check((sdwa.dst_sel & sdwa_asuint) == (sdwa_isra | instr->definitions[0].bytes()),
"Unexpected SDWA sel for sub-dword definition", instr.get());
}
/* check opsel */
if (instr->isVOP3()) {
VOP3_instruction& vop3 = instr->vop3();
check(vop3.opsel == 0 || program->chip_class >= GFX9, "Opsel is only supported on GFX9+", instr.get());
check(vop3.opsel == 0 || program->chip_class >= GFX9,
"Opsel is only supported on GFX9+", instr.get());
for (unsigned i = 0; i < 3; i++) {
if (i >= instr->operands.size() ||
(instr->operands[i].hasRegClass() && instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed()))
(instr->operands[i].hasRegClass() &&
instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed()))
check((vop3.opsel & (1 << i)) == 0, "Unexpected opsel for operand", instr.get());
}
if (instr->definitions[0].regClass().is_subdword() && !instr->definitions[0].isFixed())
check((vop3.opsel & (1 << 3)) == 0, "Unexpected opsel for sub-dword definition", instr.get());
check((vop3.opsel & (1 << 3)) == 0, "Unexpected opsel for sub-dword definition",
instr.get());
}
/* check for undefs */
for (unsigned i = 0; i < instr->operands.size(); i++) {
if (instr->operands[i].isUndefined()) {
bool flat = instr->isFlatLike();
bool can_be_undef = is_phi(instr) || instr->isEXP() ||
instr->isReduction() ||
bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() ||
instr->opcode == aco_opcode::p_create_vector ||
(flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) ||
((instr->isMUBUF() || instr->isMTBUF()) && i == 1);
check(can_be_undef, "Undefs can only be used in certain operands", instr.get());
} else {
check(instr->operands[i].isFixed() || instr->operands[i].isTemp() || instr->operands[i].isConstant(), "Uninitialized Operand", instr.get());
check(instr->operands[i].isFixed() || instr->operands[i].isTemp() ||
instr->operands[i].isConstant(),
"Uninitialized Operand", instr.get());
}
}
/* check subdword definitions */
for (unsigned i = 0; i < instr->definitions.size(); i++) {
if (instr->definitions[i].regClass().is_subdword())
check(instr->isPseudo() || instr->definitions[i].bytes() <= 4, "Only Pseudo instructions can write subdword registers larger than 4 bytes", instr.get());
check(instr->isPseudo() || instr->definitions[i].bytes() <= 4,
"Only Pseudo instructions can write subdword registers larger than 4 bytes",
instr.get());
}
if (instr->isSALU() || instr->isVALU()) {
/* check literals */
Operand literal(s1);
for (unsigned i = 0; i < instr->operands.size(); i++)
{
for (unsigned i = 0; i < instr->operands.size(); i++) {
Operand op = instr->operands[i];
if (!op.isLiteral())
continue;
check(!instr->isDPP() && !instr->isSDWA() &&
(!instr->isVOP3() || program->chip_class >= GFX10) &&
(!instr->isVOP3P() || program->chip_class >= GFX10),
(!instr->isVOP3() || program->chip_class >= GFX10) &&
(!instr->isVOP3P() || program->chip_class >= GFX10),
"Literal applied on wrong instruction format", instr.get());
check(literal.isUndefined() || (literal.size() == op.size() && literal.constantValue() == op.constantValue()), "Only 1 Literal allowed", instr.get());
check(literal.isUndefined() || (literal.size() == op.size() &&
literal.constantValue() == op.constantValue()),
"Only 1 Literal allowed", instr.get());
literal = op;
check(instr->isSALU() || instr->isVOP3() || instr->isVOP3P() || i == 0 || i == 2, "Wrong source position for Literal argument", instr.get());
check(instr->isSALU() || instr->isVOP3() || instr->isVOP3P() || i == 0 || i == 2,
"Wrong source position for Literal argument", instr.get());
}
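/* Example (assumed encoding rules): v_add_f32 v0, 0x3e800000, v1 passes the
 * checks above (a single literal in source 0), while v_add_f32 v0, v1,
 * 0x3e800000 fails the source-position check for plain VOP2, and an
 * instruction using two distinct literal values fails the "Only 1 Literal
 * allowed" check. */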
/* check num sgprs for VALU */
@ -264,8 +275,7 @@ bool validate_ir(Program* program)
else if (instr->isDPP())
scalar_mask = 0x0;
if (instr->isVOPC() ||
instr->opcode == aco_opcode::v_readfirstlane_b32 ||
if (instr->isVOPC() || instr->opcode == aco_opcode::v_readfirstlane_b32 ||
instr->opcode == aco_opcode::v_readlane_b32 ||
instr->opcode == aco_opcode::v_readlane_b32_e64) {
check(instr->definitions[0].getTemp().type() == RegType::sgpr,
@ -277,45 +287,42 @@ bool validate_ir(Program* program)
unsigned num_sgprs = 0;
unsigned sgpr[] = {0, 0};
for (unsigned i = 0; i < instr->operands.size(); i++)
{
for (unsigned i = 0; i < instr->operands.size(); i++) {
Operand op = instr->operands[i];
if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
instr->opcode == aco_opcode::v_readlane_b32 ||
instr->opcode == aco_opcode::v_readlane_b32_e64) {
check(i != 1 ||
(op.isTemp() && op.regClass().type() == RegType::sgpr) ||
op.isConstant(),
check(i != 1 || (op.isTemp() && op.regClass().type() == RegType::sgpr) ||
op.isConstant(),
"Must be a SGPR or a constant", instr.get());
check(i == 1 ||
(op.isTemp() && op.regClass().type() == RegType::vgpr && op.bytes() <= 4),
check(i == 1 || (op.isTemp() && op.regClass().type() == RegType::vgpr &&
op.bytes() <= 4),
"Wrong Operand type for VALU instruction", instr.get());
continue;
}
if (instr->opcode == aco_opcode::v_permlane16_b32 ||
instr->opcode == aco_opcode::v_permlanex16_b32) {
check(i != 0 ||
(op.isTemp() && op.regClass().type() == RegType::vgpr),
check(i != 0 || (op.isTemp() && op.regClass().type() == RegType::vgpr),
"Operand 0 of v_permlane must be VGPR", instr.get());
check(i == 0 ||
(op.isTemp() && op.regClass().type() == RegType::sgpr) ||
op.isConstant(),
"Lane select operands of v_permlane must be SGPR or constant", instr.get());
check(i == 0 || (op.isTemp() && op.regClass().type() == RegType::sgpr) ||
op.isConstant(),
"Lane select operands of v_permlane must be SGPR or constant",
instr.get());
}
if (instr->opcode == aco_opcode::v_writelane_b32 ||
instr->opcode == aco_opcode::v_writelane_b32_e64) {
check(i != 2 ||
(op.isTemp() && op.regClass().type() == RegType::vgpr && op.bytes() <= 4),
check(i != 2 || (op.isTemp() && op.regClass().type() == RegType::vgpr &&
op.bytes() <= 4),
"Wrong Operand type for VALU instruction", instr.get());
check(i == 2 ||
(op.isTemp() && op.regClass().type() == RegType::sgpr) ||
op.isConstant(),
check(i == 2 || (op.isTemp() && op.regClass().type() == RegType::sgpr) ||
op.isConstant(),
"Must be a SGPR or a constant", instr.get());
continue;
}
if (op.isTemp() && instr->operands[i].regClass().type() == RegType::sgpr) {
check(scalar_mask & (1 << i), "Wrong source position for SGPR argument", instr.get());
check(scalar_mask & (1 << i), "Wrong source position for SGPR argument",
instr.get());
if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
if (num_sgprs < 2)
@ -324,19 +331,22 @@ bool validate_ir(Program* program)
}
if (op.isConstant() && !op.isLiteral())
check(scalar_mask & (1 << i), "Wrong source position for constant argument", instr.get());
check(scalar_mask & (1 << i), "Wrong source position for constant argument",
instr.get());
}
check(num_sgprs + (literal.isUndefined() ? 0 : 1) <= const_bus_limit, "Too many SGPRs/literals", instr.get());
check(num_sgprs + (literal.isUndefined() ? 0 : 1) <= const_bus_limit,
"Too many SGPRs/literals", instr.get());
}
if (instr->isSOP1() || instr->isSOP2()) {
check(instr->definitions[0].getTemp().type() == RegType::sgpr, "Wrong Definition type for SALU instruction", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::sgpr,
"Wrong Definition type for SALU instruction", instr.get());
for (const Operand& op : instr->operands) {
check(op.isConstant() || op.regClass().type() <= RegType::sgpr,
"Wrong Operand type for SALU instruction", instr.get());
check(op.isConstant() || op.regClass().type() <= RegType::sgpr,
"Wrong Operand type for SALU instruction", instr.get());
}
}
}
}
switch (instr->format) {
case Format::PSEUDO: {
@ -346,7 +356,8 @@ bool validate_ir(Program* program)
check(op.bytes() < 4 || size % 4 == 0, "Operand is not aligned", instr.get());
size += op.bytes();
}
check(size == instr->definitions[0].bytes(), "Definition size does not match operand sizes", instr.get());
check(size == instr->definitions[0].bytes(),
"Definition size does not match operand sizes", instr.get());
if (instr->definitions[0].getTemp().type() == RegType::sgpr) {
for (const Operand& op : instr->operands) {
check(op.isConstant() || op.regClass().type() == RegType::sgpr,
@ -354,55 +365,75 @@ bool validate_ir(Program* program)
}
}
} else if (instr->opcode == aco_opcode::p_extract_vector) {
check((instr->operands[0].isTemp()) && instr->operands[1].isConstant(), "Wrong Operand types", instr.get());
check((instr->operands[1].constantValue() + 1) * instr->definitions[0].bytes() <= instr->operands[0].bytes(), "Index out of range", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->operands[0].regClass().type() == RegType::sgpr,
check((instr->operands[0].isTemp()) && instr->operands[1].isConstant(),
"Wrong Operand types", instr.get());
check((instr->operands[1].constantValue() + 1) * instr->definitions[0].bytes() <=
instr->operands[0].bytes(),
"Index out of range", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::vgpr ||
instr->operands[0].regClass().type() == RegType::sgpr,
"Cannot extract SGPR value from VGPR vector", instr.get());
check(program->chip_class >= GFX9 || !instr->definitions[0].regClass().is_subdword() ||
instr->operands[0].regClass().type() == RegType::vgpr, "Cannot extract subdword from SGPR before GFX9+", instr.get());
check(program->chip_class >= GFX9 ||
!instr->definitions[0].regClass().is_subdword() ||
instr->operands[0].regClass().type() == RegType::vgpr,
"Cannot extract subdword from SGPR before GFX9+", instr.get());
} else if (instr->opcode == aco_opcode::p_split_vector) {
check(instr->operands[0].isTemp(), "Operand must be a temporary", instr.get());
unsigned size = 0;
for (const Definition& def : instr->definitions) {
size += def.bytes();
}
check(size == instr->operands[0].bytes(), "Operand size does not match definition sizes", instr.get());
check(size == instr->operands[0].bytes(),
"Operand size does not match definition sizes", instr.get());
if (instr->operands[0].getTemp().type() == RegType::vgpr) {
for (const Definition& def : instr->definitions)
check(def.regClass().type() == RegType::vgpr, "Wrong Definition type for VGPR split_vector", instr.get());
check(def.regClass().type() == RegType::vgpr,
"Wrong Definition type for VGPR split_vector", instr.get());
} else {
for (const Definition& def : instr->definitions)
check(program->chip_class >= GFX9 || !def.regClass().is_subdword(), "Cannot split SGPR into subdword VGPRs before GFX9+", instr.get());
check(program->chip_class >= GFX9 || !def.regClass().is_subdword(),
"Cannot split SGPR into subdword VGPRs before GFX9+", instr.get());
}
} else if (instr->opcode == aco_opcode::p_parallelcopy) {
check(instr->definitions.size() == instr->operands.size(), "Number of Operands does not match number of Definitions", instr.get());
check(instr->definitions.size() == instr->operands.size(),
"Number of Operands does not match number of Definitions", instr.get());
for (unsigned i = 0; i < instr->operands.size(); i++) {
check(instr->definitions[i].bytes() == instr->operands[i].bytes(), "Operand and Definition size must match", instr.get());
check(instr->definitions[i].bytes() == instr->operands[i].bytes(),
"Operand and Definition size must match", instr.get());
if (instr->operands[i].isTemp())
check((instr->definitions[i].getTemp().type() == instr->operands[i].regClass().type()) ||
(instr->definitions[i].getTemp().type() == RegType::vgpr && instr->operands[i].regClass().type() == RegType::sgpr),
check((instr->definitions[i].getTemp().type() ==
instr->operands[i].regClass().type()) ||
(instr->definitions[i].getTemp().type() == RegType::vgpr &&
instr->operands[i].regClass().type() == RegType::sgpr),
"Operand and Definition types do not match", instr.get());
}
} else if (instr->opcode == aco_opcode::p_phi) {
check(instr->operands.size() == block.logical_preds.size(), "Number of Operands does not match number of predecessors", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::vgpr, "Logical Phi Definition must be vgpr", instr.get());
check(instr->operands.size() == block.logical_preds.size(),
"Number of Operands does not match number of predecessors", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::vgpr,
"Logical Phi Definition must be vgpr", instr.get());
for (const Operand& op : instr->operands)
check(instr->definitions[0].size() == op.size(), "Operand sizes must match Definition size", instr.get());
check(instr->definitions[0].size() == op.size(),
"Operand sizes must match Definition size", instr.get());
} else if (instr->opcode == aco_opcode::p_linear_phi) {
for (const Operand& op : instr->operands) {
check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type", instr.get());
check(instr->definitions[0].size() == op.size(), "Operand sizes must match Definition size", instr.get());
check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type",
instr.get());
check(instr->definitions[0].size() == op.size(),
"Operand sizes must match Definition size", instr.get());
}
check(instr->operands.size() == block.linear_preds.size(), "Number of Operands does not match number of predecessors", instr.get());
} else if (instr->opcode == aco_opcode::p_extract || instr->opcode == aco_opcode::p_insert) {
check(instr->operands[0].isTemp(),
"Data operand must be temporary", instr.get());
check(instr->operands.size() == block.linear_preds.size(),
"Number of Operands does not match number of predecessors", instr.get());
} else if (instr->opcode == aco_opcode::p_extract ||
instr->opcode == aco_opcode::p_insert) {
check(instr->operands[0].isTemp(), "Data operand must be temporary", instr.get());
check(instr->operands[1].isConstant(), "Index must be constant", instr.get());
if (instr->opcode == aco_opcode::p_extract)
check(instr->operands[3].isConstant(), "Sign-extend flag must be constant", instr.get());
check(instr->operands[3].isConstant(), "Sign-extend flag must be constant",
instr.get());
check(instr->definitions[0].getTemp().type() != RegType::sgpr ||
instr->operands[0].getTemp().type() == RegType::sgpr,
instr->operands[0].getTemp().type() == RegType::sgpr,
"Can't extract/insert VGPR to SGPR", instr.get());
if (instr->operands[0].getTemp().type() == RegType::vgpr)
@ -410,69 +441,106 @@ bool validate_ir(Program* program)
"Sizes of operand and definition must match", instr.get());
if (instr->definitions[0].getTemp().type() == RegType::sgpr)
check(instr->definitions.size() >= 2 && instr->definitions[1].isFixed() && instr->definitions[1].physReg() == scc, "SGPR extract/insert needs a SCC definition", instr.get());
check(instr->definitions.size() >= 2 && instr->definitions[1].isFixed() &&
instr->definitions[1].physReg() == scc,
"SGPR extract/insert needs a SCC definition", instr.get());
check(instr->operands[2].constantEquals(8) || instr->operands[2].constantEquals(16), "Size must be 8 or 16", instr.get());
check(instr->operands[2].constantValue() < instr->operands[0].getTemp().bytes() * 8u, "Size must be smaller than source", instr.get());
check(instr->operands[2].constantEquals(8) || instr->operands[2].constantEquals(16),
"Size must be 8 or 16", instr.get());
check(instr->operands[2].constantValue() < instr->operands[0].getTemp().bytes() * 8u,
"Size must be smaller than source", instr.get());
unsigned comp = instr->operands[0].bytes() * 8u / MAX2(instr->operands[2].constantValue(), 1);
check(instr->operands[1].constantValue() < comp, "Index must be in-bounds", instr.get());
unsigned comp =
instr->operands[0].bytes() * 8u / MAX2(instr->operands[2].constantValue(), 1);
check(instr->operands[1].constantValue() < comp, "Index must be in-bounds",
instr.get());
}
break;
}
case Format::PSEUDO_REDUCTION: {
for (const Operand &op : instr->operands)
check(op.regClass().type() == RegType::vgpr, "All operands of PSEUDO_REDUCTION instructions must be in VGPRs.", instr.get());
for (const Operand& op : instr->operands)
check(op.regClass().type() == RegType::vgpr,
"All operands of PSEUDO_REDUCTION instructions must be in VGPRs.",
instr.get());
if (instr->opcode == aco_opcode::p_reduce && instr->reduction().cluster_size == program->wave_size)
check(instr->definitions[0].regClass().type() == RegType::sgpr || program->wave_size == 32, "The result of unclustered reductions must go into an SGPR.", instr.get());
if (instr->opcode == aco_opcode::p_reduce &&
instr->reduction().cluster_size == program->wave_size)
check(instr->definitions[0].regClass().type() == RegType::sgpr ||
program->wave_size == 32,
"The result of unclustered reductions must go into an SGPR.", instr.get());
else
check(instr->definitions[0].regClass().type() == RegType::vgpr, "The result of scans and clustered reductions must go into a VGPR.", instr.get());
check(instr->definitions[0].regClass().type() == RegType::vgpr,
"The result of scans and clustered reductions must go into a VGPR.",
instr.get());
break;
}
case Format::SMEM: {
if (instr->operands.size() >= 1)
check((instr->operands[0].isFixed() && !instr->operands[0].isConstant()) ||
(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr), "SMEM operands must be sgpr", instr.get());
(instr->operands[0].isTemp() &&
instr->operands[0].regClass().type() == RegType::sgpr),
"SMEM operands must be sgpr", instr.get());
if (instr->operands.size() >= 2)
check(instr->operands[1].isConstant() || (instr->operands[1].isTemp() && instr->operands[1].regClass().type() == RegType::sgpr),
check(instr->operands[1].isConstant() ||
(instr->operands[1].isTemp() &&
instr->operands[1].regClass().type() == RegType::sgpr),
"SMEM offset must be constant or sgpr", instr.get());
if (!instr->definitions.empty())
check(instr->definitions[0].getTemp().type() == RegType::sgpr, "SMEM result must be sgpr", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::sgpr,
"SMEM result must be sgpr", instr.get());
break;
}
case Format::MTBUF:
case Format::MUBUF: {
check(instr->operands.size() > 1, "VMEM instructions must have at least one operand", instr.get());
check(instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::vgpr,
check(instr->operands.size() > 1, "VMEM instructions must have at least one operand",
instr.get());
check(instr->operands[1].hasRegClass() &&
instr->operands[1].regClass().type() == RegType::vgpr,
"VADDR must be in vgpr for VMEM instructions", instr.get());
check(
instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr,
"VMEM resource constant must be sgpr", instr.get());
check(instr->operands.size() < 4 ||
(instr->operands[3].isTemp() &&
instr->operands[3].regClass().type() == RegType::vgpr),
"VMEM write data must be vgpr", instr.get());
break;
}
case Format::MIMG: {
check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands", instr.get());
check(instr->operands[0].hasRegClass() && (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands",
instr.get());
check(instr->operands[0].hasRegClass() &&
(instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
"MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get());
if (instr->operands[1].hasRegClass())
check(instr->operands[1].regClass() == s4, "MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get());
check(instr->operands[1].regClass() == s4,
"MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get());
if (!instr->operands[2].isUndefined()) {
bool is_cmpswap = instr->opcode == aco_opcode::image_atomic_cmpswap ||
instr->opcode == aco_opcode::image_atomic_fcmpswap;
check(instr->definitions.empty() ||
(instr->definitions[0].regClass() == instr->operands[2].regClass() ||
is_cmpswap),
"MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and "
"TFE/LWE loads",
instr.get());
}
check(instr->operands.size() == 4 || program->chip_class >= GFX10, "NSA is only supported on GFX10+", instr.get());
check(instr->operands.size() == 4 || program->chip_class >= GFX10,
"NSA is only supported on GFX10+", instr.get());
for (unsigned i = 3; i < instr->operands.size(); i++) {
if (instr->operands.size() == 4) {
check(instr->operands[i].hasRegClass() &&
instr->operands[i].regClass().type() == RegType::vgpr,
"MIMG operands[3] (VADDR) must be VGPR", instr.get());
} else {
check(instr->operands[i].regClass() == v1, "MIMG VADDR must be v1 if NSA is used", instr.get());
check(instr->operands[i].regClass() == v1, "MIMG VADDR must be v1 if NSA is used",
instr.get());
}
}
check(instr->definitions.empty() ||
(instr->definitions[0].isTemp() &&
instr->definitions[0].regClass().type() == RegType::vgpr),
"MIMG definitions[0] (VDATA) must be VGPR", instr.get());
break;
}
@ -482,31 +550,38 @@ bool validate_ir(Program* program)
"Only VGPRs are valid DS instruction operands", instr.get());
}
if (!instr->definitions.empty())
check(instr->definitions[0].getTemp().type() == RegType::vgpr, "DS instruction must return VGPR", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::vgpr,
"DS instruction must return VGPR", instr.get());
break;
}
case Format::EXP: {
for (unsigned i = 0; i < 4; i++)
check(instr->operands[i].hasRegClass() &&
instr->operands[i].regClass().type() == RegType::vgpr,
"Only VGPRs are valid Export arguments", instr.get());
break;
}
case Format::FLAT:
check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR", instr.get());
check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR",
instr.get());
FALLTHROUGH;
case Format::GLOBAL:
case Format::SCRATCH: {
check(
instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::vgpr,
"FLAT/GLOBAL/SCRATCH address must be vgpr", instr.get());
check(instr->operands[1].hasRegClass() &&
instr->operands[1].regClass().type() == RegType::sgpr,
"FLAT/GLOBAL/SCRATCH sgpr address must be undefined or sgpr", instr.get());
if (!instr->definitions.empty())
check(instr->definitions[0].getTemp().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::vgpr,
"FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get());
else
check(instr->operands[2].regClass().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get());
check(instr->operands[2].regClass().type() == RegType::vgpr,
"FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get());
break;
}
default: break;
}
}
}
@ -518,20 +593,26 @@ bool validate_ir(Program* program)
/* predecessors/successors should be sorted */
for (unsigned j = 0; j + 1 < block.linear_preds.size(); j++)
check_block(block.linear_preds[j] < block.linear_preds[j + 1], "linear predecessors must be sorted", &block);
check_block(block.linear_preds[j] < block.linear_preds[j + 1],
"linear predecessors must be sorted", &block);
for (unsigned j = 0; j + 1 < block.logical_preds.size(); j++)
check_block(block.logical_preds[j] < block.logical_preds[j + 1], "logical predecessors must be sorted", &block);
check_block(block.logical_preds[j] < block.logical_preds[j + 1],
"logical predecessors must be sorted", &block);
for (unsigned j = 0; j + 1 < block.linear_succs.size(); j++)
check_block(block.linear_succs[j] < block.linear_succs[j + 1], "linear successors must be sorted", &block);
check_block(block.linear_succs[j] < block.linear_succs[j + 1],
"linear successors must be sorted", &block);
for (unsigned j = 0; j + 1 < block.logical_succs.size(); j++)
check_block(block.logical_succs[j] < block.logical_succs[j + 1], "logical successors must be sorted", &block);
check_block(block.logical_succs[j] < block.logical_succs[j + 1],
"logical successors must be sorted", &block);
/* critical edges are not allowed */
if (block.linear_preds.size() > 1) {
for (unsigned pred : block.linear_preds)
check_block(program->blocks[pred].linear_succs.size() == 1, "linear critical edges are not allowed", &program->blocks[pred]);
check_block(program->blocks[pred].linear_succs.size() == 1,
"linear critical edges are not allowed", &program->blocks[pred]);
for (unsigned pred : block.logical_preds)
check_block(program->blocks[pred].logical_succs.size() == 1, "logical critical edges are not allowed", &program->blocks[pred]);
check_block(program->blocks[pred].logical_succs.size() == 1,
"logical critical edges are not allowed", &program->blocks[pred]);
}
}
@ -544,8 +625,8 @@ namespace {
struct Location {
Location() : block(NULL), instr(NULL) {}
Block* block;
Instruction* instr; // NULL if it's the block's live-in
};
struct Assignment {
@ -554,18 +635,20 @@ struct Assignment {
PhysReg reg;
};
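/* Helper that reports an RA validation error: it formats the message together
 * with the offending instruction and location, and returns true so callers
 * can accumulate failures with |=. */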
bool
ra_fail(Program* program, Location loc, Location loc2, const char* fmt, ...)
{
va_list args;
va_start(args, fmt);
char msg[1024];
vsprintf(msg, fmt, args);
va_end(args);
char* out;
size_t outsize;
struct u_memstream mem;
u_memstream_open(&mem, &out, &outsize);
FILE* const memf = u_memstream_get(&mem);
fprintf(memf, "RA error found at instruction in BB%d:\n", loc.block->index);
if (loc.instr) {
@ -587,7 +670,8 @@ bool ra_fail(Program *program, Location loc, Location loc2, const char *fmt, ...
return true;
}
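/* Whether a sub-dword operand may be read from the byte offset it was
 * assigned to on this chip: byte 0 is always valid, and the opcode-specific
 * cases below allow higher offsets. */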
bool
validate_subdword_operand(chip_class chip, const aco_ptr<Instruction>& instr, unsigned index)
{
Operand op = instr->operands[index];
unsigned byte = op.physReg().byte();
@ -635,14 +719,14 @@ bool validate_subdword_operand(chip_class chip, const aco_ptr<Instruction>& inst
if (byte == 2 && index == 2)
return true;
break;
default: break;
}
return byte == 0;
}
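/* Same idea for sub-dword definitions: the *_d16_hi variants shown here may
 * write to byte 2; anything not special-cased must be assigned to byte 0. */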
bool
validate_subdword_definition(chip_class chip, const aco_ptr<Instruction>& instr)
{
Definition def = instr->definitions[0];
unsigned byte = def.physReg().byte();
@ -664,16 +748,15 @@ bool validate_subdword_definition(chip_class chip, const aco_ptr<Instruction>& i
case aco_opcode::global_load_ubyte_d16_hi:
case aco_opcode::global_load_short_d16_hi:
case aco_opcode::ds_read_u8_d16_hi:
case aco_opcode::ds_read_u16_d16_hi: return byte == 2;
default: break;
}
return byte == 0;
}
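/* How many bytes of the destination register an instruction actually
 * overwrites; this can exceed def.bytes(), e.g. the d16_hi loads clobber
 * the whole dword when SRAM ECC is enabled. */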
unsigned
get_subdword_bytes_written(Program* program, const aco_ptr<Instruction>& instr, unsigned index)
{
chip_class chip = program->chip_class;
Definition def = instr->definitions[index];
@ -703,8 +786,7 @@ unsigned get_subdword_bytes_written(Program *program, const aco_ptr<Instruction>
case aco_opcode::global_load_ubyte_d16_hi:
case aco_opcode::global_load_short_d16_hi:
case aco_opcode::ds_read_u8_d16_hi:
case aco_opcode::ds_read_u16_d16_hi: return program->dev.sram_ecc_enabled ? 4 : 2;
case aco_opcode::v_mad_f16:
case aco_opcode::v_mad_u16:
case aco_opcode::v_mad_i16:
@ -714,16 +796,18 @@ unsigned get_subdword_bytes_written(Program *program, const aco_ptr<Instruction>
if (chip >= GFX9)
return 2;
break;
default: break;
}
return MAX2(chip >= GFX10 ? def.bytes() : 4,
instr_info.definition_size[(int)instr->opcode] / 8u);
}
} /* end namespace */
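/* Validate the register assignment of the whole program; only runs when the
 * DEBUG_VALIDATE_RA flag is set, and returns whether an error was found. */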
bool
validate_ra(Program* program)
{
if (!(debug_flags & DEBUG_VALIDATE_RA))
return false;
@ -754,13 +838,21 @@ bool validate_ra(Program *program) {
if (!op.isFixed())
err |= ra_fail(program, loc, Location(), "Operand %d is not assigned a register", i);
if (assignments.count(op.tempId()) && assignments[op.tempId()].reg != op.physReg())
err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an inconsistent register assignment with instruction", i);
if ((op.getTemp().type() == RegType::vgpr && op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
(op.getTemp().type() == RegType::sgpr && op.physReg() + op.size() > program->config->num_sgprs && op.physReg() < sgpr_limit))
err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an out-of-bounds register assignment", i);
err |=
ra_fail(program, loc, assignments.at(op.tempId()).firstloc,
"Operand %d has an inconsistent register assignment with instruction", i);
if ((op.getTemp().type() == RegType::vgpr &&
op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
(op.getTemp().type() == RegType::sgpr &&
op.physReg() + op.size() > program->config->num_sgprs &&
op.physReg() < sgpr_limit))
err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc,
"Operand %d has an out-of-bounds register assignment", i);
if (op.physReg() == vcc && !program->needs_vcc)
err |= ra_fail(program, loc, Location(), "Operand %d fixed to vcc but needs_vcc=false", i);
if (op.regClass().is_subdword() && !validate_subdword_operand(program->chip_class, instr, i))
err |= ra_fail(program, loc, Location(),
"Operand %d fixed to vcc but needs_vcc=false", i);
if (op.regClass().is_subdword() &&
!validate_subdword_operand(program->chip_class, instr, i))
err |= ra_fail(program, loc, Location(), "Operand %d not aligned correctly", i);
if (!assignments[op.tempId()].firstloc.block)
assignments[op.tempId()].firstloc = loc;
@ -773,15 +865,23 @@ bool validate_ra(Program *program) {
if (!def.isTemp())
continue;
if (!def.isFixed())
err |= ra_fail(program, loc, Location(), "Definition %d is not assigned a register", i);
err |=
ra_fail(program, loc, Location(), "Definition %d is not assigned a register", i);
if (assignments[def.tempId()].defloc.block)
err |= ra_fail(program, loc, assignments.at(def.tempId()).defloc, "Temporary %%%d also defined by instruction", def.tempId());
if ((def.getTemp().type() == RegType::vgpr && def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
(def.getTemp().type() == RegType::sgpr && def.physReg() + def.size() > program->config->num_sgprs && def.physReg() < sgpr_limit))
err |= ra_fail(program, loc, assignments.at(def.tempId()).firstloc, "Definition %d has an out-of-bounds register assignment", i);
err |= ra_fail(program, loc, assignments.at(def.tempId()).defloc,
"Temporary %%%d also defined by instruction", def.tempId());
if ((def.getTemp().type() == RegType::vgpr &&
def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
(def.getTemp().type() == RegType::sgpr &&
def.physReg() + def.size() > program->config->num_sgprs &&
def.physReg() < sgpr_limit))
err |= ra_fail(program, loc, assignments.at(def.tempId()).firstloc,
"Definition %d has an out-of-bounds register assignment", i);
if (def.physReg() == vcc && !program->needs_vcc)
err |= ra_fail(program, loc, Location(), "Definition %d fixed to vcc but needs_vcc=false", i);
if (def.regClass().is_subdword() && !validate_subdword_definition(program->chip_class, instr))
err |= ra_fail(program, loc, Location(),
"Definition %d fixed to vcc but needs_vcc=false", i);
if (def.regClass().is_subdword() &&
!validate_subdword_definition(program->chip_class, instr))
err |= ra_fail(program, loc, Location(), "Definition %d not aligned correctly", i);
if (!assignments[def.tempId()].firstloc.block)
assignments[def.tempId()].firstloc = loc;
@ -810,7 +910,9 @@ bool validate_ra(Program *program) {
PhysReg reg = assignments.at(tmp.id()).reg;
for (unsigned i = 0; i < tmp.bytes(); i++) {
if (regs[reg.reg_b + i]) {
err |= ra_fail(program, loc, Location(), "Assignment of element %d of %%%d already taken by %%%d in live-out", i, tmp.id(), regs[reg.reg_b + i]);
err |= ra_fail(program, loc, Location(),
"Assignment of element %d of %%%d already taken by %%%d in live-out",
i, tmp.id(), regs[reg.reg_b + i]);
}
regs[reg.reg_b + i] = tmp.id();
}
@ -826,7 +928,10 @@ bool validate_ra(Program *program) {
PhysReg reg = assignments.at(tmp.id()).reg;
for (unsigned i = 0; i < tmp.bytes(); i++) {
if (regs[reg.reg_b + i])
err |= ra_fail(program, loc, Location(), "Assignment of element %d of %%%d already taken by %%%d in live-out", i, tmp.id(), regs[reg.reg_b + i]);
err |= ra_fail(
program, loc, Location(),
"Assignment of element %d of %%%d already taken by %%%d in live-out", i,
tmp.id(), regs[reg.reg_b + i]);
}
live.emplace(tmp);
}
@ -886,16 +991,23 @@ bool validate_ra(Program *program) {
PhysReg reg = assignments.at(tmp.id()).reg;
for (unsigned j = 0; j < tmp.bytes(); j++) {
if (regs[reg.reg_b + j])
err |= ra_fail(
program, loc, assignments.at(regs[reg.reg_b + j]).defloc,
"Assignment of element %d of %%%d already taken by %%%d from instruction", i,
tmp.id(), regs[reg.reg_b + j]);
regs[reg.reg_b + j] = tmp.id();
}
if (def.regClass().is_subdword() && def.bytes() < 4) {
unsigned written = get_subdword_bytes_written(program, instr, i);
/* If written=4, the instruction still might write the upper half. In that case, it's
* the lower half that isn't preserved */
for (unsigned j = reg.byte() & ~(written - 1); j < written; j++) {
unsigned written_reg = reg.reg() * 4u + j;
if (regs[written_reg] && regs[written_reg] != def.tempId())
err |= ra_fail(program, loc, assignments.at(regs[written_reg]).defloc, "Assignment of element %d of %%%d overwrites the full register taken by %%%d from instruction", i, tmp.id(), regs[written_reg]);
err |= ra_fail(program, loc, assignments.at(regs[written_reg]).defloc,
"Assignment of element %d of %%%d overwrites the full register "
"taken by %%%d from instruction",
i, tmp.id(), regs[written_reg]);
}
}
}
@ -924,4 +1036,4 @@ bool validate_ra(Program *program) {
return err;
}
} // namespace aco