panfrost: Add a library to build CSF command streams
Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Antonino Maniscalco <antonino.maniscalco@collabora.com>
Reviewed-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26358>
commit 3b82448f47 (parent 8e303b9350)
.clang-format:
@@ -296,6 +296,7 @@ ForEachMacros:
   - bi_foreach_src
   - bi_foreach_ssa_src
   - bi_foreach_successor
+  - cs_emit
   - mir_foreach_block
   - mir_foreach_block_from
   - mir_foreach_bundle_in_block
cs_builder.h (new file):
@@ -0,0 +1,877 @@
/*
 * Copyright (C) 2022 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#pragma once

#if !defined(PAN_ARCH) || PAN_ARCH < 10
#error "cs_builder.h requires PAN_ARCH >= 10"
#endif

#include "gen_macros.h"

/*
 * cs_builder implements a builder for CSF command streams. It manages the
 * allocation and overflow behaviour of queues and provides helpers for
 * emitting commands to run on the CSF pipe.
 *
 * Users are responsible for the CS buffer allocation and must initialize the
 * command stream with an initial buffer using cs_builder_init(). The CS can
 * be extended with new buffers allocated with cs_builder_conf::alloc_buffer()
 * if the builder runs out of memory.
 */
struct cs_buffer {
   /* CPU pointer */
   uint64_t *cpu;

   /* GPU pointer */
   uint64_t gpu;

   /* Capacity in number of 64-bit instructions */
   uint32_t capacity;
};

struct cs_builder_conf {
   /* Number of 32-bit registers in the hardware register file */
   uint8_t nr_registers;

   /* Number of 32-bit registers used by the kernel at submission time */
   uint8_t nr_kernel_registers;

   /* CS buffer allocator */
   struct cs_buffer (*alloc_buffer)(void *cookie);

   /* Cookie passed back to alloc_buffer() */
   void *cookie;
};

/* The CS is formed of one or more CS chunks linked with JUMP instructions.
 * The builder keeps track of the current chunk and the position inside this
 * chunk, so it can emit new instructions and decide when a new chunk needs
 * to be allocated.
 */
struct cs_chunk {
   /* CS buffer object backing this chunk */
   struct cs_buffer buffer;

   union {
      /* Current position in the buffer object when the chunk is active. */
      uint32_t pos;

      /* Chunk size when the chunk was wrapped. */
      uint32_t size;
   };
};

struct cs_builder {
   /* CS builder configuration */
   struct cs_builder_conf conf;

   /* Initial (root) CS chunk. */
   struct cs_chunk root_chunk;

   /* Current CS chunk. */
   struct cs_chunk cur_chunk;

   /* Move-immediate instruction at the end of the previous CS chunk that
    * needs to be patched with the final length of the current CS chunk, so
    * the chunk-to-chunk JUMP executes the right amount of the new buffer.
    */
   uint32_t *length_patch;

   /* Used as temporary storage when the allocator couldn't allocate a new
    * CS chunk.
    */
   uint64_t discard_instr_slot;
};

static void
cs_builder_init(struct cs_builder *b, const struct cs_builder_conf *conf,
                struct cs_buffer root_buffer)
{
   *b = (struct cs_builder){
      .conf = *conf,
      .root_chunk.buffer = root_buffer,
      .cur_chunk.buffer = root_buffer,
   };

   /* We need at least 3 registers for CS chunk linking. Assume the kernel
    * needs at least that many too.
    */
   b->conf.nr_kernel_registers = MAX2(b->conf.nr_kernel_registers, 3);
}
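For orientation, a minimal usage sketch (not part of the commit): the driver supplies the allocator callback and seeds the builder with a root buffer. `my_device`, `my_alloc_cs_buffer()` and the sizes are hypothetical; note that cs_buffer::capacity counts 64-bit instructions, not bytes.

static struct cs_buffer
my_alloc_cs_buffer(void *cookie)
{
   struct my_device *dev = cookie;

   /* Returning an all-zero struct on failure makes the builder discard
    * subsequent instructions instead of writing through a NULL pointer.
    */
   return my_device_alloc_cs_mem(dev, 4096 /* instructions */);
}

static void
my_build_cs(struct my_device *dev)
{
   struct cs_builder_conf conf = {
      .nr_registers = 96, /* illustrative register file size */
      .nr_kernel_registers = 4,
      .alloc_buffer = my_alloc_cs_buffer,
      .cookie = dev,
   };
   struct cs_builder b;

   cs_builder_init(&b, &conf, my_alloc_cs_buffer(dev));
   /* ... emit instructions ... */
   cs_finish(&b);
}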
static bool
cs_is_valid(struct cs_builder *b)
{
   return b->cur_chunk.buffer.cpu != NULL;
}

/*
 * Wrap the current chunk. External users shouldn't call this function
 * directly; they should call cs_finish() when they are done building
 * the command stream, which will in turn call cs_wrap_chunk().
 *
 * Internally, this is also used to finalize internal CS chunks when
 * allocating new sub-chunks. See cs_alloc_ins() for details.
 *
 * This notably requires patching the previous chunk with the length
 * we ended up emitting for this chunk.
 */
static void
cs_wrap_chunk(struct cs_builder *b)
{
   if (!cs_is_valid(b))
      return;

   if (b->length_patch) {
      *b->length_patch = (b->cur_chunk.pos * 8);
      b->length_patch = NULL;
   }

   if (b->root_chunk.buffer.gpu == b->cur_chunk.buffer.gpu)
      b->root_chunk.size = b->cur_chunk.size;
}

/* Call this when you are done building a command stream and want to prepare
 * it for submission.
 */
static void
cs_finish(struct cs_builder *b)
{
   if (!cs_is_valid(b))
      return;

   cs_wrap_chunk(b);

   /* This prevents adding instructions after this point. */
   memset(&b->cur_chunk, 0, sizeof(b->cur_chunk));
}
enum cs_index_type {
   CS_INDEX_REGISTER = 0,
   CS_INDEX_UNDEF,
};

struct cs_index {
   enum cs_index_type type;

   /* Number of 32-bit words in the index, must be nonzero */
   uint8_t size;

   union {
      uint64_t imm;
      uint8_t reg;
   };
};

static inline struct cs_index
cs_undef(void)
{
   return (struct cs_index){
      .type = CS_INDEX_UNDEF,
   };
}

static inline uint8_t
cs_to_reg_tuple(struct cs_index idx, ASSERTED unsigned expected_size)
{
   assert(idx.type == CS_INDEX_REGISTER);
   assert(idx.size == expected_size);

   return idx.reg;
}

static inline uint8_t
cs_to_reg32(struct cs_index idx)
{
   return cs_to_reg_tuple(idx, 1);
}

static inline uint8_t
cs_to_reg64(struct cs_index idx)
{
   return cs_to_reg_tuple(idx, 2);
}

static inline struct cs_index
cs_reg_tuple(ASSERTED struct cs_builder *b, unsigned reg, unsigned size)
{
   assert(reg + size <= b->conf.nr_registers - b->conf.nr_kernel_registers &&
          "overflowed register file");
   assert(size <= 16 && "unsupported");

   return (struct cs_index){
      .type = CS_INDEX_REGISTER,
      .size = size,
      .reg = reg,
   };
}

static inline struct cs_index
cs_reg32(struct cs_builder *b, unsigned reg)
{
   return cs_reg_tuple(b, reg, 1);
}

static inline struct cs_index
cs_reg64(struct cs_builder *b, unsigned reg)
{
   assert((reg % 2) == 0 && "unaligned 64-bit reg");
   return cs_reg_tuple(b, reg, 2);
}

/*
 * The top of the register file is reserved for cs_builder internal use. We
 * need 3 spare registers for handling command queue overflow: a 64-bit jump
 * address and a 32-bit length. They are accessed through the helpers below.
 */
static inline uint8_t
cs_overflow_address_reg(struct cs_builder *b)
{
   return b->conf.nr_registers - 2;
}

static inline uint8_t
cs_overflow_length_reg(struct cs_builder *b)
{
   return b->conf.nr_registers - 3;
}

static inline struct cs_index
cs_extract32(struct cs_builder *b, struct cs_index idx, unsigned word)
{
   assert(idx.type == CS_INDEX_REGISTER && "unsupported");
   assert(word < idx.size && "overrun");

   return cs_reg32(b, idx.reg + word);
}
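To illustrate how the index helpers compose (register numbers are arbitrary):

struct cs_index addr = cs_reg64(&b, 48);        /* register pair r48:r49 */
struct cs_index lo = cs_extract32(&b, addr, 0); /* r48 */
struct cs_index hi = cs_extract32(&b, addr, 1); /* r49 */
struct cs_index quad = cs_reg_tuple(&b, 52, 4); /* r52..r55 */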
#define JUMP_SEQ_INSTR_COUNT 4

static inline void *
cs_alloc_ins(struct cs_builder *b)
{
   /* If an allocation failure happened before, we just discard all following
    * instructions.
    */
   if (unlikely(!b->cur_chunk.buffer.cpu))
      return &b->discard_instr_slot;

   /* If the current chunk runs out of space, allocate a new one and jump to
    * it. We actually do this a few instructions before running out, because
    * the sequence to jump to a new queue takes multiple instructions.
    */
   if (unlikely((b->cur_chunk.size + JUMP_SEQ_INSTR_COUNT) >
                b->cur_chunk.buffer.capacity)) {
      /* Now, allocate a new chunk */
      struct cs_buffer newbuf = b->conf.alloc_buffer(b->conf.cookie);

      /* Allocation failure; from now on, all new instructions will be
       * discarded.
       */
      if (unlikely(!newbuf.cpu))
         return &b->discard_instr_slot;

      uint64_t *ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);

      pan_pack(ptr, CS_MOVE, I) {
         I.destination = cs_overflow_address_reg(b);
         I.immediate = newbuf.gpu;
      }

      ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);

      pan_pack(ptr, CS_MOVE32, I) {
         I.destination = cs_overflow_length_reg(b);
      }

      /* The length will be patched in later */
      uint32_t *length_patch = (uint32_t *)ptr;

      ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);

      pan_pack(ptr, CS_JUMP, I) {
         I.length = cs_overflow_length_reg(b);
         I.address = cs_overflow_address_reg(b);
      }

      /* Now that we've emitted everything, finish up the previous queue */
      cs_wrap_chunk(b);

      /* And make this one current */
      b->length_patch = length_patch;
      b->cur_chunk.buffer = newbuf;
      b->cur_chunk.pos = 0;
   }

   assert(b->cur_chunk.size < b->cur_chunk.buffer.capacity);
   return b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);
}
/*
 * Helper to emit a new instruction into the command queue. The allocation
 * needs to be separated out because pan_pack can evaluate its argument
 * multiple times, yet cs_alloc_ins() has side effects.
 */
#define cs_emit(b, T, cfg) pan_pack(cs_alloc_ins(b), CS_##T, cfg)
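cs_emit() is used like pan_pack(): the block following the macro assigns the instruction's fields through the named packer struct. For instance, cs_move32_to() below boils down to:

cs_emit(b, MOVE32, I) {
   I.destination = cs_to_reg32(dest);
   I.immediate = imm;
}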
/* Asynchronous operations take a mask of scoreboard slots to wait on
 * before executing the instruction, and signal a scoreboard slot when
 * the operation is complete.
 *
 * A wait_mask of zero means the operation is synchronous, and signal_slot
 * is ignored in that case.
 */
struct cs_async_op {
   uint16_t wait_mask;
   uint8_t signal_slot;
};

static inline struct cs_async_op
cs_defer(unsigned wait_mask, unsigned signal_slot)
{
   /* The scoreboard slot to signal is incremented before the wait operation,
    * so waiting on it would cause an infinite wait.
    */
   assert(!(wait_mask & BITFIELD_BIT(signal_slot)));

   return (struct cs_async_op){
      .wait_mask = wait_mask,
      .signal_slot = signal_slot,
   };
}

static inline struct cs_async_op
cs_now(void)
{
   return (struct cs_async_op){
      .wait_mask = 0,
      .signal_slot = 0,
   };
}

#define cs_apply_async(I, async)                                              \
   do {                                                                       \
      I.wait_mask = async.wait_mask;                                          \
      I.signal_slot = async.signal_slot;                                      \
   } while (0)
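As an example, the calls below (using cs_finish_fragment(), defined further down, with hypothetical register operands) emit the same instruction first synchronously, then deferred so that it waits on scoreboard slot 0 and signals slot 2 on completion; the slot numbers are arbitrary:

cs_finish_fragment(&b, true, first_chunk, last_chunk, cs_now());
cs_finish_fragment(&b, true, first_chunk, last_chunk,
                   cs_defer(BITFIELD_BIT(0), 2));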
static inline void
cs_move32_to(struct cs_builder *b, struct cs_index dest, unsigned imm)
{
   cs_emit(b, MOVE32, I) {
      I.destination = cs_to_reg32(dest);
      I.immediate = imm;
   }
}

static inline void
cs_move48_to(struct cs_builder *b, struct cs_index dest, uint64_t imm)
{
   cs_emit(b, MOVE, I) {
      I.destination = cs_to_reg64(dest);
      I.immediate = imm;
   }
}

static inline void
cs_move64_to(struct cs_builder *b, struct cs_index dest, uint64_t imm)
{
   if (imm < (1ull << 48)) {
      /* Zero extends */
      cs_move48_to(b, dest, imm);
   } else {
      cs_move32_to(b, cs_extract32(b, dest, 0), imm);
      cs_move32_to(b, cs_extract32(b, dest, 1), imm >> 32);
   }
}
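In other words, anything that fits in 48 bits costs a single MOVE, while a full 64-bit immediate is split into two MOVE32s. GPU virtual addresses fall in the first category, which is what cs_alloc_ins() above relies on when it loads the next chunk's address with a single CS_MOVE:

cs_move64_to(&b, cs_reg64(&b, 10), buf.gpu);               /* one MOVE */
cs_move64_to(&b, cs_reg64(&b, 12), 0xdeadbeefcafebabeull); /* two MOVE32s */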
static inline void
cs_wait_slots(struct cs_builder *b, unsigned wait_mask, bool progress_inc)
{
   cs_emit(b, WAIT, I) {
      I.wait_mask = wait_mask;
      I.progress_increment = progress_inc;
   }
}

static inline void
cs_wait_slot(struct cs_builder *b, unsigned slot, bool progress_inc)
{
   assert(slot < 8 && "invalid slot");

   cs_wait_slots(b, BITFIELD_BIT(slot), progress_inc);
}
struct cs_shader_res_sel {
   uint8_t srt, fau, spd, tsd;
};

static inline struct cs_shader_res_sel
cs_shader_res_sel(unsigned srt, unsigned fau, unsigned spd, unsigned tsd)
{
   return (struct cs_shader_res_sel){
      .srt = srt,
      .fau = fau,
      .spd = spd,
      .tsd = tsd,
   };
}

static inline void
cs_run_compute(struct cs_builder *b, unsigned task_increment,
               enum mali_task_axis task_axis, bool progress_inc,
               struct cs_shader_res_sel res_sel)
{
   cs_emit(b, RUN_COMPUTE, I) {
      I.task_increment = task_increment;
      I.task_axis = task_axis;
      I.progress_increment = progress_inc;
      I.srt_select = res_sel.srt;
      I.spd_select = res_sel.spd;
      I.tsd_select = res_sel.tsd;
      I.fau_select = res_sel.fau;
   }
}

static inline void
cs_run_tiling(struct cs_builder *b, uint32_t flags_override, bool progress_inc,
              struct cs_shader_res_sel res_sel)
{
   cs_emit(b, RUN_TILING, I) {
      I.flags_override = flags_override;
      I.progress_increment = progress_inc;
      I.srt_select = res_sel.srt;
      I.spd_select = res_sel.spd;
      I.tsd_select = res_sel.tsd;
      I.fau_select = res_sel.fau;
   }
}

static inline void
cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool progress_inc,
            bool malloc_enable, struct cs_shader_res_sel varying_sel,
            struct cs_shader_res_sel frag_sel, struct cs_index draw_id)
{
   cs_emit(b, RUN_IDVS, I) {
      I.flags_override = flags_override;
      I.progress_increment = progress_inc;
      I.malloc_enable = malloc_enable;

      if (draw_id.type == CS_INDEX_UNDEF) {
         I.draw_id_register_enable = false;
      } else {
         I.draw_id_register_enable = true;
         I.draw_id = cs_to_reg32(draw_id);
      }

      assert(varying_sel.spd == 1);
      assert(varying_sel.fau == 0 || varying_sel.fau == 1);
      assert(varying_sel.srt == 0 || varying_sel.srt == 1);
      assert(varying_sel.tsd == 0 || varying_sel.tsd == 1);
      I.varying_fau_select = varying_sel.fau == 1;
      I.varying_srt_select = varying_sel.srt == 1;
      I.varying_tsd_select = varying_sel.tsd == 1;

      assert(frag_sel.spd == 2);
      assert(frag_sel.fau == 2);
      assert(frag_sel.srt == 2 || frag_sel.srt == 0);
      assert(frag_sel.tsd == 2 || frag_sel.tsd == 0);
      I.fragment_srt_select = frag_sel.srt == 2;
      I.fragment_tsd_select = frag_sel.tsd == 2;
   }
}
static inline void
cs_run_fragment(struct cs_builder *b, bool enable_tem,
                enum mali_tile_render_order tile_order, bool progress_inc)
{
   cs_emit(b, RUN_FRAGMENT, I) {
      I.enable_tem = enable_tem;
      I.tile_order = tile_order;
      I.progress_increment = progress_inc;
   }
}

static inline void
cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
                  bool progress_inc, struct cs_index dcd)
{
   cs_emit(b, RUN_FULLSCREEN, I) {
      I.flags_override = flags_override;
      I.progress_increment = progress_inc;
      I.dcd = cs_to_reg64(dcd);
   }
}

static inline void
cs_finish_tiling(struct cs_builder *b, bool progress_inc)
{
   cs_emit(b, FINISH_TILING, I)
      I.progress_increment = progress_inc;
}

static inline void
cs_finish_fragment(struct cs_builder *b, bool increment_frag_completed,
                   struct cs_index first_free_heap_chunk,
                   struct cs_index last_free_heap_chunk,
                   struct cs_async_op async)
{
   cs_emit(b, FINISH_FRAGMENT, I) {
      I.increment_fragment_completed = increment_frag_completed;
      cs_apply_async(I, async);
      I.first_heap_chunk = cs_to_reg64(first_free_heap_chunk);
      I.last_heap_chunk = cs_to_reg64(last_free_heap_chunk);
   }
}
static inline void
cs_add32(struct cs_builder *b, struct cs_index dest, struct cs_index src,
         unsigned imm)
{
   cs_emit(b, ADD_IMMEDIATE32, I) {
      I.destination = cs_to_reg32(dest);
      I.source = cs_to_reg32(src);
      I.immediate = imm;
   }
}

static inline void
cs_add64(struct cs_builder *b, struct cs_index dest, struct cs_index src,
         unsigned imm)
{
   cs_emit(b, ADD_IMMEDIATE64, I) {
      I.destination = cs_to_reg64(dest);
      I.source = cs_to_reg64(src);
      I.immediate = imm;
   }
}

static inline void
cs_umin32(struct cs_builder *b, struct cs_index dest, struct cs_index src1,
          struct cs_index src2)
{
   cs_emit(b, UMIN32, I) {
      I.destination = cs_to_reg32(dest);
      I.source_1 = cs_to_reg32(src1);
      I.source_2 = cs_to_reg32(src2);
   }
}

static inline void
cs_load_to(struct cs_builder *b, struct cs_index dest, struct cs_index address,
           unsigned mask, int offset)
{
   cs_emit(b, LOAD_MULTIPLE, I) {
      I.base_register = cs_to_reg_tuple(dest, util_bitcount(mask));
      I.address = cs_to_reg64(address);
      I.mask = mask;
      I.offset = offset;
   }
}

static inline void
cs_load32_to(struct cs_builder *b, struct cs_index dest,
             struct cs_index address, int offset)
{
   cs_load_to(b, dest, address, BITFIELD_MASK(1), offset);
}

static inline void
cs_load64_to(struct cs_builder *b, struct cs_index dest,
             struct cs_index address, int offset)
{
   cs_load_to(b, dest, address, BITFIELD_MASK(2), offset);
}

static inline void
cs_store(struct cs_builder *b, struct cs_index data, struct cs_index address,
         unsigned mask, int offset)
{
   cs_emit(b, STORE_MULTIPLE, I) {
      I.base_register = cs_to_reg_tuple(data, util_bitcount(mask));
      I.address = cs_to_reg64(address);
      I.mask = mask;
      I.offset = offset;
   }
}

static inline void
cs_store32(struct cs_builder *b, struct cs_index data, struct cs_index address,
           int offset)
{
   cs_store(b, data, address, BITFIELD_MASK(1), offset);
}

static inline void
cs_store64(struct cs_builder *b, struct cs_index data, struct cs_index address,
           int offset)
{
   cs_store(b, data, address, BITFIELD_MASK(2), offset);
}
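The mask selects which registers relative to the base take part in the transfer, and the tuple size must match the popcount of the mask. A contiguous four-register transfer might look like this (register numbers and offsets are arbitrary, `addr` a 64-bit register index):

struct cs_index vals = cs_reg_tuple(&b, 20, 4);
cs_load_to(&b, vals, addr, BITFIELD_MASK(4), 0); /* r20..r23 <- [addr+0..15] */
cs_store(&b, vals, addr, BITFIELD_MASK(4), 16);  /* [addr+16..31] <- r20..r23 */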
static inline void
cs_branch(struct cs_builder *b, int offset, enum mali_cs_condition cond,
          struct cs_index val)
{
   cs_emit(b, BRANCH, I) {
      I.offset = offset;
      I.condition = cond;
      I.value = cs_to_reg32(val);
   }
}

/*
 * Select which scoreboard entry will track endpoint tasks and other tasks
 * respectively. Pass the slot to cs_wait_slot() to wait on it later.
 */
static inline void
cs_set_scoreboard_entry(struct cs_builder *b, unsigned ep, unsigned other)
{
   assert(ep < 8 && "invalid slot");
   assert(other < 8 && "invalid slot");

   cs_emit(b, SET_SB_ENTRY, I) {
      I.endpoint_entry = ep;
      I.other_entry = other;
   }
}
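Tying these together, a typical job selects its scoreboard slot up front, kicks the work, then waits on that slot. A rough sketch (slot 2 and the resource selectors are arbitrary, and MALI_TASK_AXIS_X is assumed to be the genxml spelling of the X axis):

cs_set_scoreboard_entry(&b, 2, 0);
cs_run_compute(&b, 1, MALI_TASK_AXIS_X, false,
               cs_shader_res_sel(0, 0, 0, 0));
cs_wait_slot(&b, 2, false);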
static inline void
cs_progress_wait(struct cs_builder *b, unsigned queue, struct cs_index ref)
{
   cs_emit(b, PROGRESS_WAIT, I) {
      I.source = cs_to_reg64(ref);
      I.queue = queue;
   }
}

static inline void
cs_set_exception_handler(struct cs_builder *b,
                         enum mali_cs_exception_type exception_type,
                         struct cs_index address, struct cs_index length)
{
   cs_emit(b, SET_EXCEPTION_HANDLER, I) {
      I.exception_type = exception_type;
      I.address = cs_to_reg64(address);
      I.length = cs_to_reg32(length);
   }
}

static inline void
cs_call(struct cs_builder *b, struct cs_index address, struct cs_index length)
{
   cs_emit(b, CALL, I) {
      I.address = cs_to_reg64(address);
      I.length = cs_to_reg32(length);
   }
}

static inline void
cs_jump(struct cs_builder *b, struct cs_index address, struct cs_index length)
{
   cs_emit(b, JUMP, I) {
      I.address = cs_to_reg64(address);
      I.length = cs_to_reg32(length);
   }
}

enum cs_res_id {
   CS_COMPUTE_RES = BITFIELD_BIT(0),
   CS_FRAG_RES = BITFIELD_BIT(1),
   CS_TILER_RES = BITFIELD_BIT(2),
   CS_IDVS_RES = BITFIELD_BIT(3),
};

static inline void
cs_req_res(struct cs_builder *b, uint32_t res_mask)
{
   cs_emit(b, REQ_RESOURCE, I) {
      I.compute = res_mask & CS_COMPUTE_RES;
      I.tiler = res_mask & CS_TILER_RES;
      I.idvs = res_mask & CS_IDVS_RES;
      I.fragment = res_mask & CS_FRAG_RES;
   }
}
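Endpoint resources have to be requested before the matching RUN_* instruction; presumably they are dropped again by requesting an empty mask once the runs are done, e.g.:

cs_req_res(&b, CS_COMPUTE_RES);
/* ... cs_run_compute() ... */
cs_req_res(&b, 0);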
static inline void
cs_flush_caches(struct cs_builder *b, enum mali_cs_flush_mode l2,
                enum mali_cs_flush_mode lsc, bool other_inv,
                struct cs_index flush_id, struct cs_async_op async)
{
   cs_emit(b, FLUSH_CACHE2, I) {
      I.l2_flush_mode = l2;
      I.lsc_flush_mode = lsc;
      I.other_invalidate = other_inv;
      I.latest_flush_id = cs_to_reg32(flush_id);
      cs_apply_async(I, async);
   }
}

#define CS_SYNC_OPS(__cnt_width)                                              \
   static inline void cs_sync##__cnt_width##_set(                            \
      struct cs_builder *b, bool propagate_error,                            \
      enum mali_cs_sync_scope scope, struct cs_index val,                    \
      struct cs_index addr, struct cs_async_op async)                        \
   {                                                                         \
      cs_emit(b, SYNC_SET##__cnt_width, I) {                                 \
         I.error_propagate = propagate_error;                                \
         I.scope = scope;                                                    \
         I.data = cs_to_reg##__cnt_width(val);                               \
         I.address = cs_to_reg64(addr);                                      \
         cs_apply_async(I, async);                                           \
      }                                                                      \
   }                                                                         \
                                                                             \
   static inline void cs_sync##__cnt_width##_add(                            \
      struct cs_builder *b, bool propagate_error,                            \
      enum mali_cs_sync_scope scope, struct cs_index val,                    \
      struct cs_index addr, struct cs_async_op async)                        \
   {                                                                         \
      cs_emit(b, SYNC_ADD##__cnt_width, I) {                                 \
         I.error_propagate = propagate_error;                                \
         I.scope = scope;                                                    \
         I.data = cs_to_reg##__cnt_width(val);                               \
         I.address = cs_to_reg64(addr);                                      \
         cs_apply_async(I, async);                                           \
      }                                                                      \
   }                                                                         \
                                                                             \
   static inline void cs_sync##__cnt_width##_wait(                           \
      struct cs_builder *b, bool reject_error, enum mali_cs_condition cond,  \
      struct cs_index ref, struct cs_index addr)                             \
   {                                                                         \
      assert(cond == MALI_CS_CONDITION_LEQUAL ||                             \
             cond == MALI_CS_CONDITION_GREATER);                             \
      cs_emit(b, SYNC_WAIT##__cnt_width, I) {                                \
         I.error_reject = reject_error;                                      \
         I.condition = cond;                                                 \
         I.data = cs_to_reg##__cnt_width(ref);                               \
         I.address = cs_to_reg64(addr);                                      \
      }                                                                      \
   }

CS_SYNC_OPS(32)
CS_SYNC_OPS(64)
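The macro stamps out cs_sync32_set/add/wait and cs_sync64_set/add/wait. A sketch of a 32-bit signal and the matching wait, assuming MALI_CS_SYNC_SCOPE_SYSTEM is a valid genxml scope and with `val`, `ref` and `addr` in hypothetical registers:

cs_move32_to(&b, val, 1);
cs_sync32_add(&b, true, MALI_CS_SYNC_SCOPE_SYSTEM, val, addr, cs_now());

/* On the waiting side: block until the syncobj value exceeds ref. */
cs_sync32_wait(&b, false, MALI_CS_CONDITION_GREATER, ref, addr);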
static inline void
cs_store_state(struct cs_builder *b, struct cs_index address, int offset,
               enum mali_cs_state state, struct cs_async_op async)
{
   cs_emit(b, STORE_STATE, I) {
      I.offset = offset;
      I.state = state;
      I.address = cs_to_reg64(address);
      cs_apply_async(I, async);
   }
}

static inline void
cs_prot_region(struct cs_builder *b, unsigned size)
{
   cs_emit(b, PROT_REGION, I) {
      I.size = size;
   }
}

static inline void
cs_progress_store(struct cs_builder *b, struct cs_index src)
{
   cs_emit(b, PROGRESS_STORE, I)
      I.source = cs_to_reg64(src);
}

static inline void
cs_progress_load(struct cs_builder *b, struct cs_index dst)
{
   cs_emit(b, PROGRESS_LOAD, I)
      I.destination = cs_to_reg64(dst);
}

static inline void
cs_run_compute_indirect(struct cs_builder *b, unsigned wg_per_task,
                        bool progress_inc, struct cs_shader_res_sel res_sel)
{
   cs_emit(b, RUN_COMPUTE_INDIRECT, I) {
      I.workgroups_per_task = wg_per_task;
      I.progress_increment = progress_inc;
      I.srt_select = res_sel.srt;
      I.spd_select = res_sel.spd;
      I.tsd_select = res_sel.tsd;
      I.fau_select = res_sel.fau;
   }
}

static inline void
cs_error_barrier(struct cs_builder *b)
{
   cs_emit(b, ERROR_BARRIER, _)
      ;
}

static inline void
cs_heap_set(struct cs_builder *b, struct cs_index address)
{
   cs_emit(b, HEAP_SET, I) {
      I.address = cs_to_reg64(address);
   }
}

static inline void
cs_heap_operation(struct cs_builder *b, enum mali_cs_heap_operation operation,
                  struct cs_async_op async)
{
   cs_emit(b, HEAP_OPERATION, I) {
      I.operation = operation;
      cs_apply_async(I, async);
   }
}

static inline void
cs_vt_start(struct cs_builder *b, struct cs_async_op async)
{
   cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, async);
}

static inline void
cs_vt_end(struct cs_builder *b, struct cs_async_op async)
{
   cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_COMPLETED, async);
}

static inline void
cs_frag_end(struct cs_builder *b, struct cs_async_op async)
{
   cs_heap_operation(b, MALI_CS_HEAP_OPERATION_FRAGMENT_COMPLETED, async);
}

static inline void
cs_trace_point(struct cs_builder *b, struct cs_index regs,
               struct cs_async_op async)
{
   cs_emit(b, TRACE_POINT, I) {
      I.base_register = cs_to_reg_tuple(regs, regs.size);
      I.register_count = regs.size;
      cs_apply_async(I, async);
   }
}