From 3b82448f4751fdad028ab74669768aef7f188994 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Wed, 5 Jul 2023 11:07:04 +0200 Subject: [PATCH] panfrost: Add a library to build CSF command streams Signed-off-by: Alyssa Rosenzweig Reviewed-by: Antonino Maniscalco Reviewed-by: Erik Faye-Lund Part-of: --- src/.clang-format | 1 + src/panfrost/lib/genxml/cs_builder.h | 877 +++++++++++++++++++++++++++ 2 files changed, 878 insertions(+) create mode 100644 src/panfrost/lib/genxml/cs_builder.h diff --git a/src/.clang-format b/src/.clang-format index badf59ab108e4..b788618523843 100644 --- a/src/.clang-format +++ b/src/.clang-format @@ -296,6 +296,7 @@ ForEachMacros: - bi_foreach_src - bi_foreach_ssa_src - bi_foreach_successor + - cs_emit - mir_foreach_block - mir_foreach_block_from - mir_foreach_bundle_in_block diff --git a/src/panfrost/lib/genxml/cs_builder.h b/src/panfrost/lib/genxml/cs_builder.h new file mode 100644 index 0000000000000..17c1958120a71 --- /dev/null +++ b/src/panfrost/lib/genxml/cs_builder.h @@ -0,0 +1,877 @@ +/* + * Copyright (C) 2022 Collabora Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#if !defined(PAN_ARCH) || PAN_ARCH < 10 +#error "cs_builder.h requires PAN_ARCH >= 10" +#endif + +#include "gen_macros.h" + +/* + * cs_builder implements a builder for CSF command streams. It manages the + * allocation and overflow behaviour of queues and provides helpers for emitting + * commands to run on the CSF pipe. + * + * Users are responsible for the CS buffer allocation and must initialize the + * command stream with an initial buffer using cs_builder_init(). The CS can + * be extended with new buffers allocated with cs_builder_conf::alloc_buffer() + * if the builder runs out of memory. + */ + +struct cs_buffer { + /* CPU pointer */ + uint64_t *cpu; + + /* GPU pointer */ + uint64_t gpu; + + /* Capacity in number of 64-bit instructions */ + uint32_t capacity; +}; + +struct cs_builder_conf { + /* Number of 32-bit registers in the hardware register file */ + uint8_t nr_registers; + + /* Number of 32-bit registers used by the kernel at submission time */ + uint8_t nr_kernel_registers; + + /* CS buffer allocator */ + struct cs_buffer (*alloc_buffer)(void *cookie); + + /* Cookie passed back to alloc_buffer() */ + void *cookie; +}; + +/* The CS is formed of one or more CS chunks linked with JUMP instructions. 
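+ * When the active chunk is close to full, cs_alloc_ins() emits a MOVE of the
+ * next chunk's GPU address, a MOVE32 for its length (patched once the chunk is
+ * wrapped) and a JUMP, chaining in a freshly allocated chunk.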
+ * The builder keeps track of the current chunk and the position inside this
+ * chunk, so it can emit new instructions, and decide when a new chunk needs
+ * to be allocated.
+ */
+struct cs_chunk {
+   /* CS buffer object backing this chunk */
+   struct cs_buffer buffer;
+
+   union {
+      /* Current position in the buffer object when the chunk is active. */
+      uint32_t pos;
+
+      /* Chunk size when the chunk was wrapped. */
+      uint32_t size;
+   };
+};
+
+struct cs_builder {
+   /* CS builder configuration */
+   struct cs_builder_conf conf;
+
+   /* Initial (root) CS chunk. */
+   struct cs_chunk root_chunk;
+
+   /* Current CS chunk. */
+   struct cs_chunk cur_chunk;
+
+   /* Move immediate instruction at the end of the last CS chunk that needs to
+    * be patched with the final length of the current CS chunk in order to
+    * facilitate correct overflow behaviour.
+    */
+   uint32_t *length_patch;
+
+   /* Used as temporary storage when the allocator couldn't allocate a new
+    * CS chunk.
+    */
+   uint64_t discard_instr_slot;
+};
+
+static void
+cs_builder_init(struct cs_builder *b, const struct cs_builder_conf *conf,
+                struct cs_buffer root_buffer)
+{
+   *b = (struct cs_builder){
+      .conf = *conf,
+      .root_chunk.buffer = root_buffer,
+      .cur_chunk.buffer = root_buffer,
+   };
+
+   /* We need at least 3 registers for CS chunk linking. Assume the kernel needs
+    * at least that too.
+    */
+   b->conf.nr_kernel_registers = MAX2(b->conf.nr_kernel_registers, 3);
+}
+
+static bool
+cs_is_valid(struct cs_builder *b)
+{
+   return b->cur_chunk.buffer.cpu != NULL;
+}
+
+/*
+ * Wrap the current chunk. External users shouldn't call this function
+ * directly, they should call cs_finish() when they are done building
+ * the command stream, which will in turn call cs_wrap_chunk().
+ *
+ * Internally, this is also used to finalize internal CS chunks when
+ * allocating new sub-chunks. See cs_alloc_ins() for details.
+ *
+ * This notably requires patching the previous chunk with the length
+ * we ended up emitting for this chunk.
+ */
+static void
+cs_wrap_chunk(struct cs_builder *b)
+{
+   if (!cs_is_valid(b))
+      return;
+
+   if (b->length_patch) {
+      *b->length_patch = (b->cur_chunk.pos * 8);
+      b->length_patch = NULL;
+   }
+
+   if (b->root_chunk.buffer.gpu == b->cur_chunk.buffer.gpu)
+      b->root_chunk.size = b->cur_chunk.size;
+}
+
+/* Call this when you are done building a command stream and want to prepare
+ * it for submission.
+ */
+static void
+cs_finish(struct cs_builder *b)
+{
+   if (!cs_is_valid(b))
+      return;
+
+   cs_wrap_chunk(b);
+
+   /* This prevents adding instructions after that point.
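+    * Subsequent cs_emit()/cs_alloc_ins() calls will see a NULL chunk buffer
+    * and quietly write to the discard slot instead.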
*/ + memset(&b->cur_chunk, 0, sizeof(b->cur_chunk)); +} + +enum cs_index_type { + CS_INDEX_REGISTER = 0, + CS_INDEX_UNDEF, +}; + +struct cs_index { + enum cs_index_type type; + + /* Number of 32-bit words in the index, must be nonzero */ + uint8_t size; + + union { + uint64_t imm; + uint8_t reg; + }; +}; + +static inline struct cs_index +cs_undef(void) +{ + return (struct cs_index){ + .type = CS_INDEX_UNDEF, + }; +} + +static inline uint8_t +cs_to_reg_tuple(struct cs_index idx, ASSERTED unsigned expected_size) +{ + assert(idx.type == CS_INDEX_REGISTER); + assert(idx.size == expected_size); + + return idx.reg; +} + +static inline uint8_t +cs_to_reg32(struct cs_index idx) +{ + return cs_to_reg_tuple(idx, 1); +} + +static inline uint8_t +cs_to_reg64(struct cs_index idx) +{ + return cs_to_reg_tuple(idx, 2); +} + +static inline struct cs_index +cs_reg_tuple(ASSERTED struct cs_builder *b, unsigned reg, unsigned size) +{ + assert(reg + size <= b->conf.nr_registers - b->conf.nr_kernel_registers && + "overflowed register file"); + assert(size <= 16 && "unsupported"); + + return (struct cs_index){ + .type = CS_INDEX_REGISTER, + .size = size, + .reg = reg, + }; +} + +static inline struct cs_index +cs_reg32(struct cs_builder *b, unsigned reg) +{ + return cs_reg_tuple(b, reg, 1); +} + +static inline struct cs_index +cs_reg64(struct cs_builder *b, unsigned reg) +{ + assert((reg % 2) == 0 && "unaligned 64-bit reg"); + return cs_reg_tuple(b, reg, 2); +} + +/* + * The top of the register file is reserved for cs_builder internal use. We + * need 3 spare registers for handling command queue overflow. These are + * available here. + */ +static inline uint8_t +cs_overflow_address_reg(struct cs_builder *b) +{ + return b->conf.nr_registers - 2; +} + +static inline uint8_t +cs_overflow_length_reg(struct cs_builder *b) +{ + return b->conf.nr_registers - 3; +} + +static inline struct cs_index +cs_extract32(struct cs_builder *b, struct cs_index idx, unsigned word) +{ + assert(idx.type == CS_INDEX_REGISTER && "unsupported"); + assert(word < idx.size && "overrun"); + + return cs_reg32(b, idx.reg + word); +} + +#define JUMP_SEQ_INSTR_COUNT 4 + +static inline void * +cs_alloc_ins(struct cs_builder *b) +{ + /* If an allocation failure happened before, we just discard all following + * instructions. + */ + if (unlikely(!b->cur_chunk.buffer.cpu)) + return &b->discard_instr_slot; + + /* If the current chunk runs out of space, allocate a new one and jump to it. + * We actually do this a few instructions before running out, because the + * sequence to jump to a new queue takes multiple instructions. + */ + if (unlikely((b->cur_chunk.size + JUMP_SEQ_INSTR_COUNT) > + b->cur_chunk.buffer.capacity)) { + /* Now, allocate a new chunk */ + struct cs_buffer newbuf = b->conf.alloc_buffer(b->conf.cookie); + + /* Allocation failure, from now on, all new instructions will be + * discarded. 
+ */ + if (unlikely(!b->cur_chunk.buffer.cpu)) + return &b->discard_instr_slot; + + uint64_t *ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++); + + pan_pack(ptr, CS_MOVE, I) { + I.destination = cs_overflow_address_reg(b); + I.immediate = newbuf.gpu; + } + + ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++); + + pan_pack(ptr, CS_MOVE32, I) { + I.destination = cs_overflow_length_reg(b); + } + + /* The length will be patched in later */ + uint32_t *length_patch = (uint32_t *)ptr; + + ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++); + + pan_pack(ptr, CS_JUMP, I) { + I.length = cs_overflow_length_reg(b); + I.address = cs_overflow_address_reg(b); + } + + /* Now that we've emitted everything, finish up the previous queue */ + cs_wrap_chunk(b); + + /* And make this one current */ + b->length_patch = length_patch; + b->cur_chunk.buffer = newbuf; + b->cur_chunk.pos = 0; + } + + assert(b->cur_chunk.size < b->cur_chunk.buffer.capacity); + return b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++); +} + +/* + * Helper to emit a new instruction into the command queue. The allocation needs + * to be separated out being pan_pack can evaluate its argument multiple times, + * yet cs_alloc has side effects. + */ +#define cs_emit(b, T, cfg) pan_pack(cs_alloc_ins(b), CS_##T, cfg) + +/* Asynchronous operations take a mask of scoreboard slots to wait on + * before executing the instruction, and signal a scoreboard slot when + * the operation is complete. + * A wait_mask of zero means the operation is synchronous, and signal_slot + * is ignored in that case. + */ +struct cs_async_op { + uint16_t wait_mask; + uint8_t signal_slot; +}; + +static inline struct cs_async_op +cs_defer(unsigned wait_mask, unsigned signal_slot) +{ + /* The scoreboard slot to signal is incremented before the wait operation, + * waiting on it would cause an infinite wait. 
+ */ + assert(!(wait_mask & BITFIELD_BIT(signal_slot))); + + return (struct cs_async_op){ + .wait_mask = wait_mask, + .signal_slot = signal_slot, + }; +} + +static inline struct cs_async_op +cs_now(void) +{ + return (struct cs_async_op){ + .wait_mask = 0, + .signal_slot = 0, + }; +} + +#define cs_apply_async(I, async) \ + do { \ + I.wait_mask = async.wait_mask; \ + I.signal_slot = async.signal_slot; \ + } while (0) + +static inline void +cs_move32_to(struct cs_builder *b, struct cs_index dest, unsigned imm) +{ + cs_emit(b, MOVE32, I) { + I.destination = cs_to_reg32(dest); + I.immediate = imm; + } +} + +static inline void +cs_move48_to(struct cs_builder *b, struct cs_index dest, uint64_t imm) +{ + cs_emit(b, MOVE, I) { + I.destination = cs_to_reg64(dest); + I.immediate = imm; + } +} + +static inline void +cs_move64_to(struct cs_builder *b, struct cs_index dest, uint64_t imm) +{ + if (imm < (1ull << 48)) { + /* Zero extends */ + cs_move48_to(b, dest, imm); + } else { + cs_move32_to(b, cs_extract32(b, dest, 0), imm); + cs_move32_to(b, cs_extract32(b, dest, 1), imm >> 32); + } +} + +static inline void +cs_wait_slots(struct cs_builder *b, unsigned wait_mask, bool progress_inc) +{ + cs_emit(b, WAIT, I) { + I.wait_mask = wait_mask; + I.progress_increment = progress_inc; + } +} + +static inline void +cs_wait_slot(struct cs_builder *b, unsigned slot, bool progress_inc) +{ + assert(slot < 8 && "invalid slot"); + + cs_wait_slots(b, BITFIELD_BIT(slot), progress_inc); +} + +struct cs_shader_res_sel { + uint8_t srt, fau, spd, tsd; +}; + +static inline struct cs_shader_res_sel +cs_shader_res_sel(unsigned srt, unsigned fau, unsigned spd, unsigned tsd) +{ + return (struct cs_shader_res_sel){ + .srt = srt, + .fau = fau, + .spd = spd, + .tsd = tsd, + }; +} + +static inline void +cs_run_compute(struct cs_builder *b, unsigned task_increment, + enum mali_task_axis task_axis, bool progress_inc, + struct cs_shader_res_sel res_sel) +{ + cs_emit(b, RUN_COMPUTE, I) { + I.task_increment = task_increment; + I.task_axis = task_axis; + I.progress_increment = progress_inc; + I.srt_select = res_sel.srt; + I.spd_select = res_sel.spd; + I.tsd_select = res_sel.tsd; + I.fau_select = res_sel.fau; + } +} + +static inline void +cs_run_tiling(struct cs_builder *b, uint32_t flags_override, bool progress_inc, + struct cs_shader_res_sel res_sel) +{ + cs_emit(b, RUN_TILING, I) { + I.flags_override = flags_override; + I.progress_increment = progress_inc; + I.srt_select = res_sel.srt; + I.spd_select = res_sel.spd; + I.tsd_select = res_sel.tsd; + I.fau_select = res_sel.fau; + } +} + +static inline void +cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool progress_inc, + bool malloc_enable, struct cs_shader_res_sel varying_sel, + struct cs_shader_res_sel frag_sel, struct cs_index draw_id) +{ + cs_emit(b, RUN_IDVS, I) { + I.flags_override = flags_override; + I.progress_increment = progress_inc; + I.malloc_enable = malloc_enable; + + if (draw_id.type == CS_INDEX_UNDEF) { + I.draw_id_register_enable = false; + } else { + I.draw_id_register_enable = true; + I.draw_id = cs_to_reg32(draw_id); + } + + assert(varying_sel.spd == 1); + assert(varying_sel.fau == 0 || varying_sel.fau == 1); + assert(varying_sel.srt == 0 || varying_sel.srt == 1); + assert(varying_sel.tsd == 0 || varying_sel.tsd == 1); + I.varying_fau_select = varying_sel.fau == 1; + I.varying_srt_select = varying_sel.srt == 1; + I.varying_tsd_select = varying_sel.tsd == 1; + + assert(frag_sel.spd == 2); + assert(frag_sel.fau == 2); + assert(frag_sel.srt == 2 || 
frag_sel.srt == 0); + assert(frag_sel.tsd == 2 || frag_sel.tsd == 0); + I.fragment_srt_select = frag_sel.srt == 2; + I.fragment_tsd_select = frag_sel.tsd == 2; + } +} + +static inline void +cs_run_fragment(struct cs_builder *b, bool enable_tem, + enum mali_tile_render_order tile_order, bool progress_inc) +{ + cs_emit(b, RUN_FRAGMENT, I) { + I.enable_tem = enable_tem; + I.tile_order = tile_order; + I.progress_increment = progress_inc; + } +} + +static inline void +cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override, + bool progress_inc, struct cs_index dcd) +{ + cs_emit(b, RUN_FULLSCREEN, I) { + I.flags_override = flags_override; + I.progress_increment = progress_inc; + I.dcd = cs_to_reg64(dcd); + } +} + +static inline void +cs_finish_tiling(struct cs_builder *b, bool progress_inc) +{ + cs_emit(b, FINISH_TILING, I) + I.progress_increment = progress_inc; +} + +static inline void +cs_finish_fragment(struct cs_builder *b, bool increment_frag_completed, + struct cs_index first_free_heap_chunk, + struct cs_index last_free_heap_chunk, + struct cs_async_op async) +{ + cs_emit(b, FINISH_FRAGMENT, I) { + I.increment_fragment_completed = increment_frag_completed; + cs_apply_async(I, async); + I.first_heap_chunk = cs_to_reg64(first_free_heap_chunk); + I.last_heap_chunk = cs_to_reg64(last_free_heap_chunk); + } +} + +static inline void +cs_add32(struct cs_builder *b, struct cs_index dest, struct cs_index src, + unsigned imm) +{ + cs_emit(b, ADD_IMMEDIATE32, I) { + I.destination = cs_to_reg32(dest); + I.source = cs_to_reg32(src); + I.immediate = imm; + } +} + +static inline void +cs_add64(struct cs_builder *b, struct cs_index dest, struct cs_index src, + unsigned imm) +{ + cs_emit(b, ADD_IMMEDIATE64, I) { + I.destination = cs_to_reg64(dest); + I.source = cs_to_reg64(src); + I.immediate = imm; + } +} + +static inline void +cs_umin32(struct cs_builder *b, struct cs_index dest, struct cs_index src1, + struct cs_index src2) +{ + cs_emit(b, UMIN32, I) { + I.destination = cs_to_reg32(dest); + I.source_1 = cs_to_reg32(src1); + I.source_2 = cs_to_reg32(src2); + } +} + +static inline void +cs_load_to(struct cs_builder *b, struct cs_index dest, struct cs_index address, + unsigned mask, int offset) +{ + cs_emit(b, LOAD_MULTIPLE, I) { + I.base_register = cs_to_reg_tuple(dest, util_bitcount(mask)); + I.address = cs_to_reg64(address); + I.mask = mask; + I.offset = offset; + } +} + +static inline void +cs_load32_to(struct cs_builder *b, struct cs_index dest, + struct cs_index address, int offset) +{ + cs_load_to(b, dest, address, BITFIELD_MASK(1), offset); +} + +static inline void +cs_load64_to(struct cs_builder *b, struct cs_index dest, + struct cs_index address, int offset) +{ + cs_load_to(b, dest, address, BITFIELD_MASK(2), offset); +} + +static inline void +cs_store(struct cs_builder *b, struct cs_index data, struct cs_index address, + unsigned mask, int offset) +{ + cs_emit(b, STORE_MULTIPLE, I) { + I.base_register = cs_to_reg_tuple(data, util_bitcount(mask)); + I.address = cs_to_reg64(address); + I.mask = mask; + I.offset = offset; + } +} + +static inline void +cs_store32(struct cs_builder *b, struct cs_index data, struct cs_index address, + int offset) +{ + cs_store(b, data, address, BITFIELD_MASK(1), offset); +} + +static inline void +cs_store64(struct cs_builder *b, struct cs_index data, struct cs_index address, + int offset) +{ + cs_store(b, data, address, BITFIELD_MASK(2), offset); +} + +static inline void +cs_branch(struct cs_builder *b, int offset, enum mali_cs_condition cond, + struct cs_index 
val) +{ + cs_emit(b, BRANCH, I) { + I.offset = offset; + I.condition = cond; + I.value = cs_to_reg32(val); + } +} + +/* + * Select which scoreboard entry will track endpoint tasks and other tasks + * respectively. Pass to cs_wait to wait later. + */ +static inline void +cs_set_scoreboard_entry(struct cs_builder *b, unsigned ep, unsigned other) +{ + assert(ep < 8 && "invalid slot"); + assert(other < 8 && "invalid slot"); + + cs_emit(b, SET_SB_ENTRY, I) { + I.endpoint_entry = ep; + I.other_entry = other; + } +} + +static inline void +cs_progress_wait(struct cs_builder *b, unsigned queue, struct cs_index ref) +{ + cs_emit(b, PROGRESS_WAIT, I) { + I.source = cs_to_reg64(ref); + I.queue = queue; + } +} + +static inline void +cs_set_exception_handler(struct cs_builder *b, + enum mali_cs_exception_type exception_type, + struct cs_index address, struct cs_index length) +{ + cs_emit(b, SET_EXCEPTION_HANDLER, I) { + I.exception_type = exception_type; + I.address = cs_to_reg64(address); + I.length = cs_to_reg32(length); + } +} + +static inline void +cs_call(struct cs_builder *b, struct cs_index address, struct cs_index length) +{ + cs_emit(b, CALL, I) { + I.address = cs_to_reg64(address); + I.length = cs_to_reg32(length); + } +} + +static inline void +cs_jump(struct cs_builder *b, struct cs_index address, struct cs_index length) +{ + cs_emit(b, JUMP, I) { + I.address = cs_to_reg64(address); + I.length = cs_to_reg32(length); + } +} + +enum cs_res_id { + CS_COMPUTE_RES = BITFIELD_BIT(0), + CS_FRAG_RES = BITFIELD_BIT(1), + CS_TILER_RES = BITFIELD_BIT(2), + CS_IDVS_RES = BITFIELD_BIT(3), +}; + +static inline void +cs_req_res(struct cs_builder *b, u32 res_mask) +{ + cs_emit(b, REQ_RESOURCE, I) { + I.compute = res_mask & CS_COMPUTE_RES; + I.tiler = res_mask & CS_TILER_RES; + I.idvs = res_mask & CS_IDVS_RES; + I.fragment = res_mask & CS_FRAG_RES; + } +} + +static inline void +cs_flush_caches(struct cs_builder *b, enum mali_cs_flush_mode l2, + enum mali_cs_flush_mode lsc, bool other_inv, + struct cs_index flush_id, struct cs_async_op async) +{ + cs_emit(b, FLUSH_CACHE2, I) { + I.l2_flush_mode = l2; + I.lsc_flush_mode = lsc; + I.other_invalidate = other_inv; + I.latest_flush_id = cs_to_reg32(flush_id); + cs_apply_async(I, async); + } +} + +#define CS_SYNC_OPS(__cnt_width) \ + static inline void cs_sync##__cnt_width##_set( \ + struct cs_builder *b, bool propagate_error, \ + enum mali_cs_sync_scope scope, struct cs_index val, \ + struct cs_index addr, struct cs_async_op async) \ + { \ + cs_emit(b, SYNC_SET##__cnt_width, I) { \ + I.error_propagate = propagate_error; \ + I.scope = scope; \ + I.data = cs_to_reg##__cnt_width(val); \ + I.address = cs_to_reg64(addr); \ + cs_apply_async(I, async); \ + } \ + } \ + \ + static inline void cs_sync##__cnt_width##_add( \ + struct cs_builder *b, bool propagate_error, \ + enum mali_cs_sync_scope scope, struct cs_index val, \ + struct cs_index addr, struct cs_async_op async) \ + { \ + cs_emit(b, SYNC_ADD##__cnt_width, I) { \ + I.error_propagate = propagate_error; \ + I.scope = scope; \ + I.data = cs_to_reg##__cnt_width(val); \ + I.address = cs_to_reg64(addr); \ + cs_apply_async(I, async); \ + } \ + } \ + \ + static inline void cs_sync##__cnt_width##_wait( \ + struct cs_builder *b, bool reject_error, enum mali_cs_condition cond, \ + struct cs_index ref, struct cs_index addr) \ + { \ + assert(cond == MALI_CS_CONDITION_LEQUAL || \ + cond == MALI_CS_CONDITION_GREATER); \ + cs_emit(b, SYNC_WAIT##__cnt_width, I) { \ + I.error_reject = reject_error; \ + I.condition = cond; \ + I.data 
= cs_to_reg##__cnt_width(ref); \ + I.address = cs_to_reg64(addr); \ + } \ + } + +CS_SYNC_OPS(32) +CS_SYNC_OPS(64) + +static inline void +cs_store_state(struct cs_builder *b, struct cs_index address, int offset, + enum mali_cs_state state, struct cs_async_op async) +{ + cs_emit(b, STORE_STATE, I) { + I.offset = offset; + I.state = state; + I.address = cs_to_reg64(address); + cs_apply_async(I, async); + } +} + +static inline void +cs_prot_region(struct cs_builder *b, unsigned size) +{ + cs_emit(b, PROT_REGION, I) { + I.size = size; + } +} + +static inline void +cs_progress_store(struct cs_builder *b, struct cs_index src) +{ + cs_emit(b, PROGRESS_STORE, I) + I.source = cs_to_reg64(src); +} + +static inline void +cs_progress_load(struct cs_builder *b, struct cs_index dst) +{ + cs_emit(b, PROGRESS_LOAD, I) + I.destination = cs_to_reg64(dst); +} + +static inline void +cs_run_compute_indirect(struct cs_builder *b, unsigned wg_per_task, + bool progress_inc, struct cs_shader_res_sel res_sel) +{ + cs_emit(b, RUN_COMPUTE_INDIRECT, I) { + I.workgroups_per_task = wg_per_task; + I.progress_increment = progress_inc; + I.srt_select = res_sel.srt; + I.spd_select = res_sel.spd; + I.tsd_select = res_sel.tsd; + I.fau_select = res_sel.fau; + } +} + +static inline void +cs_error_barrier(struct cs_builder *b) +{ + cs_emit(b, ERROR_BARRIER, _) + ; +} + +static inline void +cs_heap_set(struct cs_builder *b, struct cs_index address) +{ + cs_emit(b, HEAP_SET, I) { + I.address = cs_to_reg64(address); + } +} + +static inline void +cs_heap_operation(struct cs_builder *b, enum mali_cs_heap_operation operation, + struct cs_async_op async) +{ + cs_emit(b, HEAP_OPERATION, I) { + I.operation = operation; + cs_apply_async(I, async); + } +} + +static inline void +cs_vt_start(struct cs_builder *b, struct cs_async_op async) +{ + cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, async); +} + +static inline void +cs_vt_end(struct cs_builder *b, struct cs_async_op async) +{ + cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_COMPLETED, async); +} + +static inline void +cs_frag_end(struct cs_builder *b, struct cs_async_op async) +{ + cs_heap_operation(b, MALI_CS_HEAP_OPERATION_FRAGMENT_COMPLETED, async); +} + +static inline void +cs_trace_point(struct cs_builder *b, struct cs_index regs, + struct cs_async_op async) +{ + cs_emit(b, TRACE_POINT, I) { + I.base_register = cs_to_reg_tuple(regs, regs.size); + I.register_count = regs.size; + cs_apply_async(I, async); + } +}
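+
+/*
+ * Minimal usage sketch (illustrative only, not part of the API). The
+ * alloc_cs_buffer() callback, dev cookie, descriptor_va address and register
+ * counts below are hypothetical driver-side values:
+ *
+ *    struct cs_builder_conf conf = {
+ *       .nr_registers = 96,
+ *       .nr_kernel_registers = 4,
+ *       .alloc_buffer = alloc_cs_buffer,
+ *       .cookie = dev,
+ *    };
+ *    struct cs_builder b;
+ *
+ *    cs_builder_init(&b, &conf, alloc_cs_buffer(dev));
+ *    cs_move64_to(&b, cs_reg64(&b, 0), descriptor_va);
+ *    cs_wait_slot(&b, 0, false);
+ *    cs_finish(&b);
+ *
+ * After cs_finish(), b.root_chunk describes the first chunk of the stream;
+ * any overflow chunks are reached through the JUMP sequences emitted by
+ * cs_alloc_ins().
+ */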