mesa/src/freedreno/vulkan/tu_private.h

/*
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
*
* based in part on anv driver which is:
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef TU_PRIVATE_H
#define TU_PRIVATE_H
#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_VALGRIND
#include <memcheck.h>
#include <valgrind.h>
#define VG(x) x
#else
#define VG(x) ((void)0)
#endif
#define MESA_LOG_TAG "TU"
#include "c11/threads.h"
#include "util/rounding.h"
#include "util/bitscan.h"
#include "util/list.h"
#include "util/log.h"
#include "util/macros.h"
#include "util/sparse_array.h"
#include "util/u_atomic.h"
#include "util/u_dynarray.h"
#include "util/xmlconfig.h"
#include "util/perf/u_trace.h"
#include "vk_alloc.h"
#include "vk_debug_report.h"
#include "vk_device.h"
#include "vk_dispatch_table.h"
#include "vk_extensions.h"
#include "vk_instance.h"
#include "vk_log.h"
#include "vk_physical_device.h"
#include "vk_shader_module.h"
#include "vk_pipeline_cache.h"
#include "wsi_common.h"
#include "ir3/ir3_compiler.h"
#include "ir3/ir3_shader.h"
#include "adreno_common.xml.h"
#include "adreno_pm4.xml.h"
#include "a6xx.xml.h"
#include "fdl/freedreno_layout.h"
#include "common/freedreno_dev_info.h"
#include "perfcntrs/freedreno_perfcntr.h"
#include "tu_descriptor_set.h"
#include "tu_autotune.h"
#include "tu_util.h"
#include "tu_perfetto.h"
/* Pre-declarations needed for WSI entrypoints */
struct wl_surface;
struct wl_display;
typedef struct xcb_connection_t xcb_connection_t;
typedef uint32_t xcb_visualid_t;
typedef uint32_t xcb_window_t;
#include <vulkan/vk_android_native_buffer.h>
#include <vulkan/vk_icd.h>
#include <vulkan/vulkan.h>
#include "tu_entrypoints.h"
#include "vulkan/runtime/vk_common_entrypoints.h"
#include "vk_format.h"
#include "vk_image.h"
#include "vk_command_buffer.h"
#include "vk_command_pool.h"
#include "vk_queue.h"
#include "vk_object.h"
#include "vk_sync.h"
#include "vk_drm_syncobj.h"
#include "vk_sync_timeline.h"
#define MAX_VBS 32
#define MAX_VERTEX_ATTRIBS 32
#define MAX_RTS 8
#define MAX_VSC_PIPES 32
#define MAX_VIEWPORTS 16
#define MAX_VIEWPORT_SIZE (1 << 14)
#define MAX_SCISSORS 16
#define MAX_DISCARD_RECTANGLES 4
#define MAX_PUSH_CONSTANTS_SIZE 256
#define MAX_PUSH_DESCRIPTORS 32
#define MAX_DYNAMIC_UNIFORM_BUFFERS 16
#define MAX_DYNAMIC_STORAGE_BUFFERS 8
#define MAX_DYNAMIC_BUFFERS_SIZE \
(MAX_DYNAMIC_UNIFORM_BUFFERS + 2 * MAX_DYNAMIC_STORAGE_BUFFERS) * \
A6XX_TEX_CONST_DWORDS
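/* Worked out for reference (note added for illustration): with the defaults
* above and A6XX_TEX_CONST_DWORDS == 16, this is (16 + 2 * 8) * 16 = 512
* dwords of dynamic descriptor storage (see
* tu_descriptor_state::dynamic_descriptors below).
*/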
#define TU_MAX_DRM_DEVICES 8
#define MAX_VIEWS 16
#define MAX_BIND_POINTS 2 /* compute + graphics */
/* The Qualcomm driver exposes 0x20000058 */
#define MAX_STORAGE_BUFFER_RANGE 0x20000000
/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so
* expose the same maximum range.
* TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual
* range might be higher.
*/
#define MAX_UNIFORM_BUFFER_RANGE 0x10000
#define A6XX_TEX_CONST_DWORDS 16
#define A6XX_TEX_SAMP_DWORDS 4
#define COND(bool, val) ((bool) ? (val) : 0)
#define BIT(bit) (1u << (bit))
/* Whenever we generate an error, pass it through this function. Useful for
* debugging, where we can break on it. Only call at error site, not when
* propagating errors. Might be useful to plug in a stack trace here.
*/
struct tu_instance;
struct breadcrumbs_context;
VkResult
__vk_startup_errorf(struct tu_instance *instance,
VkResult error,
bool force_print,
const char *file,
int line,
const char *format,
...) PRINTFLIKE(6, 7);
/* Prints startup errors if TU_DEBUG=startup is set or on a debug driver
* build.
*/
#define vk_startup_errorf(instance, error, format, ...) \
__vk_startup_errorf(instance, error, \
instance->debug_flags & TU_DEBUG_STARTUP, \
__FILE__, __LINE__, format, ##__VA_ARGS__)
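/* Illustrative call (a sketch, not from the original header; the message and
* error code are hypothetical):
*
*    return vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
*                             "failed to open device %s", path);
*/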
void
__tu_finishme(const char *file, int line, const char *format, ...)
PRINTFLIKE(3, 4);
/**
* Print a FINISHME message, including its source location.
*/
#define tu_finishme(format, ...) \
do { \
static bool reported = false; \
if (!reported) { \
__tu_finishme(__FILE__, __LINE__, format, ##__VA_ARGS__); \
reported = true; \
} \
} while (0)
#define tu_stub() \
do { \
tu_finishme("stub %s", __func__); \
} while (0)
struct tu_memory_heap {
/* Standard bits passed on to the client */
VkDeviceSize size;
VkMemoryHeapFlags flags;
/** Copied from ANV:
*
* Driver-internal book-keeping.
*
* Align it to 64 bits to make atomic operations faster on 32 bit platforms.
*/
VkDeviceSize used __attribute__ ((aligned (8)));
};
uint64_t
tu_get_system_heap_size(void);
struct tu_physical_device
{
struct vk_physical_device vk;
struct tu_instance *instance;
const char *name;
uint8_t driver_uuid[VK_UUID_SIZE];
uint8_t device_uuid[VK_UUID_SIZE];
uint8_t cache_uuid[VK_UUID_SIZE];
struct wsi_device wsi_device;
int local_fd;
bool has_local;
int64_t local_major;
int64_t local_minor;
int master_fd;
bool has_master;
int64_t master_major;
int64_t master_minor;
uint32_t gmem_size;
uint64_t gmem_base;
uint32_t ccu_offset_gmem;
uint32_t ccu_offset_bypass;
struct fd_dev_id dev_id;
const struct fd_dev_info *info;
int msm_major_version;
int msm_minor_version;
/* Address space and global fault count for this local_fd with DRM backend */
uint64_t fault_count;
struct tu_memory_heap heap;
struct vk_sync_type syncobj_type;
struct vk_sync_timeline_type timeline_type;
const struct vk_sync_type *sync_types[3];
};
enum tu_debug_flags
{
TU_DEBUG_STARTUP = 1 << 0,
TU_DEBUG_NIR = 1 << 1,
TU_DEBUG_NOBIN = 1 << 3,
TU_DEBUG_SYSMEM = 1 << 4,
TU_DEBUG_FORCEBIN = 1 << 5,
TU_DEBUG_NOUBWC = 1 << 6,
TU_DEBUG_NOMULTIPOS = 1 << 7,
TU_DEBUG_NOLRZ = 1 << 8,
TU_DEBUG_PERFC = 1 << 9,
TU_DEBUG_FLUSHALL = 1 << 10,
TU_DEBUG_SYNCDRAW = 1 << 11,
TU_DEBUG_DONT_CARE_AS_LOAD = 1 << 12,
TU_DEBUG_GMEM = 1 << 13,
TU_DEBUG_RAST_ORDER = 1 << 14,
TU_DEBUG_UNALIGNED_STORE = 1 << 15,
TU_DEBUG_LAYOUT = 1 << 16,
TU_DEBUG_LOG_SKIP_GMEM_OPS = 1 << 17,
TU_DEBUG_PERF = 1 << 18,
TU_DEBUG_NOLRZFC = 1 << 19,
TU_DEBUG_DYNAMIC = 1 << 20,
};
struct tu_instance
{
struct vk_instance vk;
uint32_t api_version;
int physical_device_count;
struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES];
struct driOptionCache dri_options;
struct driOptionCache available_dri_options;
enum tu_debug_flags debug_flags;
};
VkResult
tu_wsi_init(struct tu_physical_device *physical_device);
void
tu_wsi_finish(struct tu_physical_device *physical_device);
bool
tu_instance_extension_supported(const char *name);
uint32_t
tu_physical_device_api_version(struct tu_physical_device *dev);
bool
tu_physical_device_extension_supported(struct tu_physical_device *dev,
const char *name);
enum tu_bo_alloc_flags
{
TU_BO_ALLOC_NO_FLAGS = 0,
TU_BO_ALLOC_ALLOW_DUMP = 1 << 0,
TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1,
};
struct cache_entry;
struct tu_pipeline_cache
{
struct vk_object_base base;
struct tu_device *device;
pthread_mutex_t mutex;
uint32_t total_size;
uint32_t table_size;
uint32_t kernel_count;
struct cache_entry **hash_table;
bool modified;
VkAllocationCallbacks alloc;
};
struct tu_pipeline_key
{
};
/* queue types */
#define TU_QUEUE_GENERAL 0
#define TU_MAX_QUEUE_FAMILIES 1
/* Keep tu_syncobj until porting to common code for kgsl too */
#ifdef TU_USE_KGSL
struct tu_syncobj;
#endif
struct tu_u_trace_syncobj;
/* tu_timeline_sync is a drm-syncobj-based point type for vk_sync_timeline.
* The handling logic is mostly copied from anv_bo_sync, since it can be used
* in a similar way here.
*/
enum tu_timeline_sync_state {
/** Indicates that this is a new (or newly reset) fence */
TU_TIMELINE_SYNC_STATE_RESET,
/** Indicates that this fence has been submitted to the GPU but is still
* (as far as we know) in use by the GPU.
*/
TU_TIMELINE_SYNC_STATE_SUBMITTED,
TU_TIMELINE_SYNC_STATE_SIGNALED,
};
struct tu_timeline_sync {
struct vk_sync base;
enum tu_timeline_sync_state state;
uint32_t syncobj;
};
struct tu_queue
{
struct vk_queue vk;
struct tu_device *device;
uint32_t msm_queue_id;
int fence;
};
struct tu_bo
{
uint32_t gem_handle;
uint64_t size;
uint64_t iova;
void *map;
int32_t refcnt;
#ifndef TU_USE_KGSL
uint32_t bo_list_idx;
#endif
bool implicit_sync : 1;
};
/* externally-synchronized BO suballocator. */
struct tu_suballocator
{
struct tu_device *dev;
uint32_t default_size;
enum tu_bo_alloc_flags flags;
/** Current BO we're suballocating out of. */
struct tu_bo *bo;
uint32_t next_offset;
/** Optional BO cached for recycling as the next suballoc->bo, instead of having to allocate one. */
struct tu_bo *cached_bo;
};
struct tu_suballoc_bo
{
struct tu_bo *bo;
uint64_t iova;
uint32_t size; /* bytes */
};
void
tu_bo_suballocator_init(struct tu_suballocator *suballoc,
struct tu_device *dev,
uint32_t default_size,
uint32_t flags);
void
tu_bo_suballocator_finish(struct tu_suballocator *suballoc);
VkResult
tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo, struct tu_suballocator *suballoc,
uint32_t size, uint32_t align);
void *
tu_suballoc_bo_map(struct tu_suballoc_bo *bo);
void
tu_suballoc_bo_free(struct tu_suballocator *suballoc, struct tu_suballoc_bo *bo);
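/* Minimal usage sketch of the suballocator API above (illustrative only;
* sizes, flags and error handling are placeholders, not taken from real
* callers):
*
*    struct tu_suballocator suballoc;
*    struct tu_suballoc_bo sub_bo;
*
*    tu_bo_suballocator_init(&suballoc, dev, 128 * 1024, TU_BO_ALLOC_NO_FLAGS);
*    if (tu_suballoc_bo_alloc(&sub_bo, &suballoc, 256, 64) == VK_SUCCESS) {
*       uint32_t *map = tu_suballoc_bo_map(&sub_bo);
*       map[0] = 0;                       // write GPU-visible data at sub_bo.iova
*       tu_suballoc_bo_free(&suballoc, &sub_bo);
*    }
*    tu_bo_suballocator_finish(&suballoc);
*
* Remember the suballocator is externally synchronized (see pipeline_mutex /
* autotune_mutex on tu_device).
*/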
enum global_shader {
GLOBAL_SH_VS_BLIT,
GLOBAL_SH_VS_CLEAR,
GLOBAL_SH_FS_BLIT,
GLOBAL_SH_FS_BLIT_ZSCALE,
GLOBAL_SH_FS_COPY_MS,
GLOBAL_SH_FS_CLEAR0,
GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
GLOBAL_SH_COUNT,
};
/**
* Tracks the results from an individual renderpass. Initially created
* per renderpass, and appended to the tail of at->pending_results. At a later
* time, when the GPU has finished writing the results, we fill samples_passed.
*/
struct tu_renderpass_result {
/* Points into GPU memory */
struct tu_renderpass_samples* samples;
struct tu_suballoc_bo bo;
/*
* Below here, only used internally within autotune
*/
uint64_t rp_key;
struct tu_renderpass_history *history;
struct list_head node;
uint32_t fence;
uint64_t samples_passed;
};
#define TU_BORDER_COLOR_COUNT 4096
#define TU_BORDER_COLOR_BUILTIN 6
#define TU_BLIT_SHADER_SIZE 1024
/* This struct defines the layout of the global_bo */
struct tu6_global
{
/* clear/blit shaders */
uint32_t shaders[TU_BLIT_SHADER_SIZE];
uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */
uint32_t _pad0;
volatile uint32_t vsc_draw_overflow;
uint32_t _pad1;
volatile uint32_t vsc_prim_overflow;
uint32_t _pad2;
uint64_t predicate;
/* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */
struct {
uint32_t offset;
uint32_t pad[7];
} flush_base[4];
ALIGN16 uint32_t cs_indirect_xyz[3];
volatile uint32_t vtx_stats_query_not_running;
/* To know when renderpass stats for autotune are valid */
volatile uint32_t autotune_fence;
/* For recycling command buffers for dynamic suspend/resume commands */
volatile uint32_t dynamic_rendering_fence;
volatile uint32_t dbg_one;
volatile uint32_t dbg_gmem_total_loads;
volatile uint32_t dbg_gmem_taken_loads;
volatile uint32_t dbg_gmem_total_stores;
volatile uint32_t dbg_gmem_taken_stores;
/* Written from GPU */
volatile uint32_t breadcrumb_gpu_sync_seqno;
uint32_t _pad3;
/* Written from CPU, acknowledges value written from GPU */
volatile uint32_t breadcrumb_cpu_sync_seqno;
uint32_t _pad4;
/* note: larger global bo will be used for customBorderColors */
struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
};
#define gb_offset(member) offsetof(struct tu6_global, member)
#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member))
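/* For example (illustrative of how the two macros compose):
*
*    gb_offset(cs_indirect_xyz)          == offsetof(struct tu6_global, cs_indirect_xyz)
*    global_iova(cmd, vsc_draw_overflow) == GPU address of that field in global_bo
*/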
/* extra space in vsc draw/prim streams */
#define VSC_PAD 0x40
struct tu_device
{
struct vk_device vk;
struct tu_instance *instance;
struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
int queue_count[TU_MAX_QUEUE_FAMILIES];
struct tu_physical_device *physical_device;
int fd;
struct ir3_compiler *compiler;
/* Backup in-memory cache to be used if the app doesn't provide one */
struct vk_pipeline_cache *mem_cache;
#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */
/* Currently the kernel driver uses a 32-bit GPU address space, but it
* should be impossible to go beyond 48 bits.
*/
struct {
struct tu_bo *bo;
mtx_t construct_mtx;
bool initialized;
} scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];
struct tu_bo *global_bo;
uint32_t implicit_sync_bo_count;
/* Device-global BO suballocator for reducing BO management overhead for
* (read-only) pipeline state. Synchronized by pipeline_mutex.
*/
struct tu_suballocator pipeline_suballoc;
mtx_t pipeline_mutex;
/* Device-global BO suballocator for reducing BO management overhead for small
* gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
*/
struct tu_suballocator autotune_suballoc;
mtx_t autotune_mutex;
/* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
/* Lazily allocated, protected by the device mutex. */
struct tu_bo *tess_bo;
struct ir3_shader_variant *global_shader_variants[GLOBAL_SH_COUNT];
struct ir3_shader *global_shaders[GLOBAL_SH_COUNT];
uint64_t global_shader_va[GLOBAL_SH_COUNT];
uint32_t vsc_draw_strm_pitch;
uint32_t vsc_prim_strm_pitch;
BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
mtx_t mutex;
/* bo list for submits: */
struct drm_msm_gem_submit_bo *bo_list;
/* map bo handles to bo list index: */
uint32_t bo_count, bo_list_size;
mtx_t bo_mutex;
/* protects imported BOs creation/freeing */
struct u_rwlock dma_bo_lock;
/* This array holds all our 'struct tu_bo' allocations. We use this
* so we can add a refcount to our BOs and check if a particular BO
* was already allocated in this device using its GEM handle. This is
* necessary to properly manage BO imports, because the kernel doesn't
* refcount the underlying BO memory.
*
* Specifically, when self-importing (i.e. importing a BO into the same
* device that created it), the kernel will give us the same BO handle
* for both BOs and we must only free it once when both references are
* freed. Otherwise, if we are not self-importing, we get two different BO
* handles, and we want to free each one individually.
*
* The refcount is also useful for being able to maintain BOs across
* VK object lifetimes, such as pipelines suballocating out of BOs
* allocated on the device.
*/
struct util_sparse_array bo_map;
/* Command streams to set pass index to a scratch reg */
struct tu_cs *perfcntrs_pass_cs;
struct tu_cs_entry *perfcntrs_pass_cs_entries;
struct util_dynarray dynamic_rendering_pending;
VkCommandPool dynamic_rendering_pool;
uint32_t dynamic_rendering_fence;
/* Condition variable for timeline semaphore to notify waiters when a
* new submit is executed. */
pthread_cond_t timeline_cond;
pthread_mutex_t submit_mutex;
struct tu_autotune autotune;
struct breadcrumbs_context *breadcrumbs_ctx;
#ifdef ANDROID
const void *gralloc;
enum {
TU_GRALLOC_UNKNOWN,
TU_GRALLOC_CROS,
TU_GRALLOC_OTHER,
} gralloc_type;
#endif
uint32_t submit_count;
struct u_trace_context trace_context;
#ifdef HAVE_PERFETTO
struct tu_perfetto_state perfetto;
#endif
bool use_z24uint_s8uint;
};
void tu_init_clear_blit_shaders(struct tu_device *dev);
void tu_destroy_clear_blit_shaders(struct tu_device *dev);
VkResult tu_init_dynamic_rendering(struct tu_device *dev);
void tu_destroy_dynamic_rendering(struct tu_device *dev);
VkResult tu_insert_dynamic_cmdbufs(struct tu_device *dev,
struct tu_cmd_buffer ***cmds_ptr,
uint32_t *size);
VkResult
tu_device_submit_deferred_locked(struct tu_device *dev);
VkResult
tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj);
uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
VkResult
tu_device_check_status(struct vk_device *vk_device);
VkResult
tu_bo_init_new(struct tu_device *dev, struct tu_bo **bo, uint64_t size,
enum tu_bo_alloc_flags flags);
VkResult
tu_bo_init_dmabuf(struct tu_device *dev,
struct tu_bo **bo,
uint64_t size,
int fd);
int
tu_bo_export_dmabuf(struct tu_device *dev, struct tu_bo *bo);
void
tu_bo_finish(struct tu_device *dev, struct tu_bo *bo);
VkResult
tu_bo_map(struct tu_device *dev, struct tu_bo *bo);
static inline struct tu_bo *
tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
{
return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
}
static inline struct tu_bo *
tu_bo_get_ref(struct tu_bo *bo)
{
p_atomic_inc(&bo->refcnt);
return bo;
}
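/* Sketch of how bo_map and the refcount combine on an import path
* (hypothetical pseudo-flow for illustration; see the DRM backend for the
* real logic):
*
*    struct tu_bo *bo = tu_device_lookup_bo(dev, gem_handle);
*    if (bo->refcnt != 0) {
*       // Self-import: the kernel returned a handle we already track,
*       // so just take another reference.
*       tu_bo_get_ref(bo);
*    } else {
*       // First time this handle is seen: initialize the tu_bo entry.
*    }
*/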
/* Get a scratch bo for use inside a command buffer. This will always return
* the same bo given the same size or similar sizes, so only one scratch bo
* can be used at the same time. It's meant for short-lived things where we
* need to write to some piece of memory, read from it, and then immediately
* discard it.
*/
VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
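/* Illustrative usage (a minimal sketch; the size is a placeholder):
*
*    struct tu_bo *scratch;
*    if (tu_get_scratch_bo(dev, 4096, &scratch) == VK_SUCCESS) {
*       // Emit commands that write to scratch->iova and read it back within
*       // this command buffer; the device owns the BO, so don't free it.
*    }
*/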
struct tu_cs_entry
{
/* No ownership */
const struct tu_bo *bo;
uint32_t size;
uint32_t offset;
};
struct tu_cs_memory {
uint32_t *map;
uint64_t iova;
};
struct tu_draw_state {
uint64_t iova : 48;
uint32_t size : 16;
};
enum tu_dynamic_state
{
/* re-use VK_DYNAMIC_STATE_ enums for non-extended dynamic states */
TU_DYNAMIC_STATE_SAMPLE_LOCATIONS = VK_DYNAMIC_STATE_STENCIL_REFERENCE + 1,
TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
TU_DYNAMIC_STATE_RB_STENCIL_CNTL,
TU_DYNAMIC_STATE_VB_STRIDE,
TU_DYNAMIC_STATE_RASTERIZER_DISCARD,
TU_DYNAMIC_STATE_BLEND,
TU_DYNAMIC_STATE_COUNT,
/* no associated draw state: */
TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY = TU_DYNAMIC_STATE_COUNT,
TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE,
TU_DYNAMIC_STATE_LOGIC_OP,
TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE,
/* re-use the line width enum as it uses GRAS_SU_CNTL: */
TU_DYNAMIC_STATE_GRAS_SU_CNTL = VK_DYNAMIC_STATE_LINE_WIDTH,
};
enum tu_draw_state_group_id
{
TU_DRAW_STATE_PROGRAM_CONFIG,
TU_DRAW_STATE_PROGRAM,
TU_DRAW_STATE_PROGRAM_BINNING,
TU_DRAW_STATE_VB,
TU_DRAW_STATE_VI,
TU_DRAW_STATE_VI_BINNING,
TU_DRAW_STATE_RAST,
TU_DRAW_STATE_CONST,
TU_DRAW_STATE_DESC_SETS,
TU_DRAW_STATE_DESC_SETS_LOAD,
TU_DRAW_STATE_VS_PARAMS,
TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
TU_DRAW_STATE_PRIM_MODE_GMEM,
TU_DRAW_STATE_PRIM_MODE_SYSMEM,
/* dynamic state related draw states */
TU_DRAW_STATE_DYNAMIC,
TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};
enum tu_cs_mode
{
/*
* A command stream in TU_CS_MODE_GROW mode grows automatically whenever it
* is full. tu_cs_begin must be called before command packet emission and
* tu_cs_end must be called after.
*
* This mode may create multiple entries internally. The entries must be
* submitted together.
*/
TU_CS_MODE_GROW,
/*
* A command stream in TU_CS_MODE_EXTERNAL mode wraps an external,
* fixed-size buffer. tu_cs_begin and tu_cs_end are optional and have no
* effect on it.
*
* This mode does not create any entry or any BO.
*/
TU_CS_MODE_EXTERNAL,
/*
* A command stream in TU_CS_MODE_SUB_STREAM mode does not support direct
* command packet emission. tu_cs_begin_sub_stream must be called to get a
* sub-stream to emit command packets to. When done with the sub-stream,
* tu_cs_end_sub_stream must be called.
*
* This mode does not create any entry internally.
*/
TU_CS_MODE_SUB_STREAM,
};
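/* Illustrative sketch of the TU_CS_MODE_SUB_STREAM flow described above.
* tu_cs_begin_sub_stream()/tu_cs_end_sub_stream() are named in the comment;
* the exact signatures and emit helpers (tu_cs_emit() etc.) live in tu_cs.h
* and are assumed here:
*
*    struct tu_cs sub_cs;
*    if (tu_cs_begin_sub_stream(&cmd->sub_cs, 64, &sub_cs) == VK_SUCCESS) {
*       tu_cs_emit(&sub_cs, 0);   // placeholder dword, up to 64 in this sub-stream
*       struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
*       // 'entry' can then be referenced (e.g. as a draw state) from another CS.
*    }
*/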
#define TU_COND_EXEC_STACK_SIZE 4
struct tu_cs
{
uint32_t *start;
uint32_t *cur;
uint32_t *reserved_end;
uint32_t *end;
struct tu_device *device;
enum tu_cs_mode mode;
uint32_t next_bo_size;
struct tu_cs_entry *entries;
uint32_t entry_count;
uint32_t entry_capacity;
struct tu_bo **bos;
uint32_t bo_count;
uint32_t bo_capacity;
/* Optional BO that this CS is sub-allocated from for TU_CS_MODE_SUB_STREAM */
struct tu_bo *refcount_bo;
/* state for cond_exec_start/cond_exec_end */
uint32_t cond_stack_depth;
uint32_t cond_flags[TU_COND_EXEC_STACK_SIZE];
uint32_t *cond_dwords[TU_COND_EXEC_STACK_SIZE];
uint32_t breadcrumb_emit_after;
};
struct tu_device_memory
{
struct vk_object_base base;
struct tu_bo *bo;
};
struct tu_descriptor_range
{
uint64_t va;
uint32_t size;
};
struct tu_descriptor_set
{
struct vk_object_base base;
/* Link to descriptor pool's desc_sets list. */
struct list_head pool_link;
struct tu_descriptor_set_layout *layout;
struct tu_descriptor_pool *pool;
uint32_t size;
uint64_t va;
uint32_t *mapped_ptr;
uint32_t *dynamic_descriptors;
};
struct tu_descriptor_pool_entry
{
uint32_t offset;
uint32_t size;
struct tu_descriptor_set *set;
};
struct tu_descriptor_pool
{
struct vk_object_base base;
struct tu_bo *bo;
uint64_t current_offset;
uint64_t size;
uint8_t *host_memory_base;
uint8_t *host_memory_ptr;
uint8_t *host_memory_end;
uint8_t *host_bo;
struct list_head desc_sets;
uint32_t entry_count;
uint32_t max_entry_count;
struct tu_descriptor_pool_entry entries[0];
};
struct tu_descriptor_update_template_entry
{
VkDescriptorType descriptor_type;
/* The number of descriptors to update */
uint32_t descriptor_count;
/* Into mapped_ptr or dynamic_descriptors, in units of the respective array
*/
uint32_t dst_offset;
/* In dwords. Not valid/used for dynamic descriptors */
uint32_t dst_stride;
uint32_t buffer_offset;
/* Only valid for combined image samplers and samplers */
uint16_t has_sampler;
/* In bytes */
size_t src_offset;
size_t src_stride;
/* For push descriptors */
const struct tu_sampler *immutable_samplers;
};
struct tu_descriptor_update_template
{
struct vk_object_base base;
uint32_t entry_count;
VkPipelineBindPoint bind_point;
struct tu_descriptor_update_template_entry entry[0];
};
struct tu_buffer
{
struct vk_object_base base;
VkDeviceSize size;
VkBufferUsageFlags usage;
VkBufferCreateFlags flags;
struct tu_bo *bo;
uint64_t iova;
};
const char *
tu_get_debug_option_name(int id);
const char *
tu_get_perftest_option_name(int id);
struct tu_attachment_info
{
struct tu_image_view *attachment;
};
struct tu_framebuffer
{
struct vk_object_base base;
uint32_t width;
uint32_t height;
uint32_t layers;
/* size of the first tile */
VkExtent2D tile0;
/* number of tiles */
VkExtent2D tile_count;
/* size of the first VSC pipe */
VkExtent2D pipe0;
/* number of VSC pipes */
VkExtent2D pipe_count;
/* Whether binning should be used for gmem rendering using this framebuffer. */
bool binning;
/* Whether binning could be used for gmem rendering using this framebuffer. */
bool binning_possible;
/* pipe register values */
uint32_t pipe_config[MAX_VSC_PIPES];
uint32_t pipe_sizes[MAX_VSC_PIPES];
uint32_t attachment_count;
struct tu_attachment_info attachments[0];
};
struct tu_subpass_barrier {
VkPipelineStageFlags2 src_stage_mask;
VkPipelineStageFlags2 dst_stage_mask;
VkAccessFlags2 src_access_mask;
VkAccessFlags2 dst_access_mask;
bool incoherent_ccu_color, incoherent_ccu_depth;
};
struct tu_subpass_attachment
{
uint32_t attachment;
/* For input attachments, true if it needs to be patched to refer to GMEM
* in GMEM mode. This is false if it hasn't already been written as an
* attachment.
*/
bool patch_input_gmem;
};
struct tu_subpass
{
uint32_t input_count;
uint32_t color_count;
uint32_t resolve_count;
bool resolve_depth_stencil;
bool feedback_loop_color;
bool feedback_loop_ds;
/* True if we must invalidate UCHE thanks to a feedback loop. */
bool feedback_invalidate;
/* In other words - framebuffer fetch support */
bool raster_order_attachment_access;
struct tu_subpass_attachment *input_attachments;
struct tu_subpass_attachment *color_attachments;
struct tu_subpass_attachment *resolve_attachments;
struct tu_subpass_attachment depth_stencil_attachment;
VkSampleCountFlagBits samples;
uint32_t srgb_cntl;
uint32_t multiview_mask;
struct tu_subpass_barrier start_barrier;
};
struct tu_render_pass_attachment
{
VkFormat format;
uint32_t samples;
uint32_t cpp;
VkImageAspectFlags clear_mask;
uint32_t clear_views;
bool load;
bool store;
int32_t gmem_offset;
bool will_be_resolved;
/* for D32S8 separate stencil: */
bool load_stencil;
bool store_stencil;
bool cond_load_allowed;
bool cond_store_allowed;
int32_t gmem_offset_stencil;
};
struct tu_render_pass
{
struct vk_object_base base;
uint32_t attachment_count;
uint32_t subpass_count;
uint32_t gmem_pixels;
uint32_t tile_align_w;
/* memory bandwidth costs (in bytes) for gmem / sysmem rendering */
uint32_t gmem_bandwidth_per_pixel;
uint32_t sysmem_bandwidth_per_pixel;
struct tu_subpass_attachment *subpass_attachments;
struct tu_render_pass_attachment *attachments;
struct tu_subpass_barrier end_barrier;
struct tu_subpass subpasses[0];
};
void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
const struct tu_device *device,
const struct tu_render_pass *pass);
struct tu_descriptor_state
{
struct tu_descriptor_set *sets[MAX_SETS];
struct tu_descriptor_set push_set;
uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
};
enum tu_cmd_dirty_bits
{
TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
TU_CMD_DIRTY_VB_STRIDE = BIT(1),
TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
TU_CMD_DIRTY_LRZ = BIT(8),
TU_CMD_DIRTY_VS_PARAMS = BIT(9),
TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
TU_CMD_DIRTY_VIEWPORTS = BIT(11),
TU_CMD_DIRTY_BLEND = BIT(12),
/* all draw states were disabled and need to be re-enabled: */
TU_CMD_DIRTY_DRAW_STATE = BIT(13)
};
/* There are only three cache domains we have to care about: the CCU, or
* color cache unit, which is used for color and depth/stencil attachments
* and copy/blit destinations, and is split conceptually into color and depth,
* and the universal cache or UCHE which is used for pretty much everything
* else, except for the CP (uncached) and host. We need to flush whenever data
* crosses these boundaries.
*/
enum tu_cmd_access_mask {
TU_ACCESS_UCHE_READ = 1 << 0,
TU_ACCESS_UCHE_WRITE = 1 << 1,
TU_ACCESS_CCU_COLOR_READ = 1 << 2,
TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,
/* Experiments have shown that while it's safe to avoid flushing the CCU
* after each blit/renderpass, it's not safe to assume that subsequent
* lookups with a different attachment state will hit unflushed cache
* entries. That is, the CCU needs to be flushed and possibly invalidated
* when accessing memory with a different attachment state. Writing to an
* attachment under the following conditions after clearing using the
* normal 2d engine path is known to have issues:
*
* - It isn't the 0'th layer.
* - There is more than one attachment, and this isn't the 0'th attachment
* (this seems to also depend on the cpp of the attachments).
*
* Our best guess is that the layer/MRT state is used when computing
* the location of a cache entry in CCU, to avoid conflicts. We assume that
* any access in a renderpass after or before an access by a transfer needs
* a flush/invalidate, and use the _INCOHERENT variants to represent access
* by a renderpass.
*/
TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,
/* Accesses which bypass any cache, e.g. writes via the host,
* CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
*/
TU_ACCESS_SYSMEM_READ = 1 << 10,
TU_ACCESS_SYSMEM_WRITE = 1 << 11,
/* Memory writes from the CP start in-order with draws and event writes,
* but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
*/
TU_ACCESS_CP_WRITE = 1 << 12,
TU_ACCESS_READ =
TU_ACCESS_UCHE_READ |
TU_ACCESS_CCU_COLOR_READ |
TU_ACCESS_CCU_DEPTH_READ |
TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
TU_ACCESS_SYSMEM_READ,
TU_ACCESS_WRITE =
TU_ACCESS_UCHE_WRITE |
TU_ACCESS_CCU_COLOR_WRITE |
TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
TU_ACCESS_CCU_DEPTH_WRITE |
TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
TU_ACCESS_SYSMEM_WRITE |
TU_ACCESS_CP_WRITE,
TU_ACCESS_ALL =
TU_ACCESS_READ |
TU_ACCESS_WRITE,
};
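/* Worked example of crossing cache domains (illustrative; the actual mapping
* of access masks to flush bits lives in the command-buffer code): a
* TU_ACCESS_CP_WRITE that is later read through UCHE by a shader needs both a
* wait for memory writes (so the asynchronous CP write has landed) and a
* cache invalidate (so UCHE doesn't return stale data), per the notes above.
*/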
/* Starting with a6xx, the pipeline is split into several "clusters" (really
* pipeline stages). Each stage has its own pair of register banks and can
* switch them independently, so that earlier stages can run ahead of later
* ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
* the same time.
*
* As a result of this, we need to insert a WFI when an earlier stage depends
* on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
* pending WFI's to complete before starting, and usually before reading
* indirect params even, so a WFI also acts as a full "pipeline stall".
*
* Note, the names of the stages come from CLUSTER_* in devcoredump. We
* include all the stages for completeness, even ones which do not read/write
* anything.
*/
enum tu_stage {
/* This doesn't correspond to a cluster, but we need it for tracking
* indirect draw parameter reads etc.
*/
TU_STAGE_CP,
/* - Fetch index buffer
* - Fetch vertex attributes, dispatch VS
*/
TU_STAGE_FE,
/* Execute all geometry stages (VS thru GS) */
TU_STAGE_SP_VS,
/* Write to VPC, do primitive assembly. */
TU_STAGE_PC_VS,
/* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
* to devcoredump so presumably this stage stalls for TU_STAGE_PS when
* early depth testing is enabled before dispatching fragments? However
* GRAS reads and writes LRZ directly.
*/
TU_STAGE_GRAS,
/* Execute FS */
TU_STAGE_SP_PS,
/* - Fragment tests
* - Write color/depth
* - Streamout writes (???)
* - Varying interpolation (???)
*/
TU_STAGE_PS,
};
enum tu_cmd_flush_bits {
TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,
TU_CMD_FLAG_ALL_FLUSH =
TU_CMD_FLAG_CCU_FLUSH_DEPTH |
TU_CMD_FLAG_CCU_FLUSH_COLOR |
TU_CMD_FLAG_CACHE_FLUSH |
/* Treat the CP as a sort of "cache" which may need to be "flushed" via
* waiting for writes to land with WAIT_FOR_MEM_WRITES.
*/
TU_CMD_FLAG_WAIT_MEM_WRITES,
TU_CMD_FLAG_ALL_INVALIDATE =
TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
TU_CMD_FLAG_CACHE_INVALIDATE |
/* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when a
* command that needs CP_WAIT_FOR_ME is executed. This means we may
* insert an extra WAIT_FOR_ME before an indirect command requiring it
* in case there was another command before the current command buffer
* that it needs to wait for.
*/
TU_CMD_FLAG_WAIT_FOR_ME,
};
/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
* heavy, involving a CCU cache flush/invalidate and a WFI in order to change
* which part of the gmem is used by the CCU. Here we keep track of the
* current state of the CCU.
*/
enum tu_cmd_ccu_state {
TU_CMD_CCU_SYSMEM,
TU_CMD_CCU_GMEM,
TU_CMD_CCU_UNKNOWN,
};
struct tu_cache_state {
/* Caches which must be made available (flushed) eventually if there are
* any users outside that cache domain, and caches which must be
* invalidated eventually if there are any reads.
*/
enum tu_cmd_flush_bits pending_flush_bits;
/* Pending flushes */
enum tu_cmd_flush_bits flush_bits;
};
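/* Example of the pending vs. immediate split above (illustrative): a color
* attachment write leaves CCU_FLUSH_COLOR in pending_flush_bits; it is only
* promoted to flush_bits (and actually emitted) once some access outside the
* CCU color domain needs to observe that data.
*/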
enum tu_lrz_force_disable_mask {
TU_LRZ_FORCE_DISABLE_LRZ = 1 << 0,
TU_LRZ_FORCE_DISABLE_WRITE = 1 << 1,
};
enum tu_lrz_direction {
TU_LRZ_UNKNOWN,
/* Depth func less/less-than: */
TU_LRZ_LESS,
/* Depth func greater/greater-than: */
TU_LRZ_GREATER,
};
struct tu_lrz_pipeline
{
uint32_t force_disable_mask;
bool fs_has_kill;
bool force_late_z;
bool early_fragment_tests;
};
struct tu_lrz_state
{
/* Depth/stencil image currently in use for LRZ */
const struct tu_image_view *image_view;
VkClearValue depth_clear_value;
/* If LRZ is in invalid state we cannot use it until depth is cleared */
bool valid : 1;
/* Allows temporarily disabling LRZ */
bool enabled : 1;
bool fast_clear : 1;
bool gpu_dir_tracking : 1;
/* Continue using old LRZ state (LOAD_OP_LOAD of depth) */
bool reuse_previous_state : 1;
enum tu_lrz_direction prev_direction;
};
struct tu_vs_params {
uint32_t vertex_offset;
uint32_t first_instance;
};
/* This should be for state that is set inside a renderpass and used at
* renderpass end time, e.g. to decide whether to use sysmem. This needs
* special handling for secondary cmdbufs and suspending/resuming render
* passes where the state may need to be combined afterwards.
*/
struct tu_render_pass_state
{
bool xfb_used;
bool has_tess;
bool has_prim_generated_query_in_rp;
bool disable_gmem;
/* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
bool draw_cs_writes_to_cond_pred;
uint32_t drawcall_count;
/* A calculated "draw cost" value for renderpass, which tries to
* estimate the bandwidth-per-sample of all the draws according
* to:
*
* foreach_draw (...) {
* sum += pipeline->color_bandwidth_per_sample;
* if (depth_test_enabled)
* sum += pipeline->depth_cpp_per_sample;
* if (depth_write_enabled)
* sum += pipeline->depth_cpp_per_sample;
* if (stencil_write_enabled)
* sum += pipeline->stencil_cpp_per_sample * 2;
* }
* drawcall_bandwidth_per_sample = sum / drawcall_count;
*
* It allows us to estimate the total bandwidth of drawcalls later, by
* calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
*
* This does ignore depth buffer traffic for samples which do not
* pass due to depth-test fail, and some other details. But it is
* just intended to be a rough estimate that is easy to calculate.
*/
uint32_t drawcall_bandwidth_per_sample_sum;
};
void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
const struct tu_render_pass_state *src);
struct tu_cmd_state
{
uint32_t dirty;
struct tu_pipeline *pipeline;
struct tu_pipeline *compute_pipeline;
struct tu_render_pass_state rp;
/* Vertex buffers, viewports, and scissors:
* their state can be updated partially, so we need to save it here to be
* able to emit a complete draw state
*/
struct {
uint64_t base;
uint32_t size;
uint32_t stride;
} vb[MAX_VBS];
VkViewport viewport[MAX_VIEWPORTS];
VkRect2D scissor[MAX_SCISSORS];
uint32_t max_viewport, max_scissor;
/* for dynamic states that can't be emitted directly */
uint32_t dynamic_stencil_mask;
uint32_t dynamic_stencil_wrmask;
uint32_t dynamic_stencil_ref;
uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
uint32_t pc_raster_cntl, vpc_unknown_9107;
uint32_t rb_mrt_control[MAX_RTS], rb_mrt_blend_control[MAX_RTS];
uint32_t rb_mrt_control_rop;
uint32_t rb_blend_cntl, sp_blend_cntl;
uint32_t pipeline_color_write_enable, pipeline_blend_enable;
uint32_t color_write_enable;
bool logic_op_enabled;
bool rop_reads_dst;
enum pc_di_primtype primtype;
bool primitive_restart_enable;
/* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
struct tu_draw_state vertex_buffers;
struct tu_draw_state shader_const;
struct tu_draw_state desc_sets;
struct tu_draw_state vs_params;
/* Index buffer */
uint64_t index_va;
uint32_t max_index_count;
uint8_t index_size;
/* because streamout base has to be 32-byte aligned
* there is an extra offset to deal with when it is
* unaligned
*/
uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
/* Renderpasses are tricky, because we may need to flush differently if
* using sysmem vs. gmem and therefore we have to delay any flushing that
* happens before a renderpass. So we have to have two copies of the flush
* state, one for intra-renderpass flushes (i.e. renderpass dependencies)
* and one for outside a renderpass.
*/
struct tu_cache_state cache;
struct tu_cache_state renderpass_cache;
enum tu_cmd_ccu_state ccu_state;
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
VkRect2D render_area;
const struct tu_image_view **attachments;
/* State that in the dynamic case comes from VkRenderingInfo and needs to
* be saved/restored when suspending. This holds the state for the last
* suspended renderpass, which may point to this command buffer's dynamic_*
* or another command buffer if executed on a secondary.
*/
struct {
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
VkRect2D render_area;
const struct tu_image_view **attachments;
struct tu_lrz_state lrz;
} suspended_pass;
bool tessfactor_addr_set;
bool predication_active;
enum a5xx_line_mode line_mode;
bool z_negative_one_to_one;
/* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
* VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
* but they use the same {START,STOP}_PRIMITIVE_CTRS control.
*/
uint32_t prim_counters_running;
bool prim_generated_query_running_before_rp;
/* These are the states of the suspend/resume state machine. In addition to
* tracking whether we're in the middle of a chain of suspending and
* resuming passes that will be merged, we need to track whether the
* command buffer begins in the middle of such a chain, for when it gets
* merged with other command buffers. We call such a chain that begins
* before the command buffer starts a "pre-chain".
*
* Note that when this command buffer is finished, this state is untouched
* but it gains a different meaning. For example, if we finish in state
* SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
* there's a suspend/resume chain that extends past the end of the command
* buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
* means that there's a suspend/resume chain that extends before the
* beginning.
*/
enum {
/* Either there are no suspend/resume chains, or they are entirely
* contained in the current command buffer.
*
* BeginCommandBuffer() <- start of current command buffer
* ...
* // we are here
*/
SR_NONE = 0,
/* We are in the middle of a suspend/resume chain that starts before the
* current command buffer. This happens when the command buffer begins
* with a resuming render pass and all of the passes up to the current
* one are suspending. In this state, our part of the chain is not saved
* and is in the current draw_cs/state.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_PRE_CHAIN,
/* We are currently outside of any suspend/resume chains, but there is a
* chain starting before the current command buffer. It is saved in
* pre_chain.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* // This part is stashed in pre_chain
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* BeginRendering(resuming) ... EndRendering() // end of chain
* ...
* // we are here
*/
SR_AFTER_PRE_CHAIN,
/* We are in the middle of a suspend/resume chain and there is no chain
* starting before the current command buffer.
*
* BeginCommandBuffer() <- start of current command buffer
* ...
* BeginRendering() ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_CHAIN,
/* We are in the middle of a suspend/resume chain and there is another,
* separate, chain starting before the current command buffer.
*
* BeginRendering() ... EndRendering(suspending)
* CommandBufferBegin() <- start of current command buffer
* // This part is stashed in pre_chain
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* BeginRendering(resuming) ... EndRendering() // end of chain
* ...
* BeginRendering() ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_CHAIN_AFTER_PRE_CHAIN,
} suspend_resume;
bool suspending, resuming;
struct tu_lrz_state lrz;
struct tu_draw_state lrz_and_depth_plane_state;
struct tu_vs_params last_vs_params;
};
struct tu_cmd_pool
{
struct vk_command_pool vk;
struct list_head cmd_buffers;
struct list_head free_cmd_buffers;
};
enum tu_cmd_buffer_status
{
TU_CMD_BUFFER_STATUS_INVALID,
TU_CMD_BUFFER_STATUS_INITIAL,
TU_CMD_BUFFER_STATUS_RECORDING,
TU_CMD_BUFFER_STATUS_EXECUTABLE,
TU_CMD_BUFFER_STATUS_PENDING,
};
struct tu_cmd_buffer
{
struct vk_command_buffer vk;
struct tu_device *device;
struct tu_cmd_pool *pool;
struct list_head pool_link;
struct u_trace trace;
struct u_trace_iterator trace_renderpass_start;
struct u_trace_iterator trace_renderpass_end;
struct list_head renderpass_autotune_results;
struct tu_autotune_results_buffer* autotune_buffer;
VkCommandBufferUsageFlags usage_flags;
enum tu_cmd_buffer_status status;
VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
struct tu_cmd_state state;
uint32_t queue_family_index;
uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
VkShaderStageFlags push_constant_stages;
struct tu_descriptor_set meta_push_descriptors;
struct tu_descriptor_state descriptors[MAX_BIND_POINTS];
struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1)];
struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1)];
struct tu_render_pass dynamic_pass;
struct tu_subpass dynamic_subpass;
struct tu_framebuffer dynamic_framebuffer;
VkResult record_result;
struct tu_cs cs;
struct tu_cs draw_cs;
struct tu_cs tile_store_cs;
struct tu_cs draw_epilogue_cs;
struct tu_cs sub_cs;
/* If the first render pass in the command buffer is resuming, then it is
* part of a suspend/resume chain that starts before the current command
* buffer and needs to be merged later. In this case, its incomplete state
* is stored in pre_chain. In the symmetric case where the last render pass
* is suspending, we just skip ending the render pass and its state is
* stored in draw_cs/the current state. The first and last render pass
* might be part of different chains, which is why all the state may need
* to be saved separately here.
*/
struct {
struct tu_cs draw_cs;
struct tu_cs draw_epilogue_cs;
struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;
struct tu_render_pass_state state;
} pre_chain;
uint32_t vsc_draw_strm_pitch;
uint32_t vsc_prim_strm_pitch;
};
/* Temporary struct for tracking a register state to be written, used by
* a6xx-pack.h and tu_cs_emit_regs()
*/
struct tu_reg_value {
uint32_t reg;
uint64_t value;
bool is_address;
struct tu_bo *bo;
bool bo_write;
uint32_t bo_offset;
uint32_t bo_shift;
};
VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
VkCommandBufferUsageFlags usage_flags);
void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
struct tu_cs *cs);
void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
struct tu_cs *cs,
enum tu_cmd_ccu_state ccu_state);
void tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
const VkRenderingInfo *pRenderingInfo);
void tu_setup_dynamic_inheritance(struct tu_cmd_buffer *cmd_buffer,
const VkCommandBufferInheritanceRenderingInfo *info);
void tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
const VkRenderingInfo *pRenderingInfo);
void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *suspended);
void tu_cmd_render(struct tu_cmd_buffer *cmd);
void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
enum vgt_event_type event);
static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
VkPipelineBindPoint bind_point)
{
return &cmd_buffer->descriptors[bind_point];
}
struct tu_event
{
struct vk_object_base base;
struct tu_bo *bo;
};
struct tu_push_constant_range
{
uint32_t lo;
uint32_t dwords;
};
struct tu_shader
{
struct ir3_shader *ir3_shader;
struct tu_push_constant_range push_consts;
uint8_t active_desc_sets;
bool multi_pos_output;
};
struct tu_shader_key {
unsigned multiview_mask;
bool force_sample_interp;
enum ir3_wavesize_option api_wavesize, real_wavesize;
};
struct tu_compiled_shaders
{
struct vk_pipeline_cache_object base;
struct tu_push_constant_range push_consts[MESA_SHADER_STAGES];
uint8_t active_desc_sets;
bool multi_pos_output;
struct ir3_shader_variant *variants[MESA_SHADER_STAGES];
};
extern const struct vk_pipeline_cache_object_ops tu_shaders_ops;
bool
tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, bool *multi_pos_output,
struct tu_device *dev);
nir_shader *
tu_spirv_to_nir(struct tu_device *dev,
void *mem_ctx,
const VkPipelineShaderStageCreateInfo *stage_info,
gl_shader_stage stage);
struct tu_shader *
tu_shader_create(struct tu_device *dev,
nir_shader *nir,
const struct tu_shader_key *key,
struct tu_pipeline_layout *layout,
const VkAllocationCallbacks *alloc);
void
tu_shader_destroy(struct tu_device *dev,
struct tu_shader *shader,
const VkAllocationCallbacks *alloc);
static inline bool
tu6_shared_constants_enable(const struct tu_pipeline_layout *layout,
const struct ir3_compiler *compiler)
{
return layout->push_constant_size > 0 &&
layout->push_constant_size <= (compiler->shared_consts_size * 16);
}
struct tu_program_descriptor_linkage
{
struct ir3_const_state const_state;
uint32_t constlen;
struct tu_push_constant_range push_consts;
};
struct tu_pipeline_executable {
gl_shader_stage stage;
struct ir3_info stats;
bool is_binning;
char *nir_from_spirv;
char *nir_final;
char *disasm;
};
struct tu_pipeline
{
struct vk_object_base base;
struct tu_cs cs;
struct tu_suballoc_bo bo;
/* Separate BO for private memory since it should be GPU writable */
struct tu_bo *pvtmem_bo;
bool need_indirect_descriptor_sets;
VkShaderStageFlags active_stages;
uint32_t active_desc_sets;
/* mask of enabled dynamic states
* if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used
*/
uint32_t dynamic_state_mask;
struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
/* for dynamic states which use the same register: */
uint32_t gras_su_cntl, gras_su_cntl_mask;
uint32_t rb_depth_cntl, rb_depth_cntl_mask;
uint32_t rb_stencil_cntl, rb_stencil_cntl_mask;
uint32_t pc_raster_cntl, pc_raster_cntl_mask;
uint32_t vpc_unknown_9107, vpc_unknown_9107_mask;
uint32_t stencil_wrmask;
unsigned num_rts;
uint32_t rb_mrt_control[MAX_RTS], rb_mrt_control_mask;
uint32_t rb_mrt_blend_control[MAX_RTS];
uint32_t sp_blend_cntl, sp_blend_cntl_mask;
uint32_t rb_blend_cntl, rb_blend_cntl_mask;
uint32_t color_write_enable, blend_enable;
bool logic_op_enabled, rop_reads_dst;
bool rasterizer_discard;
bool rb_depth_cntl_disable;
enum a5xx_line_mode line_mode;
/* draw states for the pipeline */
struct tu_draw_state load_state, rast_state;
struct tu_draw_state prim_order_state_sysmem, prim_order_state_gmem;
/* for vertex buffers state */
uint32_t num_vbs;
struct tu_push_constant_range shared_consts;
struct
{
struct tu_draw_state config_state;
struct tu_draw_state state;
struct tu_draw_state binning_state;
struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
} program;
struct
{
struct tu_draw_state state;
struct tu_draw_state binning_state;
} vi;
struct
{
enum pc_di_primtype primtype;
bool primitive_restart;
} ia;
struct
{
uint32_t patch_type;
uint32_t param_stride;
bool upper_left_domain_origin;
} tess;
struct
{
uint32_t local_size[3];
uint32_t subgroup_size;
} compute;
bool provoking_vertex_last;
struct tu_lrz_pipeline lrz;
/* In other words - framebuffer fetch support */
bool raster_order_attachment_access;
bool subpass_feedback_loop_ds;
bool z_negative_one_to_one;
/* memory bandwidth cost (in bytes) for color attachments */
uint32_t color_bandwidth_per_sample;
uint32_t depth_cpp_per_sample;
uint32_t stencil_cpp_per_sample;
void *executables_mem_ctx;
/* tu_pipeline_executable */
struct util_dynarray executables;
};
struct tu_image;
void
tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value);
void
tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image);
void
tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_image *image);
void
tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
struct tu_image *image,
const VkClearDepthStencilValue *pDepthStencil,
uint32_t rangeCount,
const VkImageSubresourceRange *pRanges);
void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd,
const VkClearValue *clear_values);
void
tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd,
const VkClearValue *clear_values);
void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd);
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd);
void
tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport,
bool z_negative_one_to_one);
void
tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count);
void
tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc);
void
tu6_emit_depth_bias(struct tu_cs *cs,
float constant_factor,
float clamp,
float slope_factor);
void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
enum a5xx_line_mode line_mode);
void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);
void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);
uint32_t tu6_rb_mrt_control_rop(VkLogicOp op, bool *rop_reads_dst);
void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void tu6_apply_depth_bounds_workaround(struct tu_device *device,
uint32_t *rb_depth_cntl);
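/* Shader private ("scratch") memory configuration passed to tu6_emit_xs().
 * Descriptive sketch of the fields, inferred from their names rather than
 * spelled out here: iova is the backing BO address, per_fiber_size the
 * amount reserved for each fiber, per_sp_size the total footprint per
 * shader processor, and per_wave selects the hardware's per-wave
 * allocation mode.
 */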
struct tu_pvtmem_config {
uint64_t iova;
uint32_t per_fiber_size;
uint32_t per_sp_size;
bool per_wave;
};
void
tu6_emit_xs_config(struct tu_cs *cs,
gl_shader_stage stage,
const struct ir3_shader_variant *xs);
void
tu6_emit_xs(struct tu_cs *cs,
gl_shader_stage stage,
const struct ir3_shader_variant *xs,
const struct tu_pvtmem_config *pvtmem,
uint64_t binary_iova);
void
tu6_emit_vpc(struct tu_cs *cs,
const struct ir3_shader_variant *vs,
const struct ir3_shader_variant *hs,
const struct ir3_shader_variant *ds,
const struct ir3_shader_variant *gs,
const struct ir3_shader_variant *fs,
uint32_t patch_control_points);
void
tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs);
struct tu_image_view;
void
tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
const struct tu_image_view *src,
const struct tu_image_view *dst,
uint32_t layer_mask,
uint32_t layers,
const VkRect2D *rect);
void
tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
const VkClearValue *value);
void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
const VkClearValue *value);
void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
bool cond_exec_allowed,
bool force_load);
/* exposed so that a load can be emitted without checking LOAD_OP */
void
tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a);
/* note: gmem store can also resolve */
void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
uint32_t gmem_a,
bool cond_exec_allowed);
enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format);
struct tu_native_format
{
enum a6xx_format fmt : 8;
enum a3xx_color_swap swap : 8;
enum a6xx_tile_mode tile_mode : 8;
};
bool tu6_format_vtx_supported(VkFormat format);
struct tu_native_format tu6_format_vtx(VkFormat format);
bool tu6_format_color_supported(enum pipe_format format);
struct tu_native_format tu6_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode);
bool tu6_format_texture_supported(enum pipe_format format);
struct tu_native_format tu6_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode);
static inline enum a6xx_format
tu6_base_format(enum pipe_format format)
{
/* note: tu6_format_color doesn't care about tiling for the .fmt field */
return tu6_format_color(format, TILE6_LINEAR).fmt;
}
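/* Illustrative sketch (not driver code) of how the format helpers above
 * compose when translating a Vulkan color format for a linear render target;
 * the format chosen is only an example:
 *
 *    enum pipe_format pfmt = tu_vk_format_to_pipe_format(VK_FORMAT_B8G8R8A8_UNORM);
 *    if (tu6_format_color_supported(pfmt)) {
 *       struct tu_native_format nfmt = tu6_format_color(pfmt, TILE6_LINEAR);
 *       // nfmt.fmt, nfmt.swap and nfmt.tile_mode are what get programmed
 *       // into the color buffer registers.
 *    }
 */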
struct tu_image
{
struct vk_image vk;
struct fdl_layout layout[3];
uint32_t total_size;
#ifdef ANDROID
/* For VK_ANDROID_native_buffer, the WSI image owns the memory. */
VkDeviceMemory owned_memory;
#endif
/* Set when bound */
struct tu_bo *bo;
uint64_t iova;
uint32_t lrz_height;
uint32_t lrz_pitch;
uint32_t lrz_offset;
uint32_t lrz_fc_offset;
uint32_t lrz_fc_size;
};
uint32_t tu6_plane_count(VkFormat format);
enum pipe_format tu6_plane_format(VkFormat format, uint32_t plane);
uint32_t tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask);
enum pipe_format tu_format_for_aspect(enum pipe_format format,
VkImageAspectFlags aspect_mask);
struct tu_image_view
{
struct vk_image_view vk;
struct tu_image *image; /**< VkImageViewCreateInfo::image */
struct fdl6_view view;
/* for d32s8 separate depth */
uint64_t depth_base_addr;
uint32_t depth_layer_size;
uint32_t depth_PITCH;
/* for d32s8 separate stencil */
uint64_t stencil_base_addr;
uint32_t stencil_layer_size;
uint32_t stencil_PITCH;
};
struct tu_sampler_ycbcr_conversion {
struct vk_object_base base;
VkFormat format;
VkSamplerYcbcrModelConversion ycbcr_model;
VkSamplerYcbcrRange ycbcr_range;
VkComponentMapping components;
VkChromaLocation chroma_offsets[2];
VkFilter chroma_filter;
};
struct tu_sampler {
struct vk_object_base base;
uint32_t descriptor[A6XX_TEX_SAMP_DWORDS];
struct tu_sampler_ycbcr_conversion *ycbcr_sampler;
};
void
tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer);
void
tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src);
void
tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer);
void
tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
void
tu_cs_image_depth_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
#define tu_image_view_stencil(iview, x) \
((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT))
#define tu_image_view_depth(iview, x) \
((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_32_FLOAT))
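/* Usage sketch (assumed; the register name is only an example): these macros
 * take one of the pre-packed register values stored in iview->view and
 * rewrite just its COLOR_FORMAT field, so the stencil or depth plane of a
 * combined depth/stencil image can be treated as a color surface, e.g.
 *
 *    tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO));
 */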
VkResult
tu_gralloc_info(struct tu_device *device,
const VkNativeBufferANDROID *gralloc_info,
int *dma_buf,
uint64_t *modifier);
VkResult
tu_import_memory_from_gralloc_handle(VkDevice device_h,
int dma_buf,
const VkAllocationCallbacks *alloc,
VkImage image_h);
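/* Whether an image with the given parameters can use a tiled layout at all,
 * and whether it can additionally be UBWC (bandwidth) compressed. */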
bool
tiling_possible(VkFormat format);
bool
ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage,
const struct fd_dev_info *info, VkSampleCountFlagBits samples,
bool use_z24uint_s8uint);
struct tu_buffer_view
{
struct vk_object_base base;
uint32_t descriptor[A6XX_TEX_CONST_DWORDS];
struct tu_buffer *buffer;
};
void
tu_buffer_view_init(struct tu_buffer_view *view,
struct tu_device *device,
const VkBufferViewCreateInfo *pCreateInfo);
#define PERF_CNTRS_REG 4
struct tu_perf_query_data
{
uint32_t gid; /* group-id */
uint32_t cid; /* countable-id within the group */
uint32_t cntr_reg; /* counter register within the group */
uint32_t pass; /* pass index in which the countable can be requested */
uint32_t app_idx; /* index provided by the application */
};
struct tu_query_pool
{
struct vk_object_base base;
VkQueryType type;
uint32_t stride;
uint64_t size;
uint32_t pipeline_statistics;
struct tu_bo *bo;
/* For performance queries */
const struct fd_perfcntr_group *perf_group;
uint32_t perf_group_count;
uint32_t counter_index_count;
struct tu_perf_query_data perf_query_data[0];
};
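/* perf_query_data is a trailing array with counter_index_count entries, so a
 * performance query pool must be allocated with extra space. Rough sketch of
 * the allocation (n_counters is a placeholder; the authoritative code is in
 * the query-pool creation path):
 *
 *    pool = vk_object_zalloc(&device->vk, pAllocator,
 *                            sizeof(*pool) +
 *                               n_counters * sizeof(pool->perf_query_data[0]),
 *                            VK_OBJECT_TYPE_QUERY_POOL);
 */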
uint32_t
tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index);
void
tu_update_descriptor_sets(const struct tu_device *device,
VkDescriptorSet overrideSet,
uint32_t descriptorWriteCount,
const VkWriteDescriptorSet *pDescriptorWrites,
uint32_t descriptorCopyCount,
const VkCopyDescriptorSet *pDescriptorCopies);
void
tu_update_descriptor_set_with_template(
const struct tu_device *device,
struct tu_descriptor_set *set,
VkDescriptorUpdateTemplate descriptorUpdateTemplate,
const void *pData);
VkResult
tu_physical_device_init(struct tu_physical_device *device,
struct tu_instance *instance);
VkResult
tu_enumerate_devices(struct tu_instance *instance);
int
tu_device_get_gpu_timestamp(struct tu_device *dev,
uint64_t *ts);
int
tu_device_get_suspend_count(struct tu_device *dev,
uint64_t *suspend_count);
int
tu_drm_submitqueue_new(const struct tu_device *dev,
int priority,
uint32_t *queue_id);
void
tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id);
int
tu_syncobj_to_fd(struct tu_device *device, struct vk_sync *sync);
VkResult
tu_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit);
void
tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
void *ts_from, uint32_t from_offset,
void *ts_to, uint32_t to_offset,
uint32_t count);
VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
struct u_trace **trace_copy);
/* If we copy the trace and timestamps, we will have to free them afterwards. */
struct tu_u_trace_cmd_data
{
struct tu_cs *timestamp_copy_cs;
struct u_trace *trace;
};
/* Data necessary to retrieve timestamps and to clean up all
 * associated resources afterwards.
 */
struct tu_u_trace_submission_data
{
uint32_t submission_id;
/* We have to know when the timestamps are available; this sync
 * object signals that.
 */
struct tu_u_trace_syncobj *syncobj;
uint32_t cmd_buffer_count;
uint32_t last_buffer_with_tracepoints;
struct tu_u_trace_cmd_data *cmd_trace_data;
};
VkResult
tu_u_trace_submission_data_create(
struct tu_device *device,
struct tu_cmd_buffer **cmd_buffers,
uint32_t cmd_buffer_count,
struct tu_u_trace_submission_data **submission_data);
void
tu_u_trace_submission_data_finish(
struct tu_device *device,
struct tu_u_trace_submission_data *submission_data);
void
tu_breadcrumbs_init(struct tu_device *device);
void
tu_breadcrumbs_finish(struct tu_device *device);
#define TU_FROM_HANDLE(__tu_type, __name, __handle) \
VK_FROM_HANDLE(__tu_type, __name, __handle)
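/* Typical usage in an entrypoint (sketch; tu_CmdExample is hypothetical):
 *
 *    VKAPI_ATTR void VKAPI_CALL
 *    tu_CmdExample(VkCommandBuffer commandBuffer)
 *    {
 *       TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
 *       // 'cmd' is now the driver-private struct tu_cmd_buffer pointer.
 *    }
 */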
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
VK_OBJECT_TYPE_COMMAND_BUFFER)
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
VK_OBJECT_TYPE_INSTANCE)
VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
VK_OBJECT_TYPE_PHYSICAL_DEVICE)
VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool,
VK_OBJECT_TYPE_COMMAND_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer,
VK_OBJECT_TYPE_BUFFER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, base, VkBufferView,
VK_OBJECT_TYPE_BUFFER_VIEW)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, base, VkDescriptorPool,
VK_OBJECT_TYPE_DESCRIPTOR_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, base, VkDescriptorSet,
VK_OBJECT_TYPE_DESCRIPTOR_SET)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, base,
VkDescriptorSetLayout,
VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, base,
VkDescriptorUpdateTemplate,
VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory,
VK_OBJECT_TYPE_DEVICE_MEMORY)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
VK_OBJECT_TYPE_FRAMEBUFFER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image, vk.base, VkImage, VK_OBJECT_TYPE_IMAGE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, vk.base, VkImageView,
VK_OBJECT_TYPE_IMAGE_VIEW)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, base, VkPipelineCache,
VK_OBJECT_TYPE_PIPELINE_CACHE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, base, VkPipeline,
VK_OBJECT_TYPE_PIPELINE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout,
VK_OBJECT_TYPE_PIPELINE_LAYOUT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, base, VkQueryPool,
VK_OBJECT_TYPE_QUERY_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, base, VkRenderPass,
VK_OBJECT_TYPE_RENDER_PASS)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler,
VK_OBJECT_TYPE_SAMPLER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, base, VkSamplerYcbcrConversion,
VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION)
/* for TU_FROM_HANDLE with both VkFence and VkSemaphore: */
#define tu_syncobj_from_handle(x) ((struct tu_syncobj*) (uintptr_t) (x))
void
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);
#endif /* TU_PRIVATE_H */