/*
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
*
* based in part on anv driver which is:
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef TU_PRIVATE_H
#define TU_PRIVATE_H
#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_VALGRIND
#include <memcheck.h>
#include <valgrind.h>
#define VG(x) x
#else
#define VG(x) ((void)0)
#endif
#define MESA_LOG_TAG "TU"
#include "c11/threads.h"
#include "util/rounding.h"
#include "util/bitscan.h"
#include "util/list.h"
#include "util/log.h"
#include "util/macros.h"
#include "util/sparse_array.h"
#include "util/u_atomic.h"
#include "util/u_dynarray.h"
#include "util/xmlconfig.h"
#include "util/perf/u_trace.h"
#include "vk_alloc.h"
#include "vk_debug_report.h"
#include "vk_device.h"
#include "vk_dispatch_table.h"
#include "vk_extensions.h"
#include "vk_instance.h"
#include "vk_log.h"
#include "vk_physical_device.h"
#include "vk_shader_module.h"
#include "vk_pipeline_cache.h"
#include "wsi_common.h"
#include "ir3/ir3_compiler.h"
#include "ir3/ir3_shader.h"
#include "adreno_common.xml.h"
#include "adreno_pm4.xml.h"
#include "a6xx.xml.h"
#include "fdl/freedreno_layout.h"
#include "common/freedreno_dev_info.h"
#include "perfcntrs/freedreno_perfcntr.h"
#include "tu_descriptor_set.h"
#include "tu_autotune.h"
#include "tu_util.h"
#include "tu_perfetto.h"
/* Pre-declarations needed for WSI entrypoints */
struct wl_surface;
struct wl_display;
typedef struct xcb_connection_t xcb_connection_t;
typedef uint32_t xcb_visualid_t;
typedef uint32_t xcb_window_t;
#include <vulkan/vk_android_native_buffer.h>
#include <vulkan/vk_icd.h>
#include <vulkan/vulkan.h>
#include "tu_entrypoints.h"
#include "vulkan/runtime/vk_common_entrypoints.h"
#include "vk_format.h"
#include "vk_image.h"
#include "vk_command_buffer.h"
#include "vk_command_pool.h"
#include "vk_queue.h"
#include "vk_object.h"
#include "vk_sync.h"
#include "vk_drm_syncobj.h"
#include "vk_sync_timeline.h"
#define MAX_VBS 32
#define MAX_VERTEX_ATTRIBS 32
#define MAX_RTS 8
#define MAX_VSC_PIPES 32
#define MAX_VIEWPORTS 16
#define MAX_VIEWPORT_SIZE (1 << 14)
#define MAX_SCISSORS 16
#define MAX_DISCARD_RECTANGLES 4
#define MAX_PUSH_CONSTANTS_SIZE 256
#define MAX_PUSH_DESCRIPTORS 32
#define MAX_DYNAMIC_UNIFORM_BUFFERS 16
#define MAX_DYNAMIC_STORAGE_BUFFERS 8
#define MAX_DYNAMIC_BUFFERS_SIZE \
((MAX_DYNAMIC_UNIFORM_BUFFERS + 2 * MAX_DYNAMIC_STORAGE_BUFFERS) * \
A6XX_TEX_CONST_DWORDS)
#define TU_MAX_DRM_DEVICES 8
#define MAX_VIEWS 16
#define MAX_BIND_POINTS 2 /* compute + graphics */
/* The Qualcomm driver exposes 0x20000058 */
#define MAX_STORAGE_BUFFER_RANGE 0x20000000
/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so
* expose the same maximum range.
* TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual
* range might be higher.
*/
#define MAX_UNIFORM_BUFFER_RANGE 0x10000
#define A6XX_TEX_CONST_DWORDS 16
#define A6XX_TEX_SAMP_DWORDS 4
#define COND(bool, val) ((bool) ? (val) : 0)
#define BIT(bit) (1u << (bit))
/* Whenever we generate an error, pass it through this function. Useful for
* debugging, where we can break on it. Only call at error site, not when
* propagating errors. Might be useful to plug in a stack trace here.
*/
struct tu_instance;
struct breadcrumbs_context;
VkResult
__vk_startup_errorf(struct tu_instance *instance,
VkResult error,
bool force_print,
const char *file,
int line,
const char *format,
...) PRINTFLIKE(6, 7);
/* Prints startup errors if TU_DEBUG=startup is set or on a debug driver
* build.
*/
#define vk_startup_errorf(instance, error, format, ...) \
__vk_startup_errorf(instance, error, \
instance->debug_flags & TU_DEBUG_STARTUP, \
__FILE__, __LINE__, format, ##__VA_ARGS__)
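/* Example (sketch, arguments are illustrative): a startup failure can be
 * reported and propagated in one step, since __vk_startup_errorf() returns
 * the VkResult it was given:
 *
 *    return vk_startup_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
 *                             "device %s is unsupported", name);
 *
 * The message is only printed when TU_DEBUG=startup is set or on a debug
 * driver build.
 */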
void
__tu_finishme(const char *file, int line, const char *format, ...)
PRINTFLIKE(3, 4);
/**
* Print a FINISHME message, including its source location.
*/
#define tu_finishme(format, ...) \
do { \
static bool reported = false; \
if (!reported) { \
__tu_finishme(__FILE__, __LINE__, format, ##__VA_ARGS__); \
reported = true; \
} \
} while (0)
#define tu_stub() \
do { \
tu_finishme("stub %s", __func__); \
} while (0)
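/* Example usage (sketch, arguments are illustrative):
 *
 *    tu_finishme("unsupported query type %d", pCreateInfo->queryType);
 *
 * prints the message once per call site together with its source location,
 * while an unimplemented entry point can simply call tu_stub() to report its
 * own function name.
 */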
struct tu_memory_heap {
/* Standard bits passed on to the client */
VkDeviceSize size;
VkMemoryHeapFlags flags;
/** Copied from ANV:
*
* Driver-internal book-keeping.
*
* Align it to 64 bits to make atomic operations faster on 32 bit platforms.
*/
VkDeviceSize used __attribute__ ((aligned (8)));
};
uint64_t
tu_get_system_heap_size(void);
struct tu_physical_device
{
struct vk_physical_device vk;
struct tu_instance *instance;
const char *name;
uint8_t driver_uuid[VK_UUID_SIZE];
uint8_t device_uuid[VK_UUID_SIZE];
uint8_t cache_uuid[VK_UUID_SIZE];
struct wsi_device wsi_device;
int local_fd;
bool has_local;
int64_t local_major;
int64_t local_minor;
int master_fd;
bool has_master;
int64_t master_major;
int64_t master_minor;
uint32_t gmem_size;
uint64_t gmem_base;
uint32_t ccu_offset_gmem;
uint32_t ccu_offset_bypass;
struct fd_dev_id dev_id;
const struct fd_dev_info *info;
int msm_major_version;
int msm_minor_version;
/* Address space and global fault count for this local_fd with DRM backend */
uint64_t fault_count;
struct tu_memory_heap heap;
struct vk_sync_type syncobj_type;
struct vk_sync_timeline_type timeline_type;
const struct vk_sync_type *sync_types[3];
};
enum tu_debug_flags
{
TU_DEBUG_STARTUP = 1 << 0,
TU_DEBUG_NIR = 1 << 1,
TU_DEBUG_NOBIN = 1 << 3,
TU_DEBUG_SYSMEM = 1 << 4,
TU_DEBUG_FORCEBIN = 1 << 5,
TU_DEBUG_NOUBWC = 1 << 6,
TU_DEBUG_NOMULTIPOS = 1 << 7,
TU_DEBUG_NOLRZ = 1 << 8,
TU_DEBUG_PERFC = 1 << 9,
TU_DEBUG_FLUSHALL = 1 << 10,
TU_DEBUG_SYNCDRAW = 1 << 11,
TU_DEBUG_DONT_CARE_AS_LOAD = 1 << 12,
TU_DEBUG_GMEM = 1 << 13,
TU_DEBUG_RAST_ORDER = 1 << 14,
TU_DEBUG_UNALIGNED_STORE = 1 << 15,
TU_DEBUG_LAYOUT = 1 << 16,
TU_DEBUG_LOG_SKIP_GMEM_OPS = 1 << 17,
TU_DEBUG_PERF = 1 << 18,
TU_DEBUG_NOLRZFC = 1 << 19,
TU_DEBUG_DYNAMIC = 1 << 20,
};
struct tu_instance
{
struct vk_instance vk;
uint32_t api_version;
int physical_device_count;
struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES];
struct driOptionCache dri_options;
struct driOptionCache available_dri_options;
enum tu_debug_flags debug_flags;
};
VkResult
tu_wsi_init(struct tu_physical_device *physical_device);
void
tu_wsi_finish(struct tu_physical_device *physical_device);
bool
tu_instance_extension_supported(const char *name);
uint32_t
tu_physical_device_api_version(struct tu_physical_device *dev);
bool
tu_physical_device_extension_supported(struct tu_physical_device *dev,
const char *name);
enum tu_bo_alloc_flags
{
TU_BO_ALLOC_NO_FLAGS = 0,
TU_BO_ALLOC_ALLOW_DUMP = 1 << 0,
TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1,
};
struct cache_entry;
struct tu_pipeline_cache
{
struct vk_object_base base;
struct tu_device *device;
pthread_mutex_t mutex;
uint32_t total_size;
uint32_t table_size;
uint32_t kernel_count;
struct cache_entry **hash_table;
bool modified;
VkAllocationCallbacks alloc;
};
struct tu_pipeline_key
{
};
/* queue types */
#define TU_QUEUE_GENERAL 0
#define TU_MAX_QUEUE_FAMILIES 1
/* Keep tu_syncobj until porting to common code for kgsl too */
#ifdef TU_USE_KGSL
struct tu_syncobj;
#endif
struct tu_u_trace_syncobj;
/* tu_timeline_sync is a drm-syncobj-based point type for vk_sync_timeline.
 * The handling logic is mostly copied from anv_bo_sync, since it can be used
 * in a similar way here.
 */
enum tu_timeline_sync_state {
/** Indicates that this is a new (or newly reset) fence */
TU_TIMELINE_SYNC_STATE_RESET,
/** Indicates that this fence has been submitted to the GPU but is still
* (as far as we know) in use by the GPU.
*/
TU_TIMELINE_SYNC_STATE_SUBMITTED,
TU_TIMELINE_SYNC_STATE_SIGNALED,
};
struct tu_timeline_sync {
struct vk_sync base;
enum tu_timeline_sync_state state;
uint32_t syncobj;
};
struct tu_queue
{
struct vk_queue vk;
struct tu_device *device;
uint32_t msm_queue_id;
int fence;
};
struct tu_bo
{
uint32_t gem_handle;
uint64_t size;
uint64_t iova;
void *map;
int32_t refcnt;
#ifndef TU_USE_KGSL
uint32_t bo_list_idx;
#endif
bool implicit_sync : 1;
};
/* externally-synchronized BO suballocator. */
struct tu_suballocator
{
struct tu_device *dev;
uint32_t default_size;
enum tu_bo_alloc_flags flags;
/** Current BO we're suballocating out of. */
struct tu_bo *bo;
uint32_t next_offset;
/** Optional BO cached for recycling as the next suballoc->bo, instead of having to allocate one. */
struct tu_bo *cached_bo;
};
struct tu_suballoc_bo
{
struct tu_bo *bo;
uint64_t iova;
uint32_t size; /* bytes */
};
void
tu_bo_suballocator_init(struct tu_suballocator *suballoc,
struct tu_device *dev,
uint32_t default_size,
uint32_t flags);
void
tu_bo_suballocator_finish(struct tu_suballocator *suballoc);
VkResult
tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo, struct tu_suballocator *suballoc,
uint32_t size, uint32_t align);
void *
tu_suballoc_bo_map(struct tu_suballoc_bo *bo);
void
tu_suballoc_bo_free(struct tu_suballocator *suballoc, struct tu_suballoc_bo *bo);
enum global_shader {
GLOBAL_SH_VS_BLIT,
GLOBAL_SH_VS_CLEAR,
GLOBAL_SH_FS_BLIT,
GLOBAL_SH_FS_BLIT_ZSCALE,
GLOBAL_SH_FS_COPY_MS,
GLOBAL_SH_FS_CLEAR0,
GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
GLOBAL_SH_COUNT,
};
/**
* Tracks the results from an individual renderpass. Initially created
* per renderpass, and appended to the tail of at->pending_results. At a later
* time, when the GPU has finished writing the results, we fill samples_passed.
*/
struct tu_renderpass_result {
/* Points into GPU memory */
struct tu_renderpass_samples* samples;
struct tu_suballoc_bo bo;
/*
* Below here, only used internally within autotune
*/
uint64_t rp_key;
struct tu_renderpass_history *history;
struct list_head node;
uint32_t fence;
uint64_t samples_passed;
};
#define TU_BORDER_COLOR_COUNT 4096
#define TU_BORDER_COLOR_BUILTIN 6
#define TU_BLIT_SHADER_SIZE 1024
/* This struct defines the layout of the global_bo */
struct tu6_global
{
/* clear/blit shaders */
uint32_t shaders[TU_BLIT_SHADER_SIZE];
uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */
uint32_t _pad0;
volatile uint32_t vsc_draw_overflow;
uint32_t _pad1;
volatile uint32_t vsc_prim_overflow;
uint32_t _pad2;
uint64_t predicate;
/* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, starts on a 32-byte boundary. */
struct {
uint32_t offset;
uint32_t pad[7];
} flush_base[4];
ALIGN16 uint32_t cs_indirect_xyz[3];
volatile uint32_t vtx_stats_query_not_running;
/* To know when renderpass stats for autotune are valid */
volatile uint32_t autotune_fence;
/* For recycling command buffers for dynamic suspend/resume commands */
volatile uint32_t dynamic_rendering_fence;
volatile uint32_t dbg_one;
volatile uint32_t dbg_gmem_total_loads;
volatile uint32_t dbg_gmem_taken_loads;
volatile uint32_t dbg_gmem_total_stores;
volatile uint32_t dbg_gmem_taken_stores;
/* Written from GPU */
volatile uint32_t breadcrumb_gpu_sync_seqno;
uint32_t _pad3;
/* Written from CPU, acknowledges value written from GPU */
volatile uint32_t breadcrumb_cpu_sync_seqno;
uint32_t _pad4;
/* note: larger global bo will be used for customBorderColors */
struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
};
#define gb_offset(member) offsetof(struct tu6_global, member)
#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member))
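/* Example (sketch): computing the GPU address of a tu6_global member for use
 * in a command stream packet, e.g. the VSC draw overflow flag:
 *
 *    uint64_t overflow_iova = global_iova(cmd, vsc_draw_overflow);
 *
 * gb_offset() gives the byte offset within the global BO and global_iova()
 * adds the global_bo iova from the command buffer's device.
 */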
/* extra space in vsc draw/prim streams */
#define VSC_PAD 0x40
struct tu_device
{
struct vk_device vk;
struct tu_instance *instance;
struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
int queue_count[TU_MAX_QUEUE_FAMILIES];
struct tu_physical_device *physical_device;
int fd;
struct ir3_compiler *compiler;
/* Backup in-memory cache to be used if the app doesn't provide one */
struct vk_pipeline_cache *mem_cache;
#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */
/* Currently the kernel driver uses a 32-bit GPU address space, but it
* should be impossible to go beyond 48 bits.
*/
struct {
struct tu_bo *bo;
mtx_t construct_mtx;
bool initialized;
} scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];
struct tu_bo *global_bo;
uint32_t implicit_sync_bo_count;
/* Device-global BO suballocator for reducing BO management overhead for
* (read-only) pipeline state. Synchronized by pipeline_mutex.
*/
struct tu_suballocator pipeline_suballoc;
mtx_t pipeline_mutex;
/* Device-global BO suballocator for reducing BO management overhead for small
* gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
*/
struct tu_suballocator autotune_suballoc;
mtx_t autotune_mutex;
/* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
/* Lazily allocated, protected by the device mutex. */
struct tu_bo *tess_bo;
struct ir3_shader_variant *global_shader_variants[GLOBAL_SH_COUNT];
struct ir3_shader *global_shaders[GLOBAL_SH_COUNT];
uint64_t global_shader_va[GLOBAL_SH_COUNT];
uint32_t vsc_draw_strm_pitch;
uint32_t vsc_prim_strm_pitch;
BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
mtx_t mutex;
/* bo list for submits: */
struct drm_msm_gem_submit_bo *bo_list;
/* map bo handles to bo list index: */
uint32_t bo_count, bo_list_size;
mtx_t bo_mutex;
/* protects creation/freeing of imported BOs */
struct u_rwlock dma_bo_lock;
/* This array holds all our 'struct tu_bo' allocations. We use this
* so we can add a refcount to our BOs and check if a particular BO
* was already allocated in this device using its GEM handle. This is
* necessary to properly manage BO imports, because the kernel doesn't
* refcount the underlying BO memory.
*
* Specifically, when self-importing (i.e. importing a BO into the same
* device that created it), the kernel will give us the same BO handle
* for both BOs and we must only free it once when both references are
* freed. Otherwise, if we are not self-importing, we get two different BO
* handles, and we want to free each one individually.
*
* The refcount is also useful for being able to maintain BOs across
* VK object lifetimes, such as pipelines suballocating out of BOs
* allocated on the device.
*/
struct util_sparse_array bo_map;
/* Command streams to set pass index to a scratch reg */
struct tu_cs *perfcntrs_pass_cs;
struct tu_cs_entry *perfcntrs_pass_cs_entries;
struct util_dynarray dynamic_rendering_pending;
VkCommandPool dynamic_rendering_pool;
uint32_t dynamic_rendering_fence;
/* Condition variable for timeline semaphore to notify waiters when a
* new submit is executed. */
pthread_cond_t timeline_cond;
pthread_mutex_t submit_mutex;
struct tu_autotune autotune;
struct breadcrumbs_context *breadcrumbs_ctx;
#ifdef ANDROID
const void *gralloc;
enum {
TU_GRALLOC_UNKNOWN,
TU_GRALLOC_CROS,
TU_GRALLOC_OTHER,
} gralloc_type;
#endif
uint32_t submit_count;
struct u_trace_context trace_context;
#ifdef HAVE_PERFETTO
struct tu_perfetto_state perfetto;
#endif
bool use_z24uint_s8uint;
};
void tu_init_clear_blit_shaders(struct tu_device *dev);
void tu_destroy_clear_blit_shaders(struct tu_device *dev);
VkResult tu_init_dynamic_rendering(struct tu_device *dev);
void tu_destroy_dynamic_rendering(struct tu_device *dev);
VkResult tu_insert_dynamic_cmdbufs(struct tu_device *dev,
struct tu_cmd_buffer ***cmds_ptr,
uint32_t *size);
VkResult
tu_device_submit_deferred_locked(struct tu_device *dev);
VkResult
tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj);
uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
VkResult
tu_device_check_status(struct vk_device *vk_device);
VkResult
tu_bo_init_new(struct tu_device *dev, struct tu_bo **bo, uint64_t size,
enum tu_bo_alloc_flags flags);
VkResult
tu_bo_init_dmabuf(struct tu_device *dev,
struct tu_bo **bo,
uint64_t size,
int fd);
int
tu_bo_export_dmabuf(struct tu_device *dev, struct tu_bo *bo);
void
tu_bo_finish(struct tu_device *dev, struct tu_bo *bo);
VkResult
tu_bo_map(struct tu_device *dev, struct tu_bo *bo);
static inline struct tu_bo *
tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
{
return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
}
static inline struct tu_bo *
tu_bo_get_ref(struct tu_bo *bo)
{
p_atomic_inc(&bo->refcnt);
return bo;
}
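/* Example (sketch): how bo_map lookups and the refcount are combined when a
 * GEM handle may already be known to this device (the self-import case
 * described in the tu_device::bo_map comment above):
 *
 *    struct tu_bo *bo = tu_device_lookup_bo(dev, gem_handle);
 *    if (bo->refcnt != 0) {
 *       tu_bo_get_ref(bo);   // handle already tracked: just add a reference
 *    } else {
 *       // first time this handle is seen: initialize the tu_bo entry
 *    }
 */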
/* Get a scratch bo for use inside a command buffer. This will always return
* the same bo given the same size or similar sizes, so only one scratch bo
* can be used at the same time. It's meant for short-lived things where we
* need to write to some piece of memory, read from it, and then immediately
* discard it.
*/
VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
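/* Example (sketch): using a scratch BO as a short-lived GPU-visible buffer:
 *
 *    struct tu_bo *scratch;
 *    VkResult result = tu_get_scratch_bo(dev, size, &scratch);
 *    if (result != VK_SUCCESS)
 *       return result;
 *    // ... emit packets that write to and then read from scratch->iova ...
 *
 * Since similar sizes map to the same BO, two users needing scratch space at
 * the same time would alias each other, hence the "short-lived" restriction
 * described above.
 */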
struct tu_cs_entry
{
/* No ownership */
const struct tu_bo *bo;
uint32_t size;
uint32_t offset;
};
struct tu_cs_memory {
uint32_t *map;
uint64_t iova;
};
struct tu_draw_state {
uint64_t iova : 48;
uint32_t size : 16;
};
enum tu_dynamic_state
{
/* re-use VK_DYNAMIC_STATE_ enums for non-extended dynamic states */
TU_DYNAMIC_STATE_SAMPLE_LOCATIONS = VK_DYNAMIC_STATE_STENCIL_REFERENCE + 1,
TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
TU_DYNAMIC_STATE_RB_STENCIL_CNTL,
TU_DYNAMIC_STATE_VB_STRIDE,
TU_DYNAMIC_STATE_RASTERIZER_DISCARD,
TU_DYNAMIC_STATE_BLEND,
TU_DYNAMIC_STATE_COUNT,
/* no associated draw state: */
TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY = TU_DYNAMIC_STATE_COUNT,
TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE,
TU_DYNAMIC_STATE_LOGIC_OP,
TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE,
/* re-use the line width enum as it uses GRAS_SU_CNTL: */
TU_DYNAMIC_STATE_GRAS_SU_CNTL = VK_DYNAMIC_STATE_LINE_WIDTH,
};
enum tu_draw_state_group_id
{
TU_DRAW_STATE_PROGRAM_CONFIG,
TU_DRAW_STATE_PROGRAM,
TU_DRAW_STATE_PROGRAM_BINNING,
TU_DRAW_STATE_VB,
TU_DRAW_STATE_VI,
TU_DRAW_STATE_VI_BINNING,
TU_DRAW_STATE_RAST,
TU_DRAW_STATE_CONST,
TU_DRAW_STATE_DESC_SETS,
TU_DRAW_STATE_DESC_SETS_LOAD,
TU_DRAW_STATE_VS_PARAMS,
TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
TU_DRAW_STATE_PRIM_MODE_GMEM,
TU_DRAW_STATE_PRIM_MODE_SYSMEM,
/* dynamic state related draw states */
TU_DRAW_STATE_DYNAMIC,
TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};
enum tu_cs_mode
{
/*
* A command stream in TU_CS_MODE_GROW mode grows automatically whenever it
* is full. tu_cs_begin must be called before command packet emission and
* tu_cs_end must be called after.
*
* This mode may create multiple entries internally. The entries must be
* submitted together.
*/
TU_CS_MODE_GROW,
/*
* A command stream in TU_CS_MODE_EXTERNAL mode wraps an external,
* fixed-size buffer. tu_cs_begin and tu_cs_end are optional and have no
* effect on it.
*
* This mode does not create any entry or any BO.
*/
TU_CS_MODE_EXTERNAL,
/*
* A command stream in TU_CS_MODE_SUB_STREAM mode does not support direct
* command packet emission. tu_cs_begin_sub_stream must be called to get a
* sub-stream to emit command packets to. When done with the sub-stream,
* tu_cs_end_sub_stream must be called.
*
* This mode does not create any entry internally.
*/
TU_CS_MODE_SUB_STREAM,
};
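/* Example (sketch; the exact helper signatures live in tu_cs.h): emitting
 * into a TU_CS_MODE_GROW stream versus carving a sub-stream out of a
 * TU_CS_MODE_SUB_STREAM stream:
 *
 *    // GROW: bracket packet emission with begin/end and submit the entries
 *    tu_cs_begin(cs);
 *    ... emit command packets ...
 *    tu_cs_end(cs);
 *
 *    // SUB_STREAM: reserve a fixed-size sub-stream, emit, then close it
 *    struct tu_cs sub;
 *    tu_cs_begin_sub_stream(&cmd->sub_cs, size, &sub);
 *    ... emit command packets into &sub ...
 *    struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &sub);
 */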
#define TU_COND_EXEC_STACK_SIZE 4
struct tu_cs
{
uint32_t *start;
uint32_t *cur;
uint32_t *reserved_end;
uint32_t *end;
struct tu_device *device;
enum tu_cs_mode mode;
uint32_t next_bo_size;
struct tu_cs_entry *entries;
uint32_t entry_count;
uint32_t entry_capacity;
struct tu_bo **bos;
uint32_t bo_count;
uint32_t bo_capacity;
/* Optional BO that this CS is sub-allocated from for TU_CS_MODE_SUB_STREAM */
struct tu_bo *refcount_bo;
/* state for cond_exec_start/cond_exec_end */
uint32_t cond_stack_depth;
uint32_t cond_flags[TU_COND_EXEC_STACK_SIZE];
uint32_t *cond_dwords[TU_COND_EXEC_STACK_SIZE];
uint32_t breadcrumb_emit_after;
};
struct tu_device_memory
{
struct vk_object_base base;
struct tu_bo *bo;
};
struct tu_descriptor_range
{
uint64_t va;
uint32_t size;
};
struct tu_descriptor_set
{
struct vk_object_base base;
/* Link to descriptor pool's desc_sets list. */
struct list_head pool_link;
struct tu_descriptor_set_layout *layout;
struct tu_descriptor_pool *pool;
uint32_t size;
uint64_t va;
uint32_t *mapped_ptr;
uint32_t *dynamic_descriptors;
};
struct tu_descriptor_pool_entry
{
uint32_t offset;
uint32_t size;
struct tu_descriptor_set *set;
};
struct tu_descriptor_pool
{
struct vk_object_base base;
struct tu_bo *bo;
uint64_t current_offset;
uint64_t size;
uint8_t *host_memory_base;
uint8_t *host_memory_ptr;
uint8_t *host_memory_end;
uint8_t *host_bo;
struct list_head desc_sets;
uint32_t entry_count;
uint32_t max_entry_count;
struct tu_descriptor_pool_entry entries[0];
};
struct tu_descriptor_update_template_entry
{
VkDescriptorType descriptor_type;
/* The number of descriptors to update */
uint32_t descriptor_count;
/* Into mapped_ptr or dynamic_descriptors, in units of the respective array
*/
uint32_t dst_offset;
/* In dwords. Not valid/used for dynamic descriptors */
uint32_t dst_stride;
uint32_t buffer_offset;
/* Only valid for combined image samplers and samplers */
uint16_t has_sampler;
/* In bytes */
size_t src_offset;
size_t src_stride;
/* For push descriptors */
const struct tu_sampler *immutable_samplers;
};
struct tu_descriptor_update_template
{
struct vk_object_base base;
uint32_t entry_count;
VkPipelineBindPoint bind_point;
struct tu_descriptor_update_template_entry entry[0];
};
struct tu_buffer
{
struct vk_object_base base;
VkDeviceSize size;
VkBufferUsageFlags usage;
VkBufferCreateFlags flags;
struct tu_bo *bo;
uint64_t iova;
};
const char *
tu_get_debug_option_name(int id);
const char *
tu_get_perftest_option_name(int id);
struct tu_attachment_info
{
struct tu_image_view *attachment;
};
struct tu_framebuffer
{
struct vk_object_base base;
uint32_t width;
uint32_t height;
uint32_t layers;
/* size of the first tile */
VkExtent2D tile0;
/* number of tiles */
VkExtent2D tile_count;
/* size of the first VSC pipe */
VkExtent2D pipe0;
/* number of VSC pipes */
VkExtent2D pipe_count;
/* Whether binning should be used for gmem rendering using this framebuffer. */
bool binning;
/* Whether binning could be used for gmem rendering using this framebuffer. */
bool binning_possible;
/* pipe register values */
uint32_t pipe_config[MAX_VSC_PIPES];
uint32_t pipe_sizes[MAX_VSC_PIPES];
uint32_t attachment_count;
struct tu_attachment_info attachments[0];
};
struct tu_subpass_barrier {
VkPipelineStageFlags2 src_stage_mask;
VkPipelineStageFlags2 dst_stage_mask;
VkAccessFlags2 src_access_mask;
VkAccessFlags2 dst_access_mask;
bool incoherent_ccu_color, incoherent_ccu_depth;
};
struct tu_subpass_attachment
{
uint32_t attachment;
/* For input attachments, true if it needs to be patched to refer to GMEM
* in GMEM mode. This is false if it hasn't already been written as an
* attachment.
*/
bool patch_input_gmem;
};
struct tu_subpass
{
uint32_t input_count;
uint32_t color_count;
uint32_t resolve_count;
bool resolve_depth_stencil;
bool feedback_loop_color;
bool feedback_loop_ds;
/* True if we must invalidate UCHE thanks to a feedback loop. */
bool feedback_invalidate;
/* In other words - framebuffer fetch support */
bool raster_order_attachment_access;
struct tu_subpass_attachment *input_attachments;
struct tu_subpass_attachment *color_attachments;
struct tu_subpass_attachment *resolve_attachments;
struct tu_subpass_attachment depth_stencil_attachment;
VkSampleCountFlagBits samples;
uint32_t srgb_cntl;
uint32_t multiview_mask;
struct tu_subpass_barrier start_barrier;
};
struct tu_render_pass_attachment
{
VkFormat format;
uint32_t samples;
uint32_t cpp;
VkImageAspectFlags clear_mask;
uint32_t clear_views;
bool load;
bool store;
int32_t gmem_offset;
bool will_be_resolved;
/* for D32S8 separate stencil: */
bool load_stencil;
bool store_stencil;
bool cond_load_allowed;
bool cond_store_allowed;
int32_t gmem_offset_stencil;
};
struct tu_render_pass
{
struct vk_object_base base;
uint32_t attachment_count;
uint32_t subpass_count;
uint32_t gmem_pixels;
uint32_t tile_align_w;
/* memory bandwidth costs (in bytes) for gmem / sysmem rendering */
uint32_t gmem_bandwidth_per_pixel;
uint32_t sysmem_bandwidth_per_pixel;
struct tu_subpass_attachment *subpass_attachments;
struct tu_render_pass_attachment *attachments;
struct tu_subpass_barrier end_barrier;
struct tu_subpass subpasses[0];
};
void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
const struct tu_device *device,
const struct tu_render_pass *pass);
struct tu_descriptor_state
{
struct tu_descriptor_set *sets[MAX_SETS];
struct tu_descriptor_set push_set;
uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
};
enum tu_cmd_dirty_bits
{
TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
TU_CMD_DIRTY_VB_STRIDE = BIT(1),
TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
TU_CMD_DIRTY_LRZ = BIT(8),
TU_CMD_DIRTY_VS_PARAMS = BIT(9),
TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
TU_CMD_DIRTY_VIEWPORTS = BIT(11),
TU_CMD_DIRTY_BLEND = BIT(12),
/* all draw states were disabled and need to be re-enabled: */
TU_CMD_DIRTY_DRAW_STATE = BIT(13)
};
/* There are only three cache domains we have to care about: the CCU (color
* cache unit), which is used for color and depth/stencil attachments and
* copy/blit destinations and is split conceptually into color and depth
* halves, and the UCHE (universal cache), which is used for pretty much
* everything else, except for the CP (uncached) and the host. We need to
* flush whenever data crosses these boundaries.
*/
enum tu_cmd_access_mask {
TU_ACCESS_UCHE_READ = 1 << 0,
TU_ACCESS_UCHE_WRITE = 1 << 1,
TU_ACCESS_CCU_COLOR_READ = 1 << 2,
TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,
/* Experiments have shown that while it's safe to avoid flushing the CCU
* after each blit/renderpass, it's not safe to assume that subsequent
* lookups with a different attachment state will hit unflushed cache
* entries. That is, the CCU needs to be flushed and possibly invalidated
* when accessing memory with a different attachment state. Writing to an
* attachment under the following conditions after clearing using the
* normal 2d engine path is known to have issues:
*
* - It isn't the 0'th layer.
* - There is more than one attachment, and this isn't the 0'th attachment
* (this seems to also depend on the cpp of the attachments).
*
* Our best guess is that the layer/MRT state is used when computing
* the location of a cache entry in CCU, to avoid conflicts. We assume that
* any access in a renderpass after or before an access by a transfer needs
* a flush/invalidate, and use the _INCOHERENT variants to represent access
* by a renderpass.
*/
TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,
/* Accesses which bypass any cache, e.g. writes via the host,
* CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
*/
TU_ACCESS_SYSMEM_READ = 1 << 10,
TU_ACCESS_SYSMEM_WRITE = 1 << 11,
/* Memory writes from the CP start in-order with draws and event writes,
* but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
*/
TU_ACCESS_CP_WRITE = 1 << 12,
TU_ACCESS_READ =
TU_ACCESS_UCHE_READ |
TU_ACCESS_CCU_COLOR_READ |
TU_ACCESS_CCU_DEPTH_READ |
TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
TU_ACCESS_SYSMEM_READ,
TU_ACCESS_WRITE =
TU_ACCESS_UCHE_WRITE |
TU_ACCESS_CCU_COLOR_WRITE |
TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
TU_ACCESS_CCU_DEPTH_WRITE |
TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
TU_ACCESS_SYSMEM_WRITE |
TU_ACCESS_CP_WRITE,
TU_ACCESS_ALL =
TU_ACCESS_READ |
TU_ACCESS_WRITE,
};
/* Starting with a6xx, the pipeline is split into several "clusters" (really
* pipeline stages). Each stage has its own pair of register banks and can
* switch them independently, so that earlier stages can run ahead of later
* ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
* the same time.
*
* As a result of this, we need to insert a WFI when an earlier stage depends
* on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
* pending WFI's to complete before starting, and usually before reading
* indirect params even, so a WFI also acts as a full "pipeline stall".
*
* Note, the names of the stages come from CLUSTER_* in devcoredump. We
* include all the stages for completeness, even ones which do not read/write
* anything.
*/
enum tu_stage {
/* This doesn't correspond to a cluster, but we need it for tracking
* indirect draw parameter reads etc.
*/
TU_STAGE_CP,
/* - Fetch index buffer
* - Fetch vertex attributes, dispatch VS
*/
TU_STAGE_FE,
/* Execute all geometry stages (VS thru GS) */
TU_STAGE_SP_VS,
/* Write to VPC, do primitive assembly. */
TU_STAGE_PC_VS,
/* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
* to devcoredump so presumably this stage stalls for TU_STAGE_PS when
* early depth testing is enabled before dispatching fragments? However
* GRAS reads and writes LRZ directly.
*/
TU_STAGE_GRAS,
/* Execute FS */
TU_STAGE_SP_PS,
/* - Fragment tests
* - Write color/depth
* - Streamout writes (???)
* - Varying interpolation (???)
*/
TU_STAGE_PS,
};
enum tu_cmd_flush_bits {
TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,
TU_CMD_FLAG_ALL_FLUSH =
TU_CMD_FLAG_CCU_FLUSH_DEPTH |
TU_CMD_FLAG_CCU_FLUSH_COLOR |
TU_CMD_FLAG_CACHE_FLUSH |
/* Treat the CP as a sort of "cache" which may need to be "flushed" via
* waiting for writes to land with WAIT_FOR_MEM_WRITES.
*/
TU_CMD_FLAG_WAIT_MEM_WRITES,
TU_CMD_FLAG_ALL_INVALIDATE =
TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
TU_CMD_FLAG_CACHE_INVALIDATE |
/* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when a
* command that needs CP_WAIT_FOR_ME is executed. This means we may
* insert an extra WAIT_FOR_ME before an indirect command requiring it
* in case there was another command before the current command buffer
* that it needs to wait for.
*/
TU_CMD_FLAG_WAIT_FOR_ME,
};
/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
* heavy, involving a CCU cache flush/invalidate and a WFI in order to change
* which part of the gmem is used by the CCU. Here we keep track of the
* state of the CCU.
*/
enum tu_cmd_ccu_state {
TU_CMD_CCU_SYSMEM,
TU_CMD_CCU_GMEM,
TU_CMD_CCU_UNKNOWN,
};
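/* Example (sketch): a command buffer switches the CCU to GMEM mode before
 * starting GMEM rendering via tu_emit_cache_flush_ccu() (declared further
 * below); tracking ccu_state is what lets redundant, expensive transitions
 * be avoided:
 *
 *    tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM);
 */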
struct tu_cache_state {
/* Caches which must be made available (flushed) eventually if there are
* any users outside that cache domain, and caches which must be
* invalidated eventually if there are any reads.
*/
enum tu_cmd_flush_bits pending_flush_bits;
/* Pending flushes */
enum tu_cmd_flush_bits flush_bits;
};
enum tu_lrz_force_disable_mask {
TU_LRZ_FORCE_DISABLE_LRZ = 1 << 0,
TU_LRZ_FORCE_DISABLE_WRITE = 1 << 1,
};
enum tu_lrz_direction {
TU_LRZ_UNKNOWN,
/* Depth func less/less-or-equal: */
TU_LRZ_LESS,
/* Depth func greater/greater-or-equal: */
TU_LRZ_GREATER,
};
struct tu_lrz_pipeline
{
uint32_t force_disable_mask;
bool fs_has_kill;
bool force_late_z;
bool early_fragment_tests;
};
struct tu_lrz_state
{
/* Depth/stencil image currently in use for LRZ */
const struct tu_image_view *image_view;
VkClearValue depth_clear_value;
/* If LRZ is in an invalid state we cannot use it until depth is cleared */
bool valid : 1;
/* Allows temporarily disabling LRZ */
bool enabled : 1;
bool fast_clear : 1;
bool gpu_dir_tracking : 1;
/* Continue using old LRZ state (LOAD_OP_LOAD of depth) */
bool reuse_previous_state : 1;
enum tu_lrz_direction prev_direction;
};
struct tu_vs_params {
uint32_t vertex_offset;
uint32_t first_instance;
};
/* This should be for state that is set inside a renderpass and used at
* renderpass end time, e.g. to decide whether to use sysmem. This needs
* special handling for secondary cmdbufs and suspending/resuming render
* passes where the state may need to be combined afterwards.
*/
struct tu_render_pass_state
{
bool xfb_used;
bool has_tess;
bool has_prim_generated_query_in_rp;
bool disable_gmem;
/* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
bool draw_cs_writes_to_cond_pred;
uint32_t drawcall_count;
/* A calculated "draw cost" value for renderpass, which tries to
* estimate the bandwidth-per-sample of all the draws according
* to:
*
* foreach_draw (...) {
* sum += pipeline->color_bandwidth_per_sample;
* if (depth_test_enabled)
* sum += pipeline->depth_cpp_per_sample;
* if (depth_write_enabled)
* sum += pipeline->depth_cpp_per_sample;
* if (stencil_write_enabled)
* sum += pipeline->stencil_cpp_per_sample * 2;
* }
* drawcall_bandwidth_per_sample = sum / drawcall_count;
*
* It allows us to estimate the total bandwidth of drawcalls later, by
* calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
*
* This does ignore depth buffer traffic for samples which do not
* pass due to depth-test fail, and some other details. But it is
* just intended to be a rough estimate that is easy to calculate.
*/
uint32_t drawcall_bandwidth_per_sample_sum;
};
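/* Worked example (sketch, illustrative numbers): two draws to a single RGBA8
 * color attachment (4 bytes/sample) with depth test and write enabled against
 * a 4-byte depth buffer would accumulate 2 * (4 + 4 + 4) = 24, giving
 * drawcall_bandwidth_per_sample = 24 / 2 = 12 bytes/sample, which is later
 * multiplied by the zpass sample count to estimate total draw bandwidth.
 */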
void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
const struct tu_render_pass_state *src);
struct tu_cmd_state
{
uint32_t dirty;
struct tu_pipeline *pipeline;
struct tu_pipeline *compute_pipeline;
struct tu_render_pass_state rp;
/* Vertex buffers, viewports, and scissors:
* the state for these can be updated partially, so we need to save it here
* to be able to emit a complete draw state.
*/
struct {
uint64_t base;
uint32_t size;
uint32_t stride;
} vb[MAX_VBS];
VkViewport viewport[MAX_VIEWPORTS];
VkRect2D scissor[MAX_SCISSORS];
uint32_t max_viewport, max_scissor;
/* for dynamic states that can't be emitted directly */
uint32_t dynamic_stencil_mask;
uint32_t dynamic_stencil_wrmask;
uint32_t dynamic_stencil_ref;
uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
uint32_t pc_raster_cntl, vpc_unknown_9107;
uint32_t rb_mrt_control[MAX_RTS], rb_mrt_blend_control[MAX_RTS];
uint32_t rb_mrt_control_rop;
uint32_t rb_blend_cntl, sp_blend_cntl;
uint32_t pipeline_color_write_enable, pipeline_blend_enable;
uint32_t color_write_enable;
bool logic_op_enabled;
bool rop_reads_dst;
enum pc_di_primtype primtype;
bool primitive_restart_enable;
/* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
struct tu_draw_state vertex_buffers;
struct tu_draw_state shader_const;
struct tu_draw_state desc_sets;
struct tu_draw_state vs_params;
/* Index buffer */
uint64_t index_va;
uint32_t max_index_count;
uint8_t index_size;
/* Because the streamout base has to be 32-byte aligned,
* there is an extra offset to deal with when it is
* unaligned.
*/
uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
/* Renderpasses are tricky, because we may need to flush differently if
* using sysmem vs. gmem and therefore we have to delay any flushing that
* happens before a renderpass. So we have to have two copies of the flush
* state, one for intra-renderpass flushes (i.e. renderpass dependencies)
* and one for outside a renderpass.
*/
struct tu_cache_state cache;
struct tu_cache_state renderpass_cache;
enum tu_cmd_ccu_state ccu_state;
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
VkRect2D render_area;
const struct tu_image_view **attachments;
/* State that in the dynamic case comes from VkRenderingInfo and needs to
* be saved/restored when suspending. This holds the state for the last
* suspended renderpass, which may point to this command buffer's dynamic_*
* or another command buffer if executed on a secondary.
*/
struct {
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
VkRect2D render_area;
const struct tu_image_view **attachments;
struct tu_lrz_state lrz;
} suspended_pass;
bool tessfactor_addr_set;
bool predication_active;
enum a5xx_line_mode line_mode;
bool z_negative_one_to_one;
/* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
* VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
* but they use the same {START,STOP}_PRIMITIVE_CTRS control.
*/
uint32_t prim_counters_running;
bool prim_generated_query_running_before_rp;
/* These are the states of the suspend/resume state machine. In addition to
* tracking whether we're in the middle of a chain of suspending and
* resuming passes that will be merged, we need to track whether the
* command buffer begins in the middle of such a chain, for when it gets
* merged with other command buffers. We call such a chain that begins
* before the command buffer starts a "pre-chain".
*
* Note that when this command buffer is finished, this state is untouched
* but it gains a different meaning. For example, if we finish in state
* SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
* there's a suspend/resume chain that extends past the end of the command
* buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
* means that there's a suspend/resume chain that extends before the
* beginning.
*/
enum {
/* Either there are no suspend/resume chains, or they are entirely
* contained in the current command buffer.
*
* BeginCommandBuffer() <- start of current command buffer
* ...
* // we are here
*/
SR_NONE = 0,
/* We are in the middle of a suspend/resume chain that starts before the
* current command buffer. This happens when the command buffer begins
* with a resuming render pass and all of the passes up to the current
* one are suspending. In this state, our part of the chain is not saved
* and is in the current draw_cs/state.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_PRE_CHAIN,
/* We are currently outside of any suspend/resume chains, but there is a
* chain starting before the current command buffer. It is saved in
* pre_chain.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* // This part is stashed in pre_chain
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* BeginRendering(resuming) ... EndRendering() // end of chain
* ...
* // we are here
*/
SR_AFTER_PRE_CHAIN,
/* We are in the middle of a suspend/resume chain and there is no chain
* starting before the current command buffer.
*
* BeginCommandBuffer() <- start of current command buffer
* ...
* BeginRendering() ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_CHAIN,
/* We are in the middle of a suspend/resume chain and there is another,
* separate, chain starting before the current command buffer.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* // This part is stashed in pre_chain
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* BeginRendering(resuming) ... EndRendering() // end of chain
* ...
* BeginRendering() ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_CHAIN_AFTER_PRE_CHAIN,
} suspend_resume;
bool suspending, resuming;
struct tu_lrz_state lrz;
struct tu_draw_state lrz_and_depth_plane_state;
struct tu_vs_params last_vs_params;
};
struct tu_cmd_pool
{
struct vk_command_pool vk;
struct list_head cmd_buffers;
struct list_head free_cmd_buffers;
};
enum tu_cmd_buffer_status
{
TU_CMD_BUFFER_STATUS_INVALID,
TU_CMD_BUFFER_STATUS_INITIAL,
TU_CMD_BUFFER_STATUS_RECORDING,
TU_CMD_BUFFER_STATUS_EXECUTABLE,
TU_CMD_BUFFER_STATUS_PENDING,
};
struct tu_cmd_buffer
{
struct vk_command_buffer vk;
struct tu_device *device;
struct tu_cmd_pool *pool;
struct list_head pool_link;
struct u_trace trace;
struct u_trace_iterator trace_renderpass_start;
struct u_trace_iterator trace_renderpass_end;
struct list_head renderpass_autotune_results;
struct tu_autotune_results_buffer* autotune_buffer;
VkCommandBufferUsageFlags usage_flags;
enum tu_cmd_buffer_status status;
VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
struct tu_cmd_state state;
uint32_t queue_family_index;
uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
VkShaderStageFlags push_constant_stages;
struct tu_descriptor_set meta_push_descriptors;
struct tu_descriptor_state descriptors[MAX_BIND_POINTS];
struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1)];
struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1)];
struct tu_render_pass dynamic_pass;
struct tu_subpass dynamic_subpass;
struct tu_framebuffer dynamic_framebuffer;
VkResult record_result;
struct tu_cs cs;
struct tu_cs draw_cs;
struct tu_cs tile_store_cs;
struct tu_cs draw_epilogue_cs;
struct tu_cs sub_cs;
/* If the first render pass in the command buffer is resuming, then it is
* part of a suspend/resume chain that starts before the current command
* buffer and needs to be merged later. In this case, its incomplete state
* is stored in pre_chain. In the symmetric case where the last render pass
* is suspending, we just skip ending the render pass and its state is
* stored in draw_cs/the current state. The first and last render pass
* might be part of different chains, which is why all the state may need
* to be saved separately here.
*/
struct {
struct tu_cs draw_cs;
struct tu_cs draw_epilogue_cs;
struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;
struct tu_render_pass_state state;
} pre_chain;
uint32_t vsc_draw_strm_pitch;
uint32_t vsc_prim_strm_pitch;
};
/* Temporary struct for tracking a register state to be written, used by
* a6xx-pack.h and tu_cs_emit_regs()
*/
struct tu_reg_value {
uint32_t reg;
uint64_t value;
bool is_address;
struct tu_bo *bo;
bool bo_write;
uint32_t bo_offset;
uint32_t bo_shift;
};
VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
VkCommandBufferUsageFlags usage_flags);
void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
struct tu_cs *cs);
void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
struct tu_cs *cs,
enum tu_cmd_ccu_state ccu_state);
void tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
const VkRenderingInfo *pRenderingInfo);
void tu_setup_dynamic_inheritance(struct tu_cmd_buffer *cmd_buffer,
const VkCommandBufferInheritanceRenderingInfo *info);
void tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
const VkRenderingInfo *pRenderingInfo);
void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *suspended);
void tu_cmd_render(struct tu_cmd_buffer *cmd);
void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
enum vgt_event_type event);
static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
VkPipelineBindPoint bind_point)
{
return &cmd_buffer->descriptors[bind_point];
}
struct tu_event
{
struct vk_object_base base;
struct tu_bo *bo;
};
struct tu_push_constant_range
{
uint32_t lo;
uint32_t dwords;
};
struct tu_shader
{
struct ir3_shader *ir3_shader;
struct tu_push_constant_range push_consts;
uint8_t active_desc_sets;
bool multi_pos_output;
};
struct tu_shader_key {
unsigned multiview_mask;
bool force_sample_interp;
enum ir3_wavesize_option api_wavesize, real_wavesize;
};
struct tu_compiled_shaders
{
struct vk_pipeline_cache_object base;
struct tu_push_constant_range push_consts[MESA_SHADER_STAGES];
uint8_t active_desc_sets;
bool multi_pos_output;
struct ir3_shader_variant *variants[MESA_SHADER_STAGES];
};
extern const struct vk_pipeline_cache_object_ops tu_shaders_ops;
bool
tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, bool *multi_pos_output,
struct tu_device *dev);
nir_shader *
tu_spirv_to_nir(struct tu_device *dev,
void *mem_ctx,
const VkPipelineShaderStageCreateInfo *stage_info,
gl_shader_stage stage);
struct tu_shader *
tu_shader_create(struct tu_device *dev,
nir_shader *nir,
const struct tu_shader_key *key,
struct tu_pipeline_layout *layout,
const VkAllocationCallbacks *alloc);
void
tu_shader_destroy(struct tu_device *dev,
struct tu_shader *shader,
const VkAllocationCallbacks *alloc);
static inline bool
tu6_shared_constants_enable(const struct tu_pipeline_layout *layout,
const struct ir3_compiler *compiler)
{
return layout->push_constant_size > 0 &&
layout->push_constant_size <= (compiler->shared_consts_size * 16);
}
struct tu_program_descriptor_linkage
{
struct ir3_const_state const_state;
uint32_t constlen;
struct tu_push_constant_range push_consts;
};
struct tu_pipeline_executable {
gl_shader_stage stage;
struct ir3_info stats;
bool is_binning;
char *nir_from_spirv;
char *nir_final;
char *disasm;
};
struct tu_pipeline
{
struct vk_object_base base;
struct tu_cs cs;
struct tu_suballoc_bo bo;
/* Separate BO for private memory since it should be GPU writable */
struct tu_bo *pvtmem_bo;
bool need_indirect_descriptor_sets;
VkShaderStageFlags active_stages;
uint32_t active_desc_sets;
/* mask of enabled dynamic states
* if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used
*/
uint32_t dynamic_state_mask;
struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
/* for dynamic states which use the same register: */
uint32_t gras_su_cntl, gras_su_cntl_mask;
uint32_t rb_depth_cntl, rb_depth_cntl_mask;
uint32_t rb_stencil_cntl, rb_stencil_cntl_mask;
uint32_t pc_raster_cntl, pc_raster_cntl_mask;
uint32_t vpc_unknown_9107, vpc_unknown_9107_mask;
uint32_t stencil_wrmask;
unsigned num_rts;
uint32_t rb_mrt_control[MAX_RTS], rb_mrt_control_mask;
uint32_t rb_mrt_blend_control[MAX_RTS];
uint32_t sp_blend_cntl, sp_blend_cntl_mask;
uint32_t rb_blend_cntl, rb_blend_cntl_mask;
uint32_t color_write_enable, blend_enable;
bool logic_op_enabled, rop_reads_dst;
bool rasterizer_discard;
bool rb_depth_cntl_disable;
enum a5xx_line_mode line_mode;
/* draw states for the pipeline */
struct tu_draw_state load_state, rast_state;
struct tu_draw_state prim_order_state_sysmem, prim_order_state_gmem;
/* for vertex buffers state */
uint32_t num_vbs;
struct tu_push_constant_range shared_consts;
struct
{
struct tu_draw_state config_state;
struct tu_draw_state state;
struct tu_draw_state binning_state;
struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
} program;
struct
{
struct tu_draw_state state;
struct tu_draw_state binning_state;
} vi;
struct
{
enum pc_di_primtype primtype;
bool primitive_restart;
} ia;
struct
{
uint32_t patch_type;
uint32_t param_stride;
bool upper_left_domain_origin;
} tess;
struct
{
uint32_t local_size[3];
uint32_t subgroup_size;
} compute;
bool provoking_vertex_last;
struct tu_lrz_pipeline lrz;
/* In other words - framebuffer fetch support */
bool raster_order_attachment_access;
bool subpass_feedback_loop_ds;
bool z_negative_one_to_one;
/* memory bandwidth cost (in bytes) for color attachments */
uint32_t color_bandwidth_per_sample;
uint32_t depth_cpp_per_sample;
uint32_t stencil_cpp_per_sample;
void *executables_mem_ctx;
/* tu_pipeline_executable */
struct util_dynarray executables;
};
struct tu_image;
void
tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value);
void
tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image);
void
tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_image *image);
void
tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
struct tu_image *image,
const VkClearDepthStencilValue *pDepthStencil,
uint32_t rangeCount,
const VkImageSubresourceRange *pRanges);
void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd,
const VkClearValue *clear_values);
void
tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd,
const VkClearValue *clear_values);
void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd);
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd);
void
tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport,
bool z_negative_one_to_one);
void
tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count);
void
tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc);
void
tu6_emit_depth_bias(struct tu_cs *cs,
float constant_factor,
float clamp,
float slope_factor);
void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
enum a5xx_line_mode line_mode);
void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);
void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);
uint32_t tu6_rb_mrt_control_rop(VkLogicOp op, bool *rop_reads_dst);
void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void tu6_apply_depth_bounds_workaround(struct tu_device *device,
uint32_t *rb_depth_cntl);
struct tu_pvtmem_config {
uint64_t iova;
uint32_t per_fiber_size;
uint32_t per_sp_size;
bool per_wave;
};
void
tu6_emit_xs_config(struct tu_cs *cs,
gl_shader_stage stage,
const struct ir3_shader_variant *xs);
void
tu6_emit_xs(struct tu_cs *cs,
gl_shader_stage stage,
const struct ir3_shader_variant *xs,
const struct tu_pvtmem_config *pvtmem,
uint64_t binary_iova);
void
tu6_emit_vpc(struct tu_cs *cs,
const struct ir3_shader_variant *vs,
const struct ir3_shader_variant *hs,
const struct ir3_shader_variant *ds,
const struct ir3_shader_variant *gs,
const struct ir3_shader_variant *fs,
uint32_t patch_control_points);
void
tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs);
struct tu_image_view;
void
tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
const struct tu_image_view *src,
const struct tu_image_view *dst,
uint32_t layer_mask,
uint32_t layers,
const VkRect2D *rect);
void
tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
const VkClearValue *value);
void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
const VkClearValue *value);
void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
bool cond_exec_allowed,
bool force_load);
/* expose this function to be able to emit load without checking LOAD_OP */
void
tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a);
/* note: gmem store can also resolve */
void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
uint32_t gmem_a,
bool cond_exec_allowed);
struct tu_native_format
{
enum a6xx_format fmt : 8;
enum a3xx_color_swap swap : 8;
enum a6xx_tile_mode tile_mode : 8;
};
enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format);
bool tu6_format_vtx_supported(VkFormat format);
struct tu_native_format tu6_format_vtx(VkFormat format);
bool tu6_format_color_supported(enum pipe_format format);
struct tu_native_format tu6_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode);
bool tu6_format_texture_supported(enum pipe_format format);
struct tu_native_format tu6_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode);
static inline enum a6xx_format
tu6_base_format(enum pipe_format format)
{
/* note: tu6_format_color doesn't care about tiling for .fmt field */
return tu6_format_color(format, TILE6_LINEAR).fmt;
}
struct tu_image
{
struct vk_image vk;
struct fdl_layout layout[3];
uint32_t total_size;
#ifdef ANDROID
/* For VK_ANDROID_native_buffer, the WSI image owns the memory. */
VkDeviceMemory owned_memory;
#endif
/* Set when bound */
struct tu_bo *bo;
uint64_t iova;
uint32_t lrz_height;
uint32_t lrz_pitch;
uint32_t lrz_offset;
uint32_t lrz_fc_offset;
uint32_t lrz_fc_size;
};
uint32_t tu6_plane_count(VkFormat format);
enum pipe_format tu6_plane_format(VkFormat format, uint32_t plane);
uint32_t tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask);
enum pipe_format tu_format_for_aspect(enum pipe_format format,
VkImageAspectFlags aspect_mask);
struct tu_image_view
{
struct vk_image_view vk;
struct tu_image *image; /**< VkImageViewCreateInfo::image */
struct fdl6_view view;
/* for d32s8 separate depth */
uint64_t depth_base_addr;
uint32_t depth_layer_size;
uint32_t depth_PITCH;
/* for d32s8 separate stencil */
uint64_t stencil_base_addr;
uint32_t stencil_layer_size;
uint32_t stencil_PITCH;
};
struct tu_sampler_ycbcr_conversion {
struct vk_object_base base;
VkFormat format;
VkSamplerYcbcrModelConversion ycbcr_model;
VkSamplerYcbcrRange ycbcr_range;
VkComponentMapping components;
VkChromaLocation chroma_offsets[2];
VkFilter chroma_filter;
};
struct tu_sampler {
struct vk_object_base base;
uint32_t descriptor[A6XX_TEX_SAMP_DWORDS];
struct tu_sampler_ycbcr_conversion *ycbcr_sampler;
};
void
tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer);
void
tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src);
void
tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer);
void
tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
void
tu_cs_image_depth_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
#define tu_image_view_stencil(iview, x) \
((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT))
#define tu_image_view_depth(iview, x) \
((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_32_FLOAT))
VkResult
tu_gralloc_info(struct tu_device *device,
const VkNativeBufferANDROID *gralloc_info,
int *dma_buf,
uint64_t *modifier);
VkResult
tu_import_memory_from_gralloc_handle(VkDevice device_h,
int dma_buf,
const VkAllocationCallbacks *alloc,
VkImage image_h);
bool
tiling_possible(VkFormat format);
bool
ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage,
const struct fd_dev_info *info, VkSampleCountFlagBits samples,
bool use_z24uint_s8uint);
struct tu_buffer_view
{
struct vk_object_base base;
uint32_t descriptor[A6XX_TEX_CONST_DWORDS];
struct tu_buffer *buffer;
};
void
tu_buffer_view_init(struct tu_buffer_view *view,
struct tu_device *device,
const VkBufferViewCreateInfo *pCreateInfo);
#define PERF_CNTRS_REG 4
struct tu_perf_query_data
{
uint32_t gid; /* group-id */
uint32_t cid; /* countable-id within the group */
uint32_t cntr_reg; /* counter register within the group */
uint32_t pass; /* pass index in which the countables can be requested */
uint32_t app_idx; /* index provided by apps */
};
struct tu_query_pool
{
struct vk_object_base base;
VkQueryType type;
uint32_t stride;
uint64_t size;
uint32_t pipeline_statistics;
struct tu_bo *bo;
/* For performance query */
const struct fd_perfcntr_group *perf_group;
uint32_t perf_group_count;
uint32_t counter_index_count;
struct tu_perf_query_data perf_query_data[0];
};
uint32_t
tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index);
void
tu_update_descriptor_sets(const struct tu_device *device,
VkDescriptorSet overrideSet,
uint32_t descriptorWriteCount,
const VkWriteDescriptorSet *pDescriptorWrites,
uint32_t descriptorCopyCount,
const VkCopyDescriptorSet *pDescriptorCopies);
void
tu_update_descriptor_set_with_template(
const struct tu_device *device,
struct tu_descriptor_set *set,
VkDescriptorUpdateTemplate descriptorUpdateTemplate,
const void *pData);
VkResult
tu_physical_device_init(struct tu_physical_device *device,
struct tu_instance *instance);
VkResult
tu_enumerate_devices(struct tu_instance *instance);
int
tu_device_get_gpu_timestamp(struct tu_device *dev,
uint64_t *ts);
int
tu_device_get_suspend_count(struct tu_device *dev,
uint64_t *suspend_count);
int
tu_drm_submitqueue_new(const struct tu_device *dev,
int priority,
uint32_t *queue_id);
void
tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id);
int
tu_syncobj_to_fd(struct tu_device *device, struct vk_sync *sync);
VkResult
tu_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit);
void
tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
void *ts_from, uint32_t from_offset,
void *ts_to, uint32_t to_offset,
uint32_t count);
VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
struct u_trace **trace_copy);
/* If we copy trace and timestamps we will have to free them. */
struct tu_u_trace_cmd_data
{
struct tu_cs *timestamp_copy_cs;
struct u_trace *trace;
};
/* Data necessary to retrieve timestamps and clean up all
* associated resources afterwards.
*/
struct tu_u_trace_submission_data
{
uint32_t submission_id;
/* We have to know when timestamps are available;
* this sync object indicates that.
*/
struct tu_u_trace_syncobj *syncobj;
uint32_t cmd_buffer_count;
uint32_t last_buffer_with_tracepoints;
struct tu_u_trace_cmd_data *cmd_trace_data;
};
VkResult
tu_u_trace_submission_data_create(
struct tu_device *device,
struct tu_cmd_buffer **cmd_buffers,
uint32_t cmd_buffer_count,
struct tu_u_trace_submission_data **submission_data);
void
tu_u_trace_submission_data_finish(
struct tu_device *device,
struct tu_u_trace_submission_data *submission_data);
void
tu_breadcrumbs_init(struct tu_device *device);
void
tu_breadcrumbs_finish(struct tu_device *device);
#define TU_FROM_HANDLE(__tu_type, __name, __handle) \
VK_FROM_HANDLE(__tu_type, __name, __handle)
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
VK_OBJECT_TYPE_COMMAND_BUFFER)
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
VK_OBJECT_TYPE_INSTANCE)
VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
VK_OBJECT_TYPE_PHYSICAL_DEVICE)
VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool,
VK_OBJECT_TYPE_COMMAND_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer,
VK_OBJECT_TYPE_BUFFER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, base, VkBufferView,
VK_OBJECT_TYPE_BUFFER_VIEW)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, base, VkDescriptorPool,
VK_OBJECT_TYPE_DESCRIPTOR_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, base, VkDescriptorSet,
VK_OBJECT_TYPE_DESCRIPTOR_SET)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, base,
VkDescriptorSetLayout,
VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, base,
VkDescriptorUpdateTemplate,
VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory,
VK_OBJECT_TYPE_DEVICE_MEMORY)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
VK_OBJECT_TYPE_FRAMEBUFFER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image, vk.base, VkImage, VK_OBJECT_TYPE_IMAGE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, vk.base, VkImageView,
VK_OBJECT_TYPE_IMAGE_VIEW);
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, base, VkPipelineCache,
VK_OBJECT_TYPE_PIPELINE_CACHE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, base, VkPipeline,
VK_OBJECT_TYPE_PIPELINE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout,
VK_OBJECT_TYPE_PIPELINE_LAYOUT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, base, VkQueryPool,
VK_OBJECT_TYPE_QUERY_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, base, VkRenderPass,
VK_OBJECT_TYPE_RENDER_PASS)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler,
VK_OBJECT_TYPE_SAMPLER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, base, VkSamplerYcbcrConversion,
VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION)
/* for TU_FROM_HANDLE with both VkFence and VkSemaphore: */
#define tu_syncobj_from_handle(x) ((struct tu_syncobj*) (uintptr_t) (x))
void
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);
#endif /* TU_PRIVATE_H */