mesa/src/freedreno/vulkan/tu_private.h

/*
* Copyright © 2016 Red Hat.
* Copyright © 2016 Bas Nieuwenhuizen
*
* based in part on anv driver which is:
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#ifndef TU_PRIVATE_H
#define TU_PRIVATE_H
#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_VALGRIND
#include <memcheck.h>
#include <valgrind.h>
#define VG(x) x
#else
#define VG(x) ((void)0)
#endif
#define MESA_LOG_TAG "TU"
#include "c11/threads.h"
#include "util/rounding.h"
#include "util/bitscan.h"
#include "util/list.h"
#include "util/log.h"
#include "util/macros.h"
#include "util/sparse_array.h"
#include "util/u_atomic.h"
#include "util/u_dynarray.h"
#include "util/xmlconfig.h"
#include "util/perf/u_trace.h"
#include "vk_alloc.h"
#include "vk_debug_report.h"
#include "vk_device.h"
#include "vk_dispatch_table.h"
#include "vk_extensions.h"
#include "vk_instance.h"
#include "vk_log.h"
#include "vk_physical_device.h"
#include "vk_shader_module.h"
#include "vk_pipeline_cache.h"
#include "wsi_common.h"
#include "ir3/ir3_compiler.h"
#include "ir3/ir3_shader.h"
#include "adreno_common.xml.h"
#include "adreno_pm4.xml.h"
#include "a6xx.xml.h"
#include "fdl/freedreno_layout.h"
#include "common/freedreno_dev_info.h"
#include "perfcntrs/freedreno_perfcntr.h"
#include "tu_descriptor_set.h"
#include "tu_autotune.h"
#include "tu_util.h"
#include "tu_perfetto.h"
/* Pre-declarations needed for WSI entrypoints */
struct wl_surface;
struct wl_display;
typedef struct xcb_connection_t xcb_connection_t;
typedef uint32_t xcb_visualid_t;
typedef uint32_t xcb_window_t;
#include <vulkan/vk_android_native_buffer.h>
#include <vulkan/vk_icd.h>
#include <vulkan/vulkan.h>
#include "tu_entrypoints.h"
#include "vulkan/runtime/vk_common_entrypoints.h"
#include "vk_format.h"
#include "vk_image.h"
#include "vk_command_buffer.h"
#include "vk_command_pool.h"
#include "vk_queue.h"
#include "vk_object.h"
#include "vk_sync.h"
#include "vk_drm_syncobj.h"
#include "vk_sync_timeline.h"
#define MAX_VBS 32
#define MAX_VERTEX_ATTRIBS 32
#define MAX_RTS 8
#define MAX_VSC_PIPES 32
#define MAX_VIEWPORTS 16
#define MAX_VIEWPORT_SIZE (1 << 14)
#define MAX_SCISSORS 16
#define MAX_DISCARD_RECTANGLES 4
#define MAX_PUSH_CONSTANTS_SIZE 256
#define MAX_PUSH_DESCRIPTORS 32
#define MAX_DYNAMIC_UNIFORM_BUFFERS 16
#define MAX_DYNAMIC_STORAGE_BUFFERS 8
#define MAX_DYNAMIC_BUFFERS_SIZE \
(MAX_DYNAMIC_UNIFORM_BUFFERS + 2 * MAX_DYNAMIC_STORAGE_BUFFERS) * \
A6XX_TEX_CONST_DWORDS
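/* Worked out for reference (note added for illustration): with the defaults
* above and A6XX_TEX_CONST_DWORDS == 16, this is (16 + 2 * 8) * 16 = 512
* dwords of dynamic descriptor storage (see
* tu_descriptor_state::dynamic_descriptors below).
*/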
#define TU_MAX_DRM_DEVICES 8
#define MAX_VIEWS 16
#define MAX_BIND_POINTS 2 /* compute + graphics */
/* The Qualcomm driver exposes 0x20000058 */
#define MAX_STORAGE_BUFFER_RANGE 0x20000000
/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so
* expose the same maximum range.
* TODO: The SIZE bitfield is 15 bits, and in 4-dword units, so the actual
* range might be higher.
*/
#define MAX_UNIFORM_BUFFER_RANGE 0x10000
#define A6XX_TEX_CONST_DWORDS 16
#define A6XX_TEX_SAMP_DWORDS 4
#define COND(bool, val) ((bool) ? (val) : 0)
#define BIT(bit) (1u << (bit))
/* Whenever we generate an error, pass it through this function. Useful for
* debugging, where we can break on it. Only call at error site, not when
* propagating errors. Might be useful to plug in a stack trace here.
*/
struct tu_instance;
struct breadcrumbs_context;
VkResult
__vk_startup_errorf(struct tu_instance *instance,
VkResult error,
bool force_print,
const char *file,
int line,
const char *format,
...) PRINTFLIKE(6, 7);
/* Prints startup errors if TU_DEBUG=startup is set or on a debug driver
* build.
*/
#define vk_startup_errorf(instance, error, format, ...) \
__vk_startup_errorf(instance, error, \
instance->debug_flags & TU_DEBUG_STARTUP, \
__FILE__, __LINE__, format, ##__VA_ARGS__)
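/* Illustrative call (a sketch, not from the original header; the message and
* error code are hypothetical):
*
*    return vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
*                             "failed to open device %s", path);
*/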
void
__tu_finishme(const char *file, int line, const char *format, ...)
PRINTFLIKE(3, 4);
/**
* Print a FINISHME message, including its source location.
*/
#define tu_finishme(format, ...) \
do { \
static bool reported = false; \
if (!reported) { \
__tu_finishme(__FILE__, __LINE__, format, ##__VA_ARGS__); \
reported = true; \
} \
} while (0)
#define tu_stub() \
do { \
tu_finishme("stub %s", __func__); \
} while (0)
struct tu_memory_heap {
/* Standard bits passed on to the client */
VkDeviceSize size;
VkMemoryHeapFlags flags;
/** Copied from ANV:
*
* Driver-internal book-keeping.
*
* Align it to 64 bits to make atomic operations faster on 32 bit platforms.
*/
VkDeviceSize used __attribute__ ((aligned (8)));
};
uint64_t
tu_get_system_heap_size(void);
struct tu_physical_device
{
struct vk_physical_device vk;
struct tu_instance *instance;
const char *name;
uint8_t driver_uuid[VK_UUID_SIZE];
uint8_t device_uuid[VK_UUID_SIZE];
uint8_t cache_uuid[VK_UUID_SIZE];
struct wsi_device wsi_device;
int local_fd;
bool has_local;
int64_t local_major;
int64_t local_minor;
int master_fd;
bool has_master;
int64_t master_major;
int64_t master_minor;
uint32_t gmem_size;
uint64_t gmem_base;
uint32_t ccu_offset_gmem;
uint32_t ccu_offset_bypass;
struct fd_dev_id dev_id;
const struct fd_dev_info *info;
int msm_major_version;
int msm_minor_version;
/* Address space and global fault count for this local_fd with DRM backend */
uint64_t fault_count;
struct tu_memory_heap heap;
struct vk_sync_type syncobj_type;
struct vk_sync_timeline_type timeline_type;
const struct vk_sync_type *sync_types[3];
};
enum tu_debug_flags
{
TU_DEBUG_STARTUP = 1 << 0,
TU_DEBUG_NIR = 1 << 1,
TU_DEBUG_NOBIN = 1 << 3,
TU_DEBUG_SYSMEM = 1 << 4,
TU_DEBUG_FORCEBIN = 1 << 5,
TU_DEBUG_NOUBWC = 1 << 6,
TU_DEBUG_NOMULTIPOS = 1 << 7,
TU_DEBUG_NOLRZ = 1 << 8,
TU_DEBUG_PERFC = 1 << 9,
TU_DEBUG_FLUSHALL = 1 << 10,
TU_DEBUG_SYNCDRAW = 1 << 11,
TU_DEBUG_DONT_CARE_AS_LOAD = 1 << 12,
TU_DEBUG_GMEM = 1 << 13,
TU_DEBUG_RAST_ORDER = 1 << 14,
TU_DEBUG_UNALIGNED_STORE = 1 << 15,
TU_DEBUG_LAYOUT = 1 << 16,
TU_DEBUG_LOG_SKIP_GMEM_OPS = 1 << 17,
TU_DEBUG_PERF = 1 << 18,
TU_DEBUG_NOLRZFC = 1 << 19,
TU_DEBUG_DYNAMIC = 1 << 20,
};
struct tu_instance
{
struct vk_instance vk;
uint32_t api_version;
int physical_device_count;
struct tu_physical_device physical_devices[TU_MAX_DRM_DEVICES];
struct driOptionCache dri_options;
struct driOptionCache available_dri_options;
enum tu_debug_flags debug_flags;
};
VkResult
tu_wsi_init(struct tu_physical_device *physical_device);
void
tu_wsi_finish(struct tu_physical_device *physical_device);
bool
tu_instance_extension_supported(const char *name);
uint32_t
tu_physical_device_api_version(struct tu_physical_device *dev);
bool
tu_physical_device_extension_supported(struct tu_physical_device *dev,
const char *name);
enum tu_bo_alloc_flags
{
TU_BO_ALLOC_NO_FLAGS = 0,
TU_BO_ALLOC_ALLOW_DUMP = 1 << 0,
TU_BO_ALLOC_GPU_READ_ONLY = 1 << 1,
};
struct cache_entry;
struct tu_pipeline_cache
{
struct vk_object_base base;
struct tu_device *device;
pthread_mutex_t mutex;
uint32_t total_size;
uint32_t table_size;
uint32_t kernel_count;
struct cache_entry **hash_table;
bool modified;
VkAllocationCallbacks alloc;
};
struct tu_pipeline_key
{
};
/* queue types */
#define TU_QUEUE_GENERAL 0
#define TU_MAX_QUEUE_FAMILIES 1
/* Keep tu_syncobj until porting to common code for kgsl too */
#ifdef TU_USE_KGSL
struct tu_syncobj;
#endif
struct tu_u_trace_syncobj;
/* tu_timeline_sync is a drm-syncobj-based point type for vk_sync_timeline.
* The handling logic is mostly copied from anv_bo_sync, since it can be used
* in a similar way here.
*/
enum tu_timeline_sync_state {
/** Indicates that this is a new (or newly reset) fence */
TU_TIMELINE_SYNC_STATE_RESET,
/** Indicates that this fence has been submitted to the GPU but is still
* (as far as we know) in use by the GPU.
*/
TU_TIMELINE_SYNC_STATE_SUBMITTED,
TU_TIMELINE_SYNC_STATE_SIGNALED,
};
struct tu_timeline_sync {
struct vk_sync base;
enum tu_timeline_sync_state state;
uint32_t syncobj;
};
struct tu_queue
{
struct vk_queue vk;
struct tu_device *device;
uint32_t msm_queue_id;
int fence;
};
struct tu_bo
{
uint32_t gem_handle;
uint64_t size;
uint64_t iova;
void *map;
int32_t refcnt;
#ifndef TU_USE_KGSL
uint32_t bo_list_idx;
#endif
bool implicit_sync : 1;
};
/* externally-synchronized BO suballocator. */
struct tu_suballocator
{
struct tu_device *dev;
uint32_t default_size;
enum tu_bo_alloc_flags flags;
/** Current BO we're suballocating out of. */
struct tu_bo *bo;
uint32_t next_offset;
/** Optional BO cached for recycling as the next suballoc->bo, instead of having to allocate one. */
struct tu_bo *cached_bo;
};
struct tu_suballoc_bo
{
struct tu_bo *bo;
uint64_t iova;
uint32_t size; /* bytes */
};
void
tu_bo_suballocator_init(struct tu_suballocator *suballoc,
struct tu_device *dev,
uint32_t default_size,
uint32_t flags);
void
tu_bo_suballocator_finish(struct tu_suballocator *suballoc);
VkResult
tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo, struct tu_suballocator *suballoc,
uint32_t size, uint32_t align);
void *
tu_suballoc_bo_map(struct tu_suballoc_bo *bo);
void
tu_suballoc_bo_free(struct tu_suballocator *suballoc, struct tu_suballoc_bo *bo);
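/* Minimal usage sketch of the suballocator API above (illustrative only;
* sizes, flags and error handling are placeholders, not taken from real
* callers):
*
*    struct tu_suballocator suballoc;
*    struct tu_suballoc_bo sub_bo;
*
*    tu_bo_suballocator_init(&suballoc, dev, 128 * 1024, TU_BO_ALLOC_NO_FLAGS);
*    if (tu_suballoc_bo_alloc(&sub_bo, &suballoc, 256, 64) == VK_SUCCESS) {
*       uint32_t *map = tu_suballoc_bo_map(&sub_bo);
*       map[0] = 0;                       // write GPU-visible data at sub_bo.iova
*       tu_suballoc_bo_free(&suballoc, &sub_bo);
*    }
*    tu_bo_suballocator_finish(&suballoc);
*
* Remember the suballocator is externally synchronized (see pipeline_mutex /
* autotune_mutex on tu_device).
*/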
enum global_shader {
GLOBAL_SH_VS_BLIT,
GLOBAL_SH_VS_CLEAR,
GLOBAL_SH_FS_BLIT,
GLOBAL_SH_FS_BLIT_ZSCALE,
GLOBAL_SH_FS_COPY_MS,
GLOBAL_SH_FS_CLEAR0,
GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
GLOBAL_SH_COUNT,
};
/**
* Tracks the results from an individual renderpass. Initially created
* per renderpass, and appended to the tail of at->pending_results. At a later
* time, when the GPU has finished writing the results, we fill samples_passed.
*/
struct tu_renderpass_result {
/* Points into GPU memory */
struct tu_renderpass_samples* samples;
struct tu_suballoc_bo bo;
/*
* Below here, only used internally within autotune
*/
uint64_t rp_key;
struct tu_renderpass_history *history;
struct list_head node;
uint32_t fence;
uint64_t samples_passed;
};
#define TU_BORDER_COLOR_COUNT 4096
#define TU_BORDER_COLOR_BUILTIN 6
#define TU_BLIT_SHADER_SIZE 1024
/* This struct defines the layout of the global_bo */
struct tu6_global
{
/* clear/blit shaders */
uint32_t shaders[TU_BLIT_SHADER_SIZE];
uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */
uint32_t _pad0;
volatile uint32_t vsc_draw_overflow;
uint32_t _pad1;
volatile uint32_t vsc_prim_overflow;
uint32_t _pad2;
uint64_t predicate;
/* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */
struct {
uint32_t offset;
uint32_t pad[7];
} flush_base[4];
ALIGN16 uint32_t cs_indirect_xyz[3];
volatile uint32_t vtx_stats_query_not_running;
/* To know when renderpass stats for autotune are valid */
volatile uint32_t autotune_fence;
/* For recycling command buffers for dynamic suspend/resume commands */
volatile uint32_t dynamic_rendering_fence;
volatile uint32_t dbg_one;
volatile uint32_t dbg_gmem_total_loads;
volatile uint32_t dbg_gmem_taken_loads;
volatile uint32_t dbg_gmem_total_stores;
volatile uint32_t dbg_gmem_taken_stores;
/* Written from GPU */
volatile uint32_t breadcrumb_gpu_sync_seqno;
uint32_t _pad3;
/* Written from CPU, acknowledges value written from GPU */
volatile uint32_t breadcrumb_cpu_sync_seqno;
uint32_t _pad4;
/* note: larger global bo will be used for customBorderColors */
struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
};
#define gb_offset(member) offsetof(struct tu6_global, member)
#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member))
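/* For example (illustrative of how the two macros compose):
*
*    gb_offset(cs_indirect_xyz)          == offsetof(struct tu6_global, cs_indirect_xyz)
*    global_iova(cmd, vsc_draw_overflow) == GPU address of that field in global_bo
*/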
/* extra space in vsc draw/prim streams */
#define VSC_PAD 0x40
struct tu_device
{
struct vk_device vk;
struct tu_instance *instance;
struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
int queue_count[TU_MAX_QUEUE_FAMILIES];
struct tu_physical_device *physical_device;
int fd;
struct ir3_compiler *compiler;
/* Backup in-memory cache to be used if the app doesn't provide one */
struct vk_pipeline_cache *mem_cache;
#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */
/* Currently the kernel driver uses a 32-bit GPU address space, but it
* should be impossible to go beyond 48 bits.
*/
struct {
struct tu_bo *bo;
mtx_t construct_mtx;
bool initialized;
} scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];
struct tu_bo *global_bo;
uint32_t implicit_sync_bo_count;
/* Device-global BO suballocator for reducing BO management overhead for
* (read-only) pipeline state. Synchronized by pipeline_mutex.
*/
struct tu_suballocator pipeline_suballoc;
mtx_t pipeline_mutex;
/* Device-global BO suballocator for reducing BO management overhead for small
* gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
*/
struct tu_suballocator autotune_suballoc;
mtx_t autotune_mutex;
/* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
/* Lazily allocated, protected by the device mutex. */
struct tu_bo *tess_bo;
struct ir3_shader_variant *global_shader_variants[GLOBAL_SH_COUNT];
struct ir3_shader *global_shaders[GLOBAL_SH_COUNT];
uint64_t global_shader_va[GLOBAL_SH_COUNT];
uint32_t vsc_draw_strm_pitch;
uint32_t vsc_prim_strm_pitch;
BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
mtx_t mutex;
/* bo list for submits: */
struct drm_msm_gem_submit_bo *bo_list;
/* map bo handles to bo list index: */
uint32_t bo_count, bo_list_size;
mtx_t bo_mutex;
/* protects imported BOs creation/freeing */
struct u_rwlock dma_bo_lock;
/* This array holds all our 'struct tu_bo' allocations. We use this
* so we can add a refcount to our BOs and check if a particular BO
* was already allocated in this device using its GEM handle. This is
* necessary to properly manage BO imports, because the kernel doesn't
* refcount the underlying BO memory.
*
* Specifically, when self-importing (i.e. importing a BO into the same
* device that created it), the kernel will give us the same BO handle
* for both BOs and we must only free it once when both references are
* freed. Otherwise, if we are not self-importing, we get two different BO
* handles, and we want to free each one individually.
*
* The refcount is also useful for being able to maintain BOs across
* VK object lifetimes, such as pipelines suballocating out of BOs
* allocated on the device.
*/
struct util_sparse_array bo_map;
/* Command streams to set pass index to a scratch reg */
struct tu_cs *perfcntrs_pass_cs;
struct tu_cs_entry *perfcntrs_pass_cs_entries;
struct util_dynarray dynamic_rendering_pending;
VkCommandPool dynamic_rendering_pool;
uint32_t dynamic_rendering_fence;
/* Condition variable for timeline semaphore to notify waiters when a
* new submit is executed. */
pthread_cond_t timeline_cond;
pthread_mutex_t submit_mutex;
struct tu_autotune autotune;
struct breadcrumbs_context *breadcrumbs_ctx;
#ifdef ANDROID
const void *gralloc;
enum {
TU_GRALLOC_UNKNOWN,
TU_GRALLOC_CROS,
TU_GRALLOC_OTHER,
} gralloc_type;
#endif
uint32_t submit_count;
struct u_trace_context trace_context;
#ifdef HAVE_PERFETTO
struct tu_perfetto_state perfetto;
#endif
bool use_z24uint_s8uint;
};
void tu_init_clear_blit_shaders(struct tu_device *dev);
void tu_destroy_clear_blit_shaders(struct tu_device *dev);
VkResult tu_init_dynamic_rendering(struct tu_device *dev);
void tu_destroy_dynamic_rendering(struct tu_device *dev);
VkResult tu_insert_dynamic_cmdbufs(struct tu_device *dev,
struct tu_cmd_buffer ***cmds_ptr,
uint32_t *size);
VkResult
tu_device_submit_deferred_locked(struct tu_device *dev);
VkResult
tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj);
uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
VkResult
tu_device_check_status(struct vk_device *vk_device);
VkResult
tu_bo_init_new(struct tu_device *dev, struct tu_bo **bo, uint64_t size,
enum tu_bo_alloc_flags flags);
VkResult
tu_bo_init_dmabuf(struct tu_device *dev,
struct tu_bo **bo,
uint64_t size,
int fd);
int
tu_bo_export_dmabuf(struct tu_device *dev, struct tu_bo *bo);
void
tu_bo_finish(struct tu_device *dev, struct tu_bo *bo);
VkResult
tu_bo_map(struct tu_device *dev, struct tu_bo *bo);
static inline struct tu_bo *
tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
{
return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
}
static inline struct tu_bo *
tu_bo_get_ref(struct tu_bo *bo)
{
p_atomic_inc(&bo->refcnt);
return bo;
}
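/* Sketch of how bo_map and the refcount combine on an import path
* (hypothetical pseudo-flow for illustration; see the DRM backend for the
* real logic):
*
*    struct tu_bo *bo = tu_device_lookup_bo(dev, gem_handle);
*    if (bo->refcnt != 0) {
*       // Self-import: the kernel returned a handle we already track,
*       // so just take another reference.
*       tu_bo_get_ref(bo);
*    } else {
*       // First time this handle is seen: initialize the tu_bo entry.
*    }
*/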
/* Get a scratch bo for use inside a command buffer. This will always return
* the same bo given the same size or similar sizes, so only one scratch bo
* can be used at the same time. It's meant for short-lived things where we
* need to write to some piece of memory, read from it, and then immediately
* discard it.
*/
VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
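/* Illustrative usage (a minimal sketch; the size is a placeholder):
*
*    struct tu_bo *scratch;
*    if (tu_get_scratch_bo(dev, 4096, &scratch) == VK_SUCCESS) {
*       // Emit commands that write to scratch->iova and read it back within
*       // this command buffer; the device owns the BO, so don't free it.
*    }
*/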
struct tu_cs_entry
{
/* No ownership */
const struct tu_bo *bo;
uint32_t size;
uint32_t offset;
};
struct tu_cs_memory {
uint32_t *map;
uint64_t iova;
};
struct tu_draw_state {
uint64_t iova : 48;
uint32_t size : 16;
};
enum tu_dynamic_state
{
/* re-use VK_DYNAMIC_STATE_ enums for non-extended dynamic states */
TU_DYNAMIC_STATE_SAMPLE_LOCATIONS = VK_DYNAMIC_STATE_STENCIL_REFERENCE + 1,
TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
TU_DYNAMIC_STATE_RB_STENCIL_CNTL,
TU_DYNAMIC_STATE_VB_STRIDE,
TU_DYNAMIC_STATE_RASTERIZER_DISCARD,
TU_DYNAMIC_STATE_BLEND,
TU_DYNAMIC_STATE_COUNT,
/* no associated draw state: */
TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY = TU_DYNAMIC_STATE_COUNT,
TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE,
TU_DYNAMIC_STATE_LOGIC_OP,
TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE,
/* re-use the line width enum as it uses GRAS_SU_CNTL: */
TU_DYNAMIC_STATE_GRAS_SU_CNTL = VK_DYNAMIC_STATE_LINE_WIDTH,
};
enum tu_draw_state_group_id
{
TU_DRAW_STATE_PROGRAM_CONFIG,
TU_DRAW_STATE_PROGRAM,
TU_DRAW_STATE_PROGRAM_BINNING,
TU_DRAW_STATE_VB,
TU_DRAW_STATE_VI,
TU_DRAW_STATE_VI_BINNING,
TU_DRAW_STATE_RAST,
TU_DRAW_STATE_CONST,
TU_DRAW_STATE_DESC_SETS,
TU_DRAW_STATE_DESC_SETS_LOAD,
TU_DRAW_STATE_VS_PARAMS,
TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
TU_DRAW_STATE_PRIM_MODE_GMEM,
TU_DRAW_STATE_PRIM_MODE_SYSMEM,
/* dynamic state related draw states */
TU_DRAW_STATE_DYNAMIC,
TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};
enum tu_cs_mode
{
/*
* A command stream in TU_CS_MODE_GROW mode grows automatically whenever it
* is full. tu_cs_begin must be called before command packet emission and
* tu_cs_end must be called after.
*
* This mode may create multiple entries internally. The entries must be
* submitted together.
*/
TU_CS_MODE_GROW,
/*
* A command stream in TU_CS_MODE_EXTERNAL mode wraps an external,
* fixed-size buffer. tu_cs_begin and tu_cs_end are optional and have no
* effect on it.
*
* This mode does not create any entry or any BO.
*/
TU_CS_MODE_EXTERNAL,
/*
* A command stream in TU_CS_MODE_SUB_STREAM mode does not support direct
* command packet emission. tu_cs_begin_sub_stream must be called to get a
* sub-stream to emit command packets to. When done with the sub-stream,
* tu_cs_end_sub_stream must be called.
*
* This mode does not create any entry internally.
*/
TU_CS_MODE_SUB_STREAM,
};
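/* Illustrative sketch of the TU_CS_MODE_SUB_STREAM flow described above.
* tu_cs_begin_sub_stream()/tu_cs_end_sub_stream() are named in the comment;
* the exact signatures and emit helpers (tu_cs_emit() etc.) live in tu_cs.h
* and are assumed here:
*
*    struct tu_cs sub_cs;
*    if (tu_cs_begin_sub_stream(&cmd->sub_cs, 64, &sub_cs) == VK_SUCCESS) {
*       tu_cs_emit(&sub_cs, 0);   // placeholder dword, up to 64 in this sub-stream
*       struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &sub_cs);
*       // 'entry' can then be referenced (e.g. as a draw state) from another CS.
*    }
*/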
#define TU_COND_EXEC_STACK_SIZE 4
struct tu_cs
{
uint32_t *start;
uint32_t *cur;
uint32_t *reserved_end;
uint32_t *end;
struct tu_device *device;
enum tu_cs_mode mode;
uint32_t next_bo_size;
struct tu_cs_entry *entries;
uint32_t entry_count;
uint32_t entry_capacity;
struct tu_bo **bos;
uint32_t bo_count;
uint32_t bo_capacity;
/* Optional BO that this CS is sub-allocated from for TU_CS_MODE_SUB_STREAM */
struct tu_bo *refcount_bo;
/* state for cond_exec_start/cond_exec_end */
uint32_t cond_stack_depth;
uint32_t cond_flags[TU_COND_EXEC_STACK_SIZE];
uint32_t *cond_dwords[TU_COND_EXEC_STACK_SIZE];
uint32_t breadcrumb_emit_after;
};
struct tu_device_memory
{
struct vk_object_base base;
struct tu_bo *bo;
};
struct tu_descriptor_range
{
uint64_t va;
uint32_t size;
};
struct tu_descriptor_set
{
struct vk_object_base base;
/* Link to descriptor pool's desc_sets list. */
struct list_head pool_link;
struct tu_descriptor_set_layout *layout;
struct tu_descriptor_pool *pool;
uint32_t size;
uint64_t va;
uint32_t *mapped_ptr;
uint32_t *dynamic_descriptors;
};
struct tu_descriptor_pool_entry
{
uint32_t offset;
uint32_t size;
struct tu_descriptor_set *set;
};
struct tu_descriptor_pool
{
struct vk_object_base base;
struct tu_bo *bo;
uint64_t current_offset;
uint64_t size;
uint8_t *host_memory_base;
uint8_t *host_memory_ptr;
uint8_t *host_memory_end;
uint8_t *host_bo;
struct list_head desc_sets;
uint32_t entry_count;
uint32_t max_entry_count;
struct tu_descriptor_pool_entry entries[0];
};
struct tu_descriptor_update_template_entry
{
VkDescriptorType descriptor_type;
/* The number of descriptors to update */
uint32_t descriptor_count;
/* Into mapped_ptr or dynamic_descriptors, in units of the respective array
*/
uint32_t dst_offset;
/* In dwords. Not valid/used for dynamic descriptors */
uint32_t dst_stride;
uint32_t buffer_offset;
/* Only valid for combined image samplers and samplers */
uint16_t has_sampler;
/* In bytes */
size_t src_offset;
size_t src_stride;
/* For push descriptors */
const struct tu_sampler *immutable_samplers;
};
struct tu_descriptor_update_template
{
struct vk_object_base base;
uint32_t entry_count;
VkPipelineBindPoint bind_point;
struct tu_descriptor_update_template_entry entry[0];
};
struct tu_buffer
{
struct vk_object_base base;
VkDeviceSize size;
VkBufferUsageFlags usage;
VkBufferCreateFlags flags;
struct tu_bo *bo;
uint64_t iova;
};
const char *
tu_get_debug_option_name(int id);
const char *
tu_get_perftest_option_name(int id);
struct tu_attachment_info
{
struct tu_image_view *attachment;
};
struct tu_framebuffer
{
struct vk_object_base base;
uint32_t width;
uint32_t height;
uint32_t layers;
/* size of the first tile */
VkExtent2D tile0;
/* number of tiles */
VkExtent2D tile_count;
/* size of the first VSC pipe */
VkExtent2D pipe0;
/* number of VSC pipes */
VkExtent2D pipe_count;
/* Whether binning should be used for gmem rendering using this framebuffer. */
bool binning;
/* Whether binning could be used for gmem rendering using this framebuffer. */
bool binning_possible;
/* pipe register values */
uint32_t pipe_config[MAX_VSC_PIPES];
uint32_t pipe_sizes[MAX_VSC_PIPES];
uint32_t attachment_count;
struct tu_attachment_info attachments[0];
};
struct tu_subpass_barrier {
VkPipelineStageFlags2 src_stage_mask;
VkPipelineStageFlags2 dst_stage_mask;
VkAccessFlags2 src_access_mask;
VkAccessFlags2 dst_access_mask;
bool incoherent_ccu_color, incoherent_ccu_depth;
};
struct tu_subpass_attachment
{
uint32_t attachment;
/* For input attachments, true if it needs to be patched to refer to GMEM
* in GMEM mode. This is false if it hasn't already been written as an
* attachment.
*/
bool patch_input_gmem;
};
struct tu_subpass
{
uint32_t input_count;
uint32_t color_count;
uint32_t resolve_count;
bool resolve_depth_stencil;
bool feedback_loop_color;
bool feedback_loop_ds;
/* True if we must invalidate UCHE thanks to a feedback loop. */
bool feedback_invalidate;
/* In other words - framebuffer fetch support */
bool raster_order_attachment_access;
struct tu_subpass_attachment *input_attachments;
struct tu_subpass_attachment *color_attachments;
struct tu_subpass_attachment *resolve_attachments;
struct tu_subpass_attachment depth_stencil_attachment;
VkSampleCountFlagBits samples;
uint32_t srgb_cntl;
uint32_t multiview_mask;
struct tu_subpass_barrier start_barrier;
};
struct tu_render_pass_attachment
{
VkFormat format;
uint32_t samples;
uint32_t cpp;
VkImageAspectFlags clear_mask;
uint32_t clear_views;
bool load;
bool store;
int32_t gmem_offset;
bool will_be_resolved;
/* for D32S8 separate stencil: */
bool load_stencil;
bool store_stencil;
bool cond_load_allowed;
bool cond_store_allowed;
int32_t gmem_offset_stencil;
};
struct tu_render_pass
{
struct vk_object_base base;
uint32_t attachment_count;
uint32_t subpass_count;
uint32_t gmem_pixels;
uint32_t tile_align_w;
/* memory bandwidth costs (in bytes) for gmem / sysmem rendering */
uint32_t gmem_bandwidth_per_pixel;
uint32_t sysmem_bandwidth_per_pixel;
struct tu_subpass_attachment *subpass_attachments;
struct tu_render_pass_attachment *attachments;
struct tu_subpass_barrier end_barrier;
struct tu_subpass subpasses[0];
};
void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
const struct tu_device *device,
const struct tu_render_pass *pass);
struct tu_descriptor_state
{
struct tu_descriptor_set *sets[MAX_SETS];
struct tu_descriptor_set push_set;
uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
};
enum tu_cmd_dirty_bits
{
TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
TU_CMD_DIRTY_VB_STRIDE = BIT(1),
TU_CMD_DIRTY_GRAS_SU_CNTL = BIT(2),
TU_CMD_DIRTY_RB_DEPTH_CNTL = BIT(3),
TU_CMD_DIRTY_RB_STENCIL_CNTL = BIT(4),
TU_CMD_DIRTY_DESC_SETS_LOAD = BIT(5),
TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD = BIT(6),
TU_CMD_DIRTY_SHADER_CONSTS = BIT(7),
TU_CMD_DIRTY_LRZ = BIT(8),
TU_CMD_DIRTY_VS_PARAMS = BIT(9),
TU_CMD_DIRTY_RASTERIZER_DISCARD = BIT(10),
TU_CMD_DIRTY_VIEWPORTS = BIT(11),
TU_CMD_DIRTY_BLEND = BIT(12),
/* all draw states were disabled and need to be re-enabled: */
TU_CMD_DIRTY_DRAW_STATE = BIT(13)
};
/* There are only three cache domains we have to care about: the CCU, or
* color cache unit, which is used for color and depth/stencil attachments
* and copy/blit destinations, and is split conceptually into color and depth,
* and the universal cache or UCHE which is used for pretty much everything
* else, except for the CP (uncached) and host. We need to flush whenever data
* crosses these boundaries.
*/
enum tu_cmd_access_mask {
TU_ACCESS_UCHE_READ = 1 << 0,
TU_ACCESS_UCHE_WRITE = 1 << 1,
TU_ACCESS_CCU_COLOR_READ = 1 << 2,
TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,
/* Experiments have shown that while it's safe to avoid flushing the CCU
* after each blit/renderpass, it's not safe to assume that subsequent
* lookups with a different attachment state will hit unflushed cache
* entries. That is, the CCU needs to be flushed and possibly invalidated
* when accessing memory with a different attachment state. Writing to an
* attachment under the following conditions after clearing using the
* normal 2d engine path is known to have issues:
*
* - It isn't the 0'th layer.
* - There is more than one attachment, and this isn't the 0'th attachment
* (this seems to also depend on the cpp of the attachments).
*
* Our best guess is that the layer/MRT state is used when computing
* the location of a cache entry in CCU, to avoid conflicts. We assume that
* any access in a renderpass after or before an access by a transfer needs
* a flush/invalidate, and use the _INCOHERENT variants to represent access
* by a renderpass.
*/
TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,
/* Accesses which bypass any cache, e.g. writes via the host,
* CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
*/
TU_ACCESS_SYSMEM_READ = 1 << 10,
TU_ACCESS_SYSMEM_WRITE = 1 << 11,
/* Memory writes from the CP start in-order with draws and event writes,
* but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
*/
TU_ACCESS_CP_WRITE = 1 << 12,
TU_ACCESS_READ =
TU_ACCESS_UCHE_READ |
TU_ACCESS_CCU_COLOR_READ |
TU_ACCESS_CCU_DEPTH_READ |
TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
TU_ACCESS_SYSMEM_READ,
TU_ACCESS_WRITE =
TU_ACCESS_UCHE_WRITE |
TU_ACCESS_CCU_COLOR_WRITE |
TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
TU_ACCESS_CCU_DEPTH_WRITE |
TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
TU_ACCESS_SYSMEM_WRITE |
TU_ACCESS_CP_WRITE,
TU_ACCESS_ALL =
TU_ACCESS_READ |
TU_ACCESS_WRITE,
};
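/* Worked example of crossing cache domains (illustrative; the actual mapping
* of access masks to flush bits lives in the command-buffer code): a
* TU_ACCESS_CP_WRITE that is later read through UCHE by a shader needs both a
* wait for memory writes (so the asynchronous CP write has landed) and a
* cache invalidate (so UCHE doesn't return stale data), per the notes above.
*/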
/* Starting with a6xx, the pipeline is split into several "clusters" (really
* pipeline stages). Each stage has its own pair of register banks and can
* switch them independently, so that earlier stages can run ahead of later
* ones. e.g. the FS of draw N and the VS of draw N + 1 can be executing at
* the same time.
*
* As a result of this, we need to insert a WFI when an earlier stage depends
* on the result of a later stage. CP_DRAW_* and CP_BLIT will wait for any
* pending WFI's to complete before starting, and usually before reading
* indirect params even, so a WFI also acts as a full "pipeline stall".
*
* Note, the names of the stages come from CLUSTER_* in devcoredump. We
* include all the stages for completeness, even ones which do not read/write
* anything.
*/
enum tu_stage {
/* This doesn't correspond to a cluster, but we need it for tracking
* indirect draw parameter reads etc.
*/
TU_STAGE_CP,
/* - Fetch index buffer
* - Fetch vertex attributes, dispatch VS
*/
TU_STAGE_FE,
/* Execute all geometry stages (VS thru GS) */
TU_STAGE_SP_VS,
/* Write to VPC, do primitive assembly. */
TU_STAGE_PC_VS,
/* Rasterization. RB_DEPTH_BUFFER_BASE only exists in CLUSTER_PS according
* to devcoredump so presumably this stage stalls for TU_STAGE_PS when
* early depth testing is enabled before dispatching fragments? However
* GRAS reads and writes LRZ directly.
*/
TU_STAGE_GRAS,
/* Execute FS */
TU_STAGE_SP_PS,
/* - Fragment tests
* - Write color/depth
* - Streamout writes (???)
* - Varying interpolation (???)
*/
TU_STAGE_PS,
};
enum tu_cmd_flush_bits {
TU_CMD_FLAG_CCU_FLUSH_DEPTH = 1 << 0,
TU_CMD_FLAG_CCU_FLUSH_COLOR = 1 << 1,
TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
TU_CMD_FLAG_CACHE_FLUSH = 1 << 4,
TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 6,
TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 7,
TU_CMD_FLAG_WAIT_FOR_ME = 1 << 8,
TU_CMD_FLAG_ALL_FLUSH =
TU_CMD_FLAG_CCU_FLUSH_DEPTH |
TU_CMD_FLAG_CCU_FLUSH_COLOR |
TU_CMD_FLAG_CACHE_FLUSH |
/* Treat the CP as a sort of "cache" which may need to be "flushed" via
* waiting for writes to land with WAIT_FOR_MEM_WRITES.
*/
TU_CMD_FLAG_WAIT_MEM_WRITES,
TU_CMD_FLAG_ALL_INVALIDATE =
TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
TU_CMD_FLAG_CACHE_INVALIDATE |
/* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when a
* command that needs CP_WAIT_FOR_ME is executed. This means we may
* insert an extra WAIT_FOR_ME before an indirect command requiring it
* in case there was another command before the current command buffer
* that it needs to wait for.
*/
TU_CMD_FLAG_WAIT_FOR_ME,
};
/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
* heavy, involving a CCU cache flush/invalidate and a WFI in order to change
* which part of the gmem is used by the CCU. Here we keep track of the
* current state of the CCU.
*/
enum tu_cmd_ccu_state {
TU_CMD_CCU_SYSMEM,
TU_CMD_CCU_GMEM,
TU_CMD_CCU_UNKNOWN,
};
struct tu_cache_state {
/* Caches which must be made available (flushed) eventually if there are
* any users outside that cache domain, and caches which must be
* invalidated eventually if there are any reads.
*/
enum tu_cmd_flush_bits pending_flush_bits;
/* Pending flushes */
enum tu_cmd_flush_bits flush_bits;
};
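/* Example of the pending vs. immediate split above (illustrative): a color
* attachment write leaves CCU_FLUSH_COLOR in pending_flush_bits; it is only
* promoted to flush_bits (and actually emitted) once some access outside the
* CCU color domain needs to observe that data.
*/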
enum tu_lrz_force_disable_mask {
TU_LRZ_FORCE_DISABLE_LRZ = 1 << 0,
TU_LRZ_FORCE_DISABLE_WRITE = 1 << 1,
};
enum tu_lrz_direction {
TU_LRZ_UNKNOWN,
/* Depth func less/less-than: */
TU_LRZ_LESS,
/* Depth func greater/greater-than: */
TU_LRZ_GREATER,
};
struct tu_lrz_pipeline
{
uint32_t force_disable_mask;
bool fs_has_kill;
bool force_late_z;
bool early_fragment_tests;
};
struct tu_lrz_state
{
/* Depth/stencil image currently in use for LRZ */
const struct tu_image_view *image_view;
VkClearValue depth_clear_value;
/* If LRZ is in invalid state we cannot use it until depth is cleared */
bool valid : 1;
/* Allows temporarily disabling LRZ */
bool enabled : 1;
bool fast_clear : 1;
bool gpu_dir_tracking : 1;
/* Continue using old LRZ state (LOAD_OP_LOAD of depth) */
bool reuse_previous_state : 1;
enum tu_lrz_direction prev_direction;
};
struct tu_vs_params {
uint32_t vertex_offset;
uint32_t first_instance;
};
/* This should be for state that is set inside a renderpass and used at
* renderpass end time, e.g. to decide whether to use sysmem. This needs
* special handling for secondary cmdbufs and suspending/resuming render
* passes where the state may need to be combined afterwards.
*/
struct tu_render_pass_state
{
bool xfb_used;
bool has_tess;
bool has_prim_generated_query_in_rp;
bool disable_gmem;
/* Track whether conditional predicate for COND_REG_EXEC is changed in draw_cs */
bool draw_cs_writes_to_cond_pred;
uint32_t drawcall_count;
/* A calculated "draw cost" value for renderpass, which tries to
* estimate the bandwidth-per-sample of all the draws according
* to:
*
* foreach_draw (...) {
* sum += pipeline->color_bandwidth_per_sample;
* if (depth_test_enabled)
* sum += pipeline->depth_cpp_per_sample;
* if (depth_write_enabled)
* sum += pipeline->depth_cpp_per_sample;
* if (stencil_write_enabled)
* sum += pipeline->stencil_cpp_per_sample * 2;
* }
* drawcall_bandwidth_per_sample = sum / drawcall_count;
*
* It allows us to estimate the total bandwidth of drawcalls later, by
* calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
*
* This does ignore depth buffer traffic for samples which do not
* pass due to depth-test fail, and some other details. But it is
* just intended to be a rough estimate that is easy to calculate.
*/
uint32_t drawcall_bandwidth_per_sample_sum;
};
void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
const struct tu_render_pass_state *src);
struct tu_cmd_state
{
uint32_t dirty;
struct tu_pipeline *pipeline;
struct tu_pipeline *compute_pipeline;
struct tu_render_pass_state rp;
/* Vertex buffers, viewports, and scissors:
* their state can be updated partially, so we need to save it here to be
* able to emit a complete draw state
*/
struct {
uint64_t base;
uint32_t size;
uint32_t stride;
} vb[MAX_VBS];
VkViewport viewport[MAX_VIEWPORTS];
VkRect2D scissor[MAX_SCISSORS];
uint32_t max_viewport, max_scissor;
/* for dynamic states that can't be emitted directly */
uint32_t dynamic_stencil_mask;
uint32_t dynamic_stencil_wrmask;
uint32_t dynamic_stencil_ref;
uint32_t gras_su_cntl, rb_depth_cntl, rb_stencil_cntl;
uint32_t pc_raster_cntl, vpc_unknown_9107;
uint32_t rb_mrt_control[MAX_RTS], rb_mrt_blend_control[MAX_RTS];
uint32_t rb_mrt_control_rop;
uint32_t rb_blend_cntl, sp_blend_cntl;
uint32_t pipeline_color_write_enable, pipeline_blend_enable;
uint32_t color_write_enable;
bool logic_op_enabled;
bool rop_reads_dst;
enum pc_di_primtype primtype;
bool primitive_restart_enable;
/* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
struct tu_draw_state vertex_buffers;
struct tu_draw_state shader_const;
struct tu_draw_state desc_sets;
struct tu_draw_state vs_params;
/* Index buffer */
uint64_t index_va;
uint32_t max_index_count;
uint8_t index_size;
/* because streamout base has to be 32-byte aligned
* there is an extra offset to deal with when it is
* unaligned
*/
uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];
/* Renderpasses are tricky, because we may need to flush differently if
* using sysmem vs. gmem and therefore we have to delay any flushing that
* happens before a renderpass. So we have to have two copies of the flush
* state, one for intra-renderpass flushes (i.e. renderpass dependencies)
* and one for outside a renderpass.
*/
struct tu_cache_state cache;
struct tu_cache_state renderpass_cache;
enum tu_cmd_ccu_state ccu_state;
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
VkRect2D render_area;
const struct tu_image_view **attachments;
/* State that in the dynamic case comes from VkRenderingInfo and needs to
* be saved/restored when suspending. This holds the state for the last
* suspended renderpass, which may point to this command buffer's dynamic_*
* or another command buffer if executed on a secondary.
*/
struct {
const struct tu_render_pass *pass;
const struct tu_subpass *subpass;
const struct tu_framebuffer *framebuffer;
VkRect2D render_area;
const struct tu_image_view **attachments;
struct tu_lrz_state lrz;
} suspended_pass;
bool tessfactor_addr_set;
bool predication_active;
enum a5xx_line_mode line_mode;
bool z_negative_one_to_one;
/* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
* VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
* but they use the same {START,STOP}_PRIMITIVE_CTRS control.
*/
uint32_t prim_counters_running;
bool prim_generated_query_running_before_rp;
/* These are the states of the suspend/resume state machine. In addition to
* tracking whether we're in the middle of a chain of suspending and
* resuming passes that will be merged, we need to track whether the
* command buffer begins in the middle of such a chain, for when it gets
* merged with other command buffers. We call such a chain that begins
* before the command buffer starts a "pre-chain".
*
* Note that when this command buffer is finished, this state is untouched
* but it gains a different meaning. For example, if we finish in state
* SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
* there's a suspend/resume chain that extends past the end of the command
* buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
* means that there's a suspend/resume chain that extends before the
* beginning.
*/
enum {
/* Either there are no suspend/resume chains, or they are entirely
* contained in the current command buffer.
*
* BeginCommandBuffer() <- start of current command buffer
* ...
* // we are here
*/
SR_NONE = 0,
/* We are in the middle of a suspend/resume chain that starts before the
* current command buffer. This happens when the command buffer begins
* with a resuming render pass and all of the passes up to the current
* one are suspending. In this state, our part of the chain is not saved
* and is in the current draw_cs/state.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_PRE_CHAIN,
/* We are currently outside of any suspend/resume chains, but there is a
* chain starting before the current command buffer. It is saved in
* pre_chain.
*
* BeginRendering() ... EndRendering(suspending)
* BeginCommandBuffer() <- start of current command buffer
* // This part is stashed in pre_chain
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* BeginRendering(resuming) ... EndRendering() // end of chain
* ...
* // we are here
*/
SR_AFTER_PRE_CHAIN,
/* We are in the middle of a suspend/resume chain and there is no chain
* starting before the current command buffer.
*
* BeginCommandBuffer() <- start of current command buffer
* ...
* BeginRendering() ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_CHAIN,
/* We are in the middle of a suspend/resume chain and there is another,
* separate, chain starting before the current command buffer.
*
* BeginRendering() ... EndRendering(suspending)
* CommandBufferBegin() <- start of current command buffer
* // This part is stashed in pre_chain
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* BeginRendering(resuming) ... EndRendering() // end of chain
* ...
* BeginRendering() ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* BeginRendering(resuming) ... EndRendering(suspending)
* ...
* // we are here
*/
SR_IN_CHAIN_AFTER_PRE_CHAIN,
} suspend_resume;
bool suspending, resuming;
struct tu_lrz_state lrz;
struct tu_draw_state lrz_and_depth_plane_state;
struct tu_vs_params last_vs_params;
};
struct tu_cmd_pool
{
struct vk_command_pool vk;
struct list_head cmd_buffers;
struct list_head free_cmd_buffers;
};
enum tu_cmd_buffer_status
{
TU_CMD_BUFFER_STATUS_INVALID,
TU_CMD_BUFFER_STATUS_INITIAL,
TU_CMD_BUFFER_STATUS_RECORDING,
TU_CMD_BUFFER_STATUS_EXECUTABLE,
TU_CMD_BUFFER_STATUS_PENDING,
};
struct tu_cmd_buffer
{
struct vk_command_buffer vk;
struct tu_device *device;
struct tu_cmd_pool *pool;
struct list_head pool_link;
struct u_trace trace;
struct u_trace_iterator trace_renderpass_start;
struct u_trace_iterator trace_renderpass_end;
struct list_head renderpass_autotune_results;
struct tu_autotune_results_buffer* autotune_buffer;
VkCommandBufferUsageFlags usage_flags;
enum tu_cmd_buffer_status status;
VkQueryPipelineStatisticFlags inherited_pipeline_statistics;
struct tu_cmd_state state;
uint32_t queue_family_index;
uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
VkShaderStageFlags push_constant_stages;
struct tu_descriptor_set meta_push_descriptors;
struct tu_descriptor_state descriptors[MAX_BIND_POINTS];
struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1)];
struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1)];
struct tu_render_pass dynamic_pass;
struct tu_subpass dynamic_subpass;
struct tu_framebuffer dynamic_framebuffer;
VkResult record_result;
struct tu_cs cs;
struct tu_cs draw_cs;
struct tu_cs tile_store_cs;
struct tu_cs draw_epilogue_cs;
struct tu_cs sub_cs;
/* If the first render pass in the command buffer is resuming, then it is
* part of a suspend/resume chain that starts before the current command
* buffer and needs to be merged later. In this case, its incomplete state
* is stored in pre_chain. In the symmetric case where the last render pass
* is suspending, we just skip ending the render pass and its state is
* stored in draw_cs/the current state. The first and last render pass
* might be part of different chains, which is why all the state may need
* to be saved separately here.
*/
struct {
struct tu_cs draw_cs;
struct tu_cs draw_epilogue_cs;
struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;
struct tu_render_pass_state state;
} pre_chain;
uint32_t vsc_draw_strm_pitch;
uint32_t vsc_prim_strm_pitch;
};
/* Temporary struct for tracking a register state to be written, used by
* a6xx-pack.h and tu_cs_emit_regs()
*/
struct tu_reg_value {
uint32_t reg;
uint64_t value;
bool is_address;
struct tu_bo *bo;
bool bo_write;
uint32_t bo_offset;
uint32_t bo_shift;
};
VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
VkCommandBufferUsageFlags usage_flags);
void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
struct tu_cs *cs);
void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
struct tu_cs *cs,
enum tu_cmd_ccu_state ccu_state);
void tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
const VkRenderingInfo *pRenderingInfo);
void tu_setup_dynamic_inheritance(struct tu_cmd_buffer *cmd_buffer,
const VkCommandBufferInheritanceRenderingInfo *info);
void tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
const VkRenderingInfo *pRenderingInfo);
void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *secondary);
void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
struct tu_cmd_buffer *suspended);
void tu_cmd_render(struct tu_cmd_buffer *cmd);
void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
enum vgt_event_type event);
static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
VkPipelineBindPoint bind_point)
{
return &cmd_buffer->descriptors[bind_point];
}
struct tu_event
{
struct vk_object_base base;
struct tu_bo *bo;
};
struct tu_push_constant_range
{
uint32_t lo;
uint32_t dwords;
};
struct tu_shader
{
struct ir3_shader *ir3_shader;
struct tu_push_constant_range push_consts;
uint8_t active_desc_sets;
bool multi_pos_output;
};
struct tu_shader_key {
unsigned multiview_mask;
bool force_sample_interp;
enum ir3_wavesize_option api_wavesize, real_wavesize;
};
struct tu_compiled_shaders
{
struct vk_pipeline_cache_object base;
struct tu_push_constant_range push_consts[MESA_SHADER_STAGES];
uint8_t active_desc_sets;
bool multi_pos_output;
struct ir3_shader_variant *variants[MESA_SHADER_STAGES];
};
extern const struct vk_pipeline_cache_object_ops tu_shaders_ops;
bool
tu_nir_lower_multiview(nir_shader *nir, uint32_t mask, bool *multi_pos_output,
struct tu_device *dev);
nir_shader *
tu_spirv_to_nir(struct tu_device *dev,
void *mem_ctx,
const VkPipelineShaderStageCreateInfo *stage_info,
gl_shader_stage stage);
struct tu_shader *
tu_shader_create(struct tu_device *dev,
nir_shader *nir,
const struct tu_shader_key *key,
struct tu_pipeline_layout *layout,
const VkAllocationCallbacks *alloc);
void
tu_shader_destroy(struct tu_device *dev,
struct tu_shader *shader,
const VkAllocationCallbacks *alloc);
static inline bool
tu6_shared_constants_enable(const struct tu_pipeline_layout *layout,
const struct ir3_compiler *compiler)
{
return layout->push_constant_size > 0 &&
layout->push_constant_size <= (compiler->shared_consts_size * 16);
}
struct tu_program_descriptor_linkage
{
struct ir3_const_state const_state;
uint32_t constlen;
struct tu_push_constant_range push_consts;
};
struct tu_pipeline_executable {
gl_shader_stage stage;
struct ir3_info stats;
bool is_binning;
char *nir_from_spirv;
char *nir_final;
char *disasm;
};
struct tu_pipeline
{
struct vk_object_base base;
struct tu_cs cs;
struct tu_suballoc_bo bo;
/* Separate BO for private memory since it should be GPU writable */
struct tu_bo *pvtmem_bo;
bool need_indirect_descriptor_sets;
VkShaderStageFlags active_stages;
uint32_t active_desc_sets;
/* mask of enabled dynamic states
* if BIT(i) is set, pipeline->dynamic_state[i] is *NOT* used
*/
uint32_t dynamic_state_mask;
struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
/* for dynamic states which use the same register: */
uint32_t gras_su_cntl, gras_su_cntl_mask;
uint32_t rb_depth_cntl, rb_depth_cntl_mask;
uint32_t rb_stencil_cntl, rb_stencil_cntl_mask;
uint32_t pc_raster_cntl, pc_raster_cntl_mask;
uint32_t vpc_unknown_9107, vpc_unknown_9107_mask;
uint32_t stencil_wrmask;
unsigned num_rts;
uint32_t rb_mrt_control[MAX_RTS], rb_mrt_control_mask;
uint32_t rb_mrt_blend_control[MAX_RTS];
uint32_t sp_blend_cntl, sp_blend_cntl_mask;
uint32_t rb_blend_cntl, rb_blend_cntl_mask;
uint32_t color_write_enable, blend_enable;
bool logic_op_enabled, rop_reads_dst;
bool rasterizer_discard;
bool rb_depth_cntl_disable;
enum a5xx_line_mode line_mode;
/* draw states for the pipeline */
struct tu_draw_state load_state, rast_state;
struct tu_draw_state prim_order_state_sysmem, prim_order_state_gmem;
/* for vertex buffers state */
uint32_t num_vbs;
struct tu_push_constant_range shared_consts;
struct
{
struct tu_draw_state config_state;
struct tu_draw_state state;
struct tu_draw_state binning_state;
struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
} program;
struct
{
struct tu_draw_state state;
struct tu_draw_state binning_state;
} vi;
struct
{
enum pc_di_primtype primtype;
bool primitive_restart;
} ia;
struct
{
uint32_t patch_type;
uint32_t param_stride;
bool upper_left_domain_origin;
} tess;
struct
{
uint32_t local_size[3];
uint32_t subgroup_size;
} compute;
bool provoking_vertex_last;
struct tu_lrz_pipeline lrz;
/* In other words - framebuffer fetch support */
bool raster_order_attachment_access;
bool subpass_feedback_loop_ds;
bool z_negative_one_to_one;
/* memory bandwidth cost (in bytes) for color attachments */
uint32_t color_bandwidth_per_sample;
uint32_t depth_cpp_per_sample;
uint32_t stencil_cpp_per_sample;
void *executables_mem_ctx;
/* tu_pipeline_executable */
struct util_dynarray executables;
};
struct tu_image;
void
tu6_clear_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image, const VkClearValue *value);
void
tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd, struct tu_cs *cs, struct tu_image* image);
void
tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
struct tu_image *image);
void
tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
struct tu_image *image,
const VkClearDepthStencilValue *pDepthStencil,
uint32_t rangeCount,
const VkImageSubresourceRange *pRanges);
void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd,
const VkClearValue *clear_values);
void
tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd,
const VkClearValue *clear_values);
void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd);
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void
tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd);
void
tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewport, uint32_t num_viewport,
bool z_negative_one_to_one);
void
tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scs, uint32_t scissor_count);
void
tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc);
void
tu6_emit_depth_bias(struct tu_cs *cs,
float constant_factor,
float clamp,
float slope_factor);
void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
enum a5xx_line_mode line_mode);
void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);
void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);
uint32_t tu6_rb_mrt_control_rop(VkLogicOp op, bool *rop_reads_dst);
void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
void tu6_apply_depth_bounds_workaround(struct tu_device *device,
uint32_t *rb_depth_cntl);
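/* Shader private ("scratch") memory configuration passed to tu6_emit_xs().
 * Descriptive sketch of the fields, inferred from their names rather than
 * spelled out here: iova is the backing BO address, per_fiber_size the
 * amount reserved for each fiber, per_sp_size the total footprint per
 * shader processor, and per_wave selects the hardware's per-wave
 * allocation mode.
 */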
struct tu_pvtmem_config {
uint64_t iova;
uint32_t per_fiber_size;
uint32_t per_sp_size;
bool per_wave;
};
void
tu6_emit_xs_config(struct tu_cs *cs,
gl_shader_stage stage,
const struct ir3_shader_variant *xs);
void
tu6_emit_xs(struct tu_cs *cs,
gl_shader_stage stage,
const struct ir3_shader_variant *xs,
const struct tu_pvtmem_config *pvtmem,
uint64_t binary_iova);
void
tu6_emit_vpc(struct tu_cs *cs,
const struct ir3_shader_variant *vs,
const struct ir3_shader_variant *hs,
const struct ir3_shader_variant *ds,
const struct ir3_shader_variant *gs,
const struct ir3_shader_variant *fs,
uint32_t patch_control_points);
void
tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs);
struct tu_image_view;
void
tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
const struct tu_image_view *src,
const struct tu_image_view *dst,
uint32_t layer_mask,
uint32_t layers,
const VkRect2D *rect);
void
tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
const VkClearValue *value);
void
tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
const VkClearValue *value);
void
tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
bool cond_exec_allowed,
bool force_load);
/* exposed so that a load can be emitted without checking LOAD_OP */
void
tu_emit_load_gmem_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a);
/* note: gmem store can also resolve */
void
tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
uint32_t a,
uint32_t gmem_a,
bool cond_exec_allowed);
enum pipe_format tu_vk_format_to_pipe_format(VkFormat vk_format);
struct tu_native_format
{
enum a6xx_format fmt : 8;
enum a3xx_color_swap swap : 8;
enum a6xx_tile_mode tile_mode : 8;
};
bool tu6_format_vtx_supported(VkFormat format);
struct tu_native_format tu6_format_vtx(VkFormat format);
bool tu6_format_color_supported(enum pipe_format format);
struct tu_native_format tu6_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode);
bool tu6_format_texture_supported(enum pipe_format format);
struct tu_native_format tu6_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode);
static inline enum a6xx_format
tu6_base_format(enum pipe_format format)
{
/* note: tu6_format_color doesn't care about tiling for the .fmt field */
return tu6_format_color(format, TILE6_LINEAR).fmt;
}
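/* Illustrative sketch (not driver code) of how the format helpers above
 * compose when translating a Vulkan color format for a linear render target;
 * the format chosen is only an example:
 *
 *    enum pipe_format pfmt = tu_vk_format_to_pipe_format(VK_FORMAT_B8G8R8A8_UNORM);
 *    if (tu6_format_color_supported(pfmt)) {
 *       struct tu_native_format nfmt = tu6_format_color(pfmt, TILE6_LINEAR);
 *       // nfmt.fmt, nfmt.swap and nfmt.tile_mode are what get programmed
 *       // into the color buffer registers.
 *    }
 */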
struct tu_image
{
struct vk_image vk;
struct fdl_layout layout[3];
uint32_t total_size;
#ifdef ANDROID
/* For VK_ANDROID_native_buffer, the WSI image owns the memory. */
VkDeviceMemory owned_memory;
#endif
/* Set when bound */
struct tu_bo *bo;
uint64_t iova;
uint32_t lrz_height;
uint32_t lrz_pitch;
uint32_t lrz_offset;
uint32_t lrz_fc_offset;
uint32_t lrz_fc_size;
};
uint32_t tu6_plane_count(VkFormat format);
enum pipe_format tu6_plane_format(VkFormat format, uint32_t plane);
uint32_t tu6_plane_index(VkFormat format, VkImageAspectFlags aspect_mask);
enum pipe_format tu_format_for_aspect(enum pipe_format format,
VkImageAspectFlags aspect_mask);
struct tu_image_view
{
struct vk_image_view vk;
struct tu_image *image; /**< VkImageViewCreateInfo::image */
struct fdl6_view view;
/* for d32s8 separate depth */
uint64_t depth_base_addr;
uint32_t depth_layer_size;
uint32_t depth_PITCH;
/* for d32s8 separate stencil */
uint64_t stencil_base_addr;
uint32_t stencil_layer_size;
uint32_t stencil_PITCH;
};
struct tu_sampler_ycbcr_conversion {
struct vk_object_base base;
VkFormat format;
VkSamplerYcbcrModelConversion ycbcr_model;
VkSamplerYcbcrRange ycbcr_range;
VkComponentMapping components;
VkChromaLocation chroma_offsets[2];
VkFilter chroma_filter;
};
struct tu_sampler {
struct vk_object_base base;
uint32_t descriptor[A6XX_TEX_SAMP_DWORDS];
struct tu_sampler_ycbcr_conversion *ycbcr_sampler;
};
void
tu_cs_image_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer);
void
tu_cs_image_ref_2d(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer, bool src);
void
tu_cs_image_flag_ref(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer);
void
tu_cs_image_stencil_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
void
tu_cs_image_depth_ref(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
#define tu_image_view_stencil(iview, x) \
((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_8_UINT))
#define tu_image_view_depth(iview, x) \
((iview->view.x & ~A6XX_##x##_COLOR_FORMAT__MASK) | A6XX_##x##_COLOR_FORMAT(FMT6_32_FLOAT))
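/* Usage sketch (assumed; the register name is only an example): these macros
 * take one of the pre-packed register values stored in iview->view and
 * rewrite just its COLOR_FORMAT field, so the stencil or depth plane of a
 * combined depth/stencil image can be treated as a color surface, e.g.
 *
 *    tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO));
 */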
VkResult
tu_gralloc_info(struct tu_device *device,
const VkNativeBufferANDROID *gralloc_info,
int *dma_buf,
uint64_t *modifier);
VkResult
tu_import_memory_from_gralloc_handle(VkDevice device_h,
int dma_buf,
const VkAllocationCallbacks *alloc,
VkImage image_h);
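/* Whether an image with the given parameters can use a tiled layout at all,
 * and whether it can additionally be UBWC (bandwidth) compressed. */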
bool
tiling_possible(VkFormat format);
bool
ubwc_possible(VkFormat format, VkImageType type, VkImageUsageFlags usage, VkImageUsageFlags stencil_usage,
const struct fd_dev_info *info, VkSampleCountFlagBits samples,
bool use_z24uint_s8uint);
struct tu_buffer_view
{
struct vk_object_base base;
uint32_t descriptor[A6XX_TEX_CONST_DWORDS];
struct tu_buffer *buffer;
};
void
tu_buffer_view_init(struct tu_buffer_view *view,
struct tu_device *device,
const VkBufferViewCreateInfo *pCreateInfo);
#define PERF_CNTRS_REG 4
struct tu_perf_query_data
{
uint32_t gid; /* group-id */
uint32_t cid; /* countable-id within the group */
uint32_t cntr_reg; /* counter register within the group */
uint32_t pass; /* pass index in which the countable can be requested */
uint32_t app_idx; /* index provided by the application */
};
struct tu_query_pool
{
struct vk_object_base base;
VkQueryType type;
uint32_t stride;
uint64_t size;
uint32_t pipeline_statistics;
struct tu_bo *bo;
/* For performance queries */
const struct fd_perfcntr_group *perf_group;
uint32_t perf_group_count;
uint32_t counter_index_count;
struct tu_perf_query_data perf_query_data[0];
};
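/* perf_query_data is a trailing array with counter_index_count entries, so a
 * performance query pool must be allocated with extra space. Rough sketch of
 * the allocation (n_counters is a placeholder; the authoritative code is in
 * the query-pool creation path):
 *
 *    pool = vk_object_zalloc(&device->vk, pAllocator,
 *                            sizeof(*pool) +
 *                               n_counters * sizeof(pool->perf_query_data[0]),
 *                            VK_OBJECT_TYPE_QUERY_POOL);
 */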
uint32_t
tu_subpass_get_attachment_to_resolve(const struct tu_subpass *subpass, uint32_t index);
void
tu_update_descriptor_sets(const struct tu_device *device,
VkDescriptorSet overrideSet,
uint32_t descriptorWriteCount,
const VkWriteDescriptorSet *pDescriptorWrites,
uint32_t descriptorCopyCount,
const VkCopyDescriptorSet *pDescriptorCopies);
void
tu_update_descriptor_set_with_template(
const struct tu_device *device,
struct tu_descriptor_set *set,
VkDescriptorUpdateTemplate descriptorUpdateTemplate,
const void *pData);
VkResult
tu_physical_device_init(struct tu_physical_device *device,
struct tu_instance *instance);
VkResult
tu_enumerate_devices(struct tu_instance *instance);
int
tu_device_get_gpu_timestamp(struct tu_device *dev,
uint64_t *ts);
int
tu_device_get_suspend_count(struct tu_device *dev,
uint64_t *suspend_count);
int
tu_drm_submitqueue_new(const struct tu_device *dev,
int priority,
uint32_t *queue_id);
void
tu_drm_submitqueue_close(const struct tu_device *dev, uint32_t queue_id);
int
tu_syncobj_to_fd(struct tu_device *device, struct vk_sync *sync);
VkResult
tu_queue_submit(struct vk_queue *vk_queue, struct vk_queue_submit *submit);
void
tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
void *ts_from, uint32_t from_offset,
void *ts_to, uint32_t to_offset,
uint32_t count);
VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
struct u_trace **trace_copy);
/* If we copy the trace and timestamps, we will have to free them afterwards. */
struct tu_u_trace_cmd_data
{
struct tu_cs *timestamp_copy_cs;
struct u_trace *trace;
};
/* Data necessary to retrieve timestamps and to clean up all
 * associated resources afterwards.
 */
struct tu_u_trace_submission_data
{
uint32_t submission_id;
/* We have to know when the timestamps are available; this sync
 * object signals that.
 */
struct tu_u_trace_syncobj *syncobj;
uint32_t cmd_buffer_count;
uint32_t last_buffer_with_tracepoints;
struct tu_u_trace_cmd_data *cmd_trace_data;
};
VkResult
tu_u_trace_submission_data_create(
struct tu_device *device,
struct tu_cmd_buffer **cmd_buffers,
uint32_t cmd_buffer_count,
struct tu_u_trace_submission_data **submission_data);
void
tu_u_trace_submission_data_finish(
struct tu_device *device,
struct tu_u_trace_submission_data *submission_data);
void
tu_breadcrumbs_init(struct tu_device *device);
void
tu_breadcrumbs_finish(struct tu_device *device);
#define TU_FROM_HANDLE(__tu_type, __name, __handle) \
VK_FROM_HANDLE(__tu_type, __name, __handle)
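/* Typical usage in an entrypoint (sketch; tu_CmdExample is hypothetical):
 *
 *    VKAPI_ATTR void VKAPI_CALL
 *    tu_CmdExample(VkCommandBuffer commandBuffer)
 *    {
 *       TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
 *       // 'cmd' is now the driver-private struct tu_cmd_buffer pointer.
 *    }
 */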
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
VK_OBJECT_TYPE_COMMAND_BUFFER)
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
VK_OBJECT_TYPE_INSTANCE)
VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
VK_OBJECT_TYPE_PHYSICAL_DEVICE)
VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_cmd_pool, vk.base, VkCommandPool,
VK_OBJECT_TYPE_COMMAND_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, base, VkBuffer,
VK_OBJECT_TYPE_BUFFER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer_view, base, VkBufferView,
VK_OBJECT_TYPE_BUFFER_VIEW)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_pool, base, VkDescriptorPool,
VK_OBJECT_TYPE_DESCRIPTOR_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set, base, VkDescriptorSet,
VK_OBJECT_TYPE_DESCRIPTOR_SET)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_set_layout, base,
VkDescriptorSetLayout,
VK_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_descriptor_update_template, base,
VkDescriptorUpdateTemplate,
VK_OBJECT_TYPE_DESCRIPTOR_UPDATE_TEMPLATE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory,
VK_OBJECT_TYPE_DEVICE_MEMORY)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
VK_OBJECT_TYPE_FRAMEBUFFER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image, vk.base, VkImage, VK_OBJECT_TYPE_IMAGE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_image_view, vk.base, VkImageView,
VK_OBJECT_TYPE_IMAGE_VIEW)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_cache, base, VkPipelineCache,
VK_OBJECT_TYPE_PIPELINE_CACHE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline, base, VkPipeline,
VK_OBJECT_TYPE_PIPELINE)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_pipeline_layout, base, VkPipelineLayout,
VK_OBJECT_TYPE_PIPELINE_LAYOUT)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_query_pool, base, VkQueryPool,
VK_OBJECT_TYPE_QUERY_POOL)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_render_pass, base, VkRenderPass,
VK_OBJECT_TYPE_RENDER_PASS)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler,
VK_OBJECT_TYPE_SAMPLER)
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler_ycbcr_conversion, base, VkSamplerYcbcrConversion,
VK_OBJECT_TYPE_SAMPLER_YCBCR_CONVERSION)
/* for TU_FROM_HANDLE with both VkFence and VkSemaphore: */
#define tu_syncobj_from_handle(x) ((struct tu_syncobj*) (uintptr_t) (x))
void
update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask);
#endif /* TU_PRIVATE_H */