/* mirror of https://gitlab.freedesktop.org/mesa/mesa */
/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */
|
|
|
|
#ifndef TU_DEVICE_H
|
|
#define TU_DEVICE_H
|
|
|
|
#include "tu_common.h"
|
|
|
|
#include "vk_buffer.h"
|
|
|
|
#include "tu_autotune.h"
|
|
#include "tu_pass.h"
|
|
#include "tu_perfetto.h"
|
|
#include "tu_suballoc.h"
|
|
#include "tu_util.h"
|
|
|
|
#include "util/vma.h"
|
|
|
|
/* queue types */
|
|
#define TU_QUEUE_GENERAL 0
|
|
|
|
#define TU_MAX_QUEUE_FAMILIES 1
|
|
|
|
#define TU_BORDER_COLOR_COUNT 4096
|
|
#define TU_BORDER_COLOR_BUILTIN 6
|
|
|
|
#define TU_BLIT_SHADER_SIZE 1024
|
|
|
|
/* extra space in vsc draw/prim streams */
|
|
#define VSC_PAD 0x40
|
|
|
|
/* Indices of the driver-internal clear/blit shaders stored in
 * tu6_global::shaders (see TU_BLIT_SHADER_SIZE).
 */
enum global_shader {
   GLOBAL_SH_VS_BLIT,
   GLOBAL_SH_VS_CLEAR,
   GLOBAL_SH_FS_BLIT,
   GLOBAL_SH_FS_BLIT_ZSCALE,
   GLOBAL_SH_FS_COPY_MS,
   GLOBAL_SH_FS_CLEAR0,
   /* One clear FS per render target: MAX_RTS consecutive slots follow
    * GLOBAL_SH_FS_CLEAR0.
    */
   GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
   GLOBAL_SH_COUNT,
};
|
/* Driver-side description of a VkMemoryHeap exposed to the client. */
struct tu_memory_heap {
   /* Standard bits passed on to the client */
   VkDeviceSize size;
   VkMemoryHeapFlags flags;

   /** Copied from ANV:
    *
    * Driver-internal book-keeping.
    *
    * Align it to 64 bits to make atomic operations faster on 32 bit platforms.
    */
   VkDeviceSize used __attribute__ ((aligned (8)));
};
|
struct tu_physical_device
{
   /* Common Vulkan-runtime physical-device state; must stay first so the
    * VK_DEFINE_HANDLE_CASTS below can cast via vk.base. */
   struct vk_physical_device vk;

   struct tu_instance *instance;

   const char *name;
   uint8_t driver_uuid[VK_UUID_SIZE];
   uint8_t device_uuid[VK_UUID_SIZE];
   uint8_t cache_uuid[VK_UUID_SIZE];

   struct wsi_device wsi_device;

   /* DRM node fds and their dev_t major/minor numbers; the has_* flags
    * record whether the corresponding node was opened.
    * NOTE(review): local presumably = render node, master = primary node —
    * confirm against the code that initializes these. */
   int local_fd;
   bool has_local;
   int64_t local_major;
   int64_t local_minor;
   int master_fd;
   bool has_master;
   int64_t master_major;
   int64_t master_minor;

   /* On-chip GMEM size/base address and the CCU scratch offsets used for
    * gmem vs. sysmem (bypass) rendering. */
   uint32_t gmem_size;
   uint64_t gmem_base;
   uint32_t ccu_offset_gmem;
   uint32_t ccu_offset_bypass;

   /* Whether BOs can be placed at driver-chosen iovas; va_start/va_size
    * describe the managed address range (see vma below). */
   bool has_set_iova;
   uint64_t va_start;
   uint64_t va_size;

   bool has_cached_coherent_memory;
   bool has_cached_non_coherent_memory;
   uintptr_t level1_dcache_size;

   /* Memory types advertised to the client (property flags per type) */
   struct {
      uint32_t type_count;
      VkMemoryPropertyFlags types[VK_MAX_MEMORY_TYPES];
   } memory;

   /* freedreno device identification/feature tables */
   struct fd_dev_id dev_id;
   const struct fd_dev_info *info;

   int msm_major_version;
   int msm_minor_version;

   /* Address space and global fault count for this local_fd with DRM backend */
   uint64_t fault_count;

   /* with 0 being the highest priority */
   uint32_t submitqueue_priority_count;

   struct tu_memory_heap heap;

   /* vma_mutex guards the iova allocator below */
   mtx_t vma_mutex;
   struct util_vma_heap vma;

   /* Sync types exposed through the vk_sync machinery; sync_types is the
    * NULL-terminated preference list (hence 3 entries for 2 types). */
   struct vk_sync_type syncobj_type;
   struct vk_sync_timeline_type timeline_type;
   const struct vk_sync_type *sync_types[3];
};
VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
                       VK_OBJECT_TYPE_PHYSICAL_DEVICE)
|
/* Kernel-interface vtable (msm/kgsl/virtio backends); defined elsewhere. */
struct tu_knl;

struct tu_instance
{
   /* Common Vulkan-runtime instance state; must stay first for the
    * handle cast below. */
   struct vk_instance vk;

   /* Kernel backend selected for this instance */
   const struct tu_knl *knl;

   uint32_t api_version;

   /* driconf option caches: the ones in effect and the full set available */
   struct driOptionCache dri_options;
   struct driOptionCache available_dri_options;

   /* driconf: treat LOAD_OP_DONT_CARE as LOAD_OP_LOAD */
   bool dont_care_as_load;

   /* Conservative LRZ (default true) invalidates LRZ on draws with
    * blend and depth-write enabled, because this can lead to incorrect
    * rendering. Driconf can be used to disable conservative LRZ for
    * games which do not have the problematic sequence of draws *and*
    * suffer a performance loss with conservative LRZ.
    */
   bool conservative_lrz;
};
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
                       VK_OBJECT_TYPE_INSTANCE)
|
struct tu_queue
{
   /* Common Vulkan-runtime queue state; must stay first for the cast below */
   struct vk_queue vk;

   struct tu_device *device;

   /* Kernel submitqueue id this VkQueue submits to */
   uint32_t msm_queue_id;

   int64_t last_submit_timestamp; /* timestamp of the last queue submission for kgsl */
};
VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)
|
/* This struct defines the layout of the global_bo.
 * The buffer is shared between CPU and GPU; fields written asynchronously
 * by the GPU are marked volatile (see the breadcrumb/fence members below).
 */
struct tu6_global
{
   /* clear/blit shaders */
   uint32_t shaders[TU_BLIT_SHADER_SIZE];

   uint32_t seqno_dummy; /* dummy seqno for CP_EVENT_WRITE */
   uint32_t _pad0;
   volatile uint32_t vsc_draw_overflow;
   uint32_t _pad1;
   volatile uint32_t vsc_prim_overflow;
   uint32_t _pad2;
   uint64_t predicate;

   /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */
   struct {
      uint32_t offset;
      uint32_t pad[7];
   } flush_base[4];

   alignas(16) uint32_t cs_indirect_xyz[3];

   volatile uint32_t vtx_stats_query_not_running;

   /* To know when renderpass stats for autotune are valid */
   volatile uint32_t autotune_fence;

   /* For recycling command buffers for dynamic suspend/resume commands */
   volatile uint32_t dynamic_rendering_fence;

   /* GMEM load/store debug counters (see the dbg_gmem fields in tu_device) */
   volatile uint32_t dbg_one;
   volatile uint32_t dbg_gmem_total_loads;
   volatile uint32_t dbg_gmem_taken_loads;
   volatile uint32_t dbg_gmem_total_stores;
   volatile uint32_t dbg_gmem_taken_stores;

   /* Written from GPU */
   volatile uint32_t breadcrumb_gpu_sync_seqno;
   uint32_t _pad3;
   /* Written from CPU, acknowledges value written from GPU */
   volatile uint32_t breadcrumb_cpu_sync_seqno;
   uint32_t _pad4;

   /* note: larger global bo will be used for customBorderColors */
   struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
};

/* Byte offset of a member within the global bo layout */
#define gb_offset(member) offsetof(struct tu6_global, member)
/* GPU address of a member of the global bo for a given command buffer */
#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member))
/* GPU address of element idx of an array member of the global bo */
#define global_iova_arr(cmd, member, idx) \
   (global_iova(cmd, member) + sizeof_field(struct tu6_global, member[0]) * (idx))
|
/* Lazily-created BO backing shader private memory; mtx guards creation
 * and resizing of the shared bo. */
struct tu_pvtmem_bo {
   mtx_t mtx;
   struct tu_bo *bo;
   uint32_t per_fiber_size, per_sp_size;
};
|
|
#ifdef ANDROID
/* Which gralloc implementation was detected (see tu_device::gralloc) */
enum tu_gralloc_type
{
   TU_GRALLOC_UNKNOWN,
   TU_GRALLOC_CROS,
   TU_GRALLOC_OTHER,
};
#endif
|
|
struct tu_device
{
   /* Common Vulkan-runtime device state; must stay first for the cast below */
   struct vk_device vk;
   struct tu_instance *instance;

   struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
   int queue_count[TU_MAX_QUEUE_FAMILIES];

   struct tu_physical_device *physical_device;
   /* Device file descriptor used for kernel calls */
   int fd;

   struct ir3_compiler *compiler;

   /* Backup in-memory cache to be used if the app doesn't provide one */
   struct vk_pipeline_cache *mem_cache;

#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */

   /* Currently the kernel driver uses a 32-bit GPU address space, but it
    * should be impossible to go beyond 48 bits.
    */
   struct {
      struct tu_bo *bo;
      mtx_t construct_mtx;
      bool initialized;
   } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];

   /* Shader private memory, per-fiber and per-SP(wave) variants */
   struct tu_pvtmem_bo fiber_pvtmem_bo, wave_pvtmem_bo;

   /* BO laid out as struct tu6_global, plus its CPU mapping */
   struct tu_bo *global_bo;
   struct tu6_global *global_bo_map;

   uint32_t implicit_sync_bo_count;

   /* Device-global BO suballocator for reducing BO management overhead for
    * (read-only) pipeline state. Synchronized by pipeline_mutex.
    */
   struct tu_suballocator pipeline_suballoc;
   mtx_t pipeline_mutex;

   /* Device-global BO suballocator for reducing BO management for small
    * gmem/sysmem autotune result buffers. Synchronized by autotune_mutex.
    */
   struct tu_suballocator autotune_suballoc;
   mtx_t autotune_mutex;

   /* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
   /* Lazily allocated, protected by the device mutex. */
   struct tu_bo *tess_bo;

   /* Compiled built-in shaders (see enum global_shader) and their iovas */
   struct ir3_shader_variant *global_shader_variants[GLOBAL_SH_COUNT];
   struct ir3_shader *global_shaders[GLOBAL_SH_COUNT];
   uint64_t global_shader_va[GLOBAL_SH_COUNT];

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   /* Allocation bitmap for custom border color slots */
   BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
   mtx_t mutex;

   /* bo list for submits: */
   struct drm_msm_gem_submit_bo *bo_list;
   /* map bo handles to bo list index: */
   uint32_t bo_count, bo_list_size;
   mtx_t bo_mutex;
   /* protects imported BOs creation/freeing */
   struct u_rwlock dma_bo_lock;

   /* Tracking of name -> size allocated for TU_DEBUG_BOS */
   struct hash_table *bo_sizes;

   /* This array holds all our 'struct tu_bo' allocations. We use this
    * so we can add a refcount to our BOs and check if a particular BO
    * was already allocated in this device using its GEM handle. This is
    * necessary to properly manage BO imports, because the kernel doesn't
    * refcount the underlying BO memory.
    *
    * Specifically, when self-importing (i.e. importing a BO into the same
    * device that created it), the kernel will give us the same BO handle
    * for both BOs and we must only free it once when both references are
    * freed. Otherwise, if we are not self-importing, we get two different BO
    * handles, and we want to free each one individually.
    *
    * The refcount is also useful for being able to maintain BOs across
    * VK object lifetimes, such as pipelines suballocating out of BOs
    * allocated on the device.
    */
   struct util_sparse_array bo_map;

   /* Command streams to set pass index to a scratch reg */
   struct tu_cs *perfcntrs_pass_cs;
   struct tu_cs_entry *perfcntrs_pass_cs_entries;

   /* State for recycling dynamic suspend/resume command buffers; the fence
    * pairs with tu6_global::dynamic_rendering_fence. */
   struct util_dynarray dynamic_rendering_pending;
   VkCommandPool dynamic_rendering_pool;
   uint32_t dynamic_rendering_fence;

   /* Condition variable for timeline semaphore to notify waiters when a
    * new submit is executed. */
   pthread_cond_t timeline_cond;
   pthread_mutex_t submit_mutex;

   struct tu_autotune autotune;

   struct breadcrumbs_context *breadcrumbs_ctx;

   /* Debug command streams (see the dbg_* members of tu6_global) */
   struct tu_cs *dbg_cmdbuf_stomp_cs;
   struct tu_cs *dbg_renderpass_stomp_cs;

#ifdef ANDROID
   const void *gralloc;
   enum tu_gralloc_type gralloc_type;
#endif

   uint32_t submit_count;

   struct u_trace_context trace_context;

#ifdef HAVE_PERFETTO
   struct tu_perfetto_state perfetto;
#endif

   /* Use Z24_UINT_S8_UINT instead of the default for Z24S8 handling */
   bool use_z24uint_s8uint;
};
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
|
|
/* VkDeviceMemory: a thin wrapper around a single BO allocation */
struct tu_device_memory
{
   struct vk_object_base base;

   struct tu_bo *bo;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, base, VkDeviceMemory,
                               VK_OBJECT_TYPE_DEVICE_MEMORY)
|
|
struct tu_buffer
{
   struct vk_buffer vk;

   /* Backing BO and the buffer's GPU address (bound memory iova + offset
    * — NOTE(review): confirm against vkBindBufferMemory implementation) */
   struct tu_bo *bo;
   uint64_t iova;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_buffer, vk.base, VkBuffer,
                               VK_OBJECT_TYPE_BUFFER)
|
|
/* A single framebuffer attachment (see tu_framebuffer::attachments) */
struct tu_attachment_info
{
   struct tu_image_view *attachment;
};
|
|
/* GMEM tiling parameters computed for a framebuffer (one instance per
 * GMEM layout, see tu_framebuffer::tiling). */
struct tu_tiling_config {
   /* size of the first tile */
   VkExtent2D tile0;
   /* number of tiles */
   VkExtent2D tile_count;

   /* size of the first VSC pipe */
   VkExtent2D pipe0;
   /* number of VSC pipes */
   VkExtent2D pipe_count;

   /* Whether using GMEM is even possible with this configuration */
   bool possible;

   /* Whether binning should be used for gmem rendering using this framebuffer. */
   bool binning;

   /* Whether binning could be used for gmem rendering using this framebuffer. */
   bool binning_possible;

   /* pipe register values */
   uint32_t pipe_config[MAX_VSC_PIPES];
   uint32_t pipe_sizes[MAX_VSC_PIPES];
};
|
|
struct tu_framebuffer
|
|
{
|
|
struct vk_object_base base;
|
|
|
|
uint32_t width;
|
|
uint32_t height;
|
|
uint32_t layers;
|
|
|
|
struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT];
|
|
|
|
uint32_t attachment_count;
|
|
struct tu_attachment_info attachments[0];
|
|
};
|
|
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
|
|
VK_OBJECT_TYPE_FRAMEBUFFER)
|
|
|
|
/* VkEvent backed by a small BO the GPU can write/poll */
struct tu_event
{
   struct vk_object_base base;
   struct tu_bo *bo;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_event, base, VkEvent, VK_OBJECT_TYPE_EVENT)
|
|
struct tu_sampler {
   struct vk_object_base base;

   /* Packed A6XX TEX_SAMP hardware descriptor */
   uint32_t descriptor[A6XX_TEX_SAMP_DWORDS];
   /* Non-NULL when the sampler was created with a YCbCr conversion */
   struct tu_sampler_ycbcr_conversion *ycbcr_sampler;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_sampler, base, VkSampler,
                               VK_OBJECT_TYPE_SAMPLER)
|
|
/* Size to report for the device's memory heap (tu_memory_heap::size). */
uint64_t
tu_get_system_heap_size(void);

/* Initialize a physical device for the given instance. */
VkResult
tu_physical_device_init(struct tu_physical_device *device,
                        struct tu_instance *instance);

/* Convert a GPU timestamp from ticks to nanoseconds. */
uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
|
|
static inline struct tu_bo *
|
|
tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
|
|
{
|
|
return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
|
|
}
|
|
|
|
/* Return the device's u_trace context. */
struct u_trace_context *
tu_device_get_u_trace(struct tu_device *device);

/* Get a scratch bo for use inside a command buffer. This will always return
 * the same bo given the same size or similar sizes, so only one scratch bo
 * can be used at the same time. It's meant for short-lived things where we
 * need to write to some piece of memory, read from it, and then immediately
 * discard it.
 */
VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);

/* Build the internal framebuffer/render-pass state for dynamic rendering
 * (vkCmdBeginRendering). */
void tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
                                  const VkRenderingInfo *pRenderingInfo);

/* u_trace callback: copy `count` timestamps from ts_from/from_offset to
 * ts_to/to_offset on the given command stream. */
void
tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
                         void *ts_from, uint32_t from_offset,
                         void *ts_to, uint32_t to_offset,
                         uint32_t count);

/* Create a command stream (and a u_trace copy) that copies the command
 * buffer's timestamps; see tu_u_trace_cmd_data for ownership. */
VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
                            struct u_trace **trace_copy);
|
|
/* If we copy trace and timestamps we will have to free them. */
struct tu_u_trace_cmd_data
{
   struct tu_cs *timestamp_copy_cs;
   struct u_trace *trace;
};
|
|
/* Data necessary to retrieve timestamps and clean all
 * associated resources afterwards.
 */
struct tu_u_trace_submission_data
{
   uint32_t submission_id;
   /* We have to know when timestamps are available,
    * this sync object indicates it.
    */
   struct tu_u_trace_syncobj *syncobj;

   /* Per-command-buffer trace data; cmd_trace_data has cmd_buffer_count
    * entries and last_buffer_with_tracepoints indexes the last one that
    * actually contains tracepoints. */
   uint32_t cmd_buffer_count;
   uint32_t last_buffer_with_tracepoints;
   struct tu_u_trace_cmd_data *cmd_trace_data;
};
|
|
/* Allocate and fill per-submission trace data for the given command
 * buffers; freed with tu_u_trace_submission_data_finish. */
VkResult
tu_u_trace_submission_data_create(
   struct tu_device *device,
   struct tu_cmd_buffer **cmd_buffers,
   uint32_t cmd_buffer_count,
   struct tu_u_trace_submission_data **submission_data);

void
tu_u_trace_submission_data_finish(
   struct tu_device *device,
   struct tu_u_trace_submission_data *submission_data);

/* TU_DEBUG_BOS accounting (backed by tu_device::bo_sizes):
 * record an allocation under `name`, drop a BO's record, and dump stats. */
const char *
tu_debug_bos_add(struct tu_device *dev, uint64_t size, const char *name);
void
tu_debug_bos_del(struct tu_device *dev, struct tu_bo *bo);
void
tu_debug_bos_print_stats(struct tu_device *dev);
|
|
#endif /* TU_DEVICE_H */
|