Compare commits

...

5 Commits

Author SHA1 Message Date
Hans-Kristian Arntzen f74705b11a vkd3d: Flush copy queue in GetCudaSurface.
Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
2022-07-07 15:20:05 +02:00
Hans-Kristian Arntzen ad15a7eb01 vkd3d: MEGAHACK: Experiment with deferred descriptor copies.
Attempts to move overhead from render threads to submission threads
which are twiddling thumbs most of the time.

Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
2022-07-07 14:59:51 +02:00
Hans-Kristian Arntzen 02d9b6c61c profiler: Add --delta to profile helper tool.
Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
2022-07-06 12:44:58 +02:00
Hans-Kristian Arntzen b6aac42aa6 profiler: Use rdtsc instead of QPC.
Runs much faster and we don't really need accurate ns readings.

Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
2022-07-06 12:44:58 +02:00
Hans-Kristian Arntzen be3b44b01c common: Add rdtsc helper.
Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
2022-07-06 12:44:58 +02:00
8 changed files with 362 additions and 20 deletions

View File

@@ -311,4 +311,19 @@ static inline uint64_t vkd3d_get_current_time_ns(void)
#endif
}
#ifdef _MSC_VER
#pragma intrinsic(__rdtsc)
#endif
static inline uint64_t vkd3d_get_current_time_ticks(void)
{
#ifdef _MSC_VER
return __rdtsc();
#elif defined(__i386__) || defined(__x86_64__)
return __builtin_ia32_rdtsc();
#else
return vkd3d_get_current_time_ns();
#endif
}
#endif /* __VKD3D_COMMON_H */
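For reference, rdtsc reads the CPU's time-stamp counter, so the profiler now reports raw cycles rather than nanoseconds; readings are only meaningful as deltas on the same machine. A minimal standalone sketch of exercising such a helper (the helper name, workload, and iteration count are illustrative, not part of this patch):

#include <stdint.h>
#include <stdio.h>

#if defined(_MSC_VER)
#include <intrin.h>
static uint64_t ticks(void) { return __rdtsc(); }
#elif defined(__i386__) || defined(__x86_64__)
static uint64_t ticks(void) { return __builtin_ia32_rdtsc(); }
#else
static uint64_t ticks(void) { return 0; } /* real code falls back to a ns clock */
#endif

int main(void)
{
    volatile uint64_t sink = 0;
    uint64_t start, end;
    int i;

    start = ticks();
    for (i = 0; i < 1000; i++) /* arbitrary workload for the example */
        sink += i;
    end = ticks();

    printf("elapsed: %llu cycles\n", (unsigned long long)(end - start));
    return 0;
}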

View File

@@ -25,13 +25,6 @@
#ifdef VKD3D_ENABLE_PROFILING
-#ifdef _WIN32
-#define WIN32_LEAN_AND_MEAN
-#include <windows.h>
-#else
-#include <time.h>
-#endif
void vkd3d_init_profiling(void);
bool vkd3d_uses_profiling(void);
unsigned int vkd3d_profiling_register_region(const char *name, spinlock_t *lock, uint32_t *latch);
@@ -48,12 +41,12 @@ void vkd3d_profiling_notify_work(unsigned int index, uint64_t start_ticks, uint6
do { \
if (!(_vkd3d_region_index_##name = vkd3d_atomic_uint32_load_explicit(&_vkd3d_region_latch_##name, vkd3d_memory_order_acquire))) \
_vkd3d_region_index_##name = vkd3d_profiling_register_region(#name, &_vkd3d_region_lock_##name, &_vkd3d_region_latch_##name); \
-_vkd3d_region_begin_tick_##name = vkd3d_get_current_time_ns(); \
+_vkd3d_region_begin_tick_##name = vkd3d_get_current_time_ticks(); \
} while(0)
#define VKD3D_REGION_END_ITERATIONS(name, iter) \
do { \
-_vkd3d_region_end_tick_##name = vkd3d_get_current_time_ns(); \
+_vkd3d_region_end_tick_##name = vkd3d_get_current_time_ticks(); \
vkd3d_profiling_notify_work(_vkd3d_region_index_##name, _vkd3d_region_begin_tick_##name, _vkd3d_region_end_tick_##name, iter); \
} while(0)
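To show how tick-based region profiling hangs together, here is a simplified, self-contained analogue of the begin/end macros above. REGION_BEGIN, REGION_END_ITERATIONS, and struct region are invented for this sketch; the real macros also register regions lazily and publish results with atomics:

#include <stdint.h>
#include <stdio.h>

/* Sample a tick counter at begin/end, accumulate total ticks and iterations
 * so ticks per iteration can be derived afterwards. */
struct region { uint64_t total_ticks; uint64_t iterations; };

static uint64_t ticks(void)
{
#if defined(__i386__) || defined(__x86_64__)
    return __builtin_ia32_rdtsc();
#else
    return 0; /* fallback elided in this sketch */
#endif
}

#define REGION_BEGIN(name) uint64_t _begin_##name = ticks()
#define REGION_END_ITERATIONS(name, region, iter) do { \
        (region)->total_ticks += ticks() - _begin_##name; \
        (region)->iterations += (iter); \
    } while (0)

int main(void)
{
    struct region r = { 0, 0 };
    volatile int sink = 0;
    int i;

    REGION_BEGIN(copy_loop);
    for (i = 0; i < 4096; i++)
        sink += i;
    REGION_END_ITERATIONS(copy_loop, &r, 4096);

    printf("%llu ticks over %llu iterations\n",
           (unsigned long long)r.total_ticks,
           (unsigned long long)r.iterations);
    return 0;
}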

View File

@@ -12413,6 +12413,8 @@ static void *d3d12_command_queue_submission_worker_main(void *userdata)
memmove(queue->submissions, queue->submissions + 1, queue->submissions_count * sizeof(submission));
pthread_mutex_unlock(&queue->queue_lock);
vkd3d_descriptor_update_ring_flush(&queue->device->descriptor_update_ring, queue->device);
switch (submission.type)
{
case VKD3D_SUBMISSION_STOP:

View File

@@ -2875,6 +2875,8 @@ static void d3d12_device_destroy(struct d3d12_device *device)
const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
size_t i, j;
vkd3d_descriptor_update_ring_cleanup(&device->descriptor_update_ring, device);
for (i = 0; i < VKD3D_SCRATCH_POOL_KIND_COUNT; i++)
for (j = 0; j < device->scratch_pools[i].scratch_buffer_count; j++)
d3d12_device_destroy_scratch_buffer(device, &device->scratch_pools[i].scratch_buffers[j]);
@@ -4008,10 +4010,20 @@ static void STDMETHODCALLTYPE d3d12_device_CreateConstantBufferView(d3d12_device
const D3D12_CONSTANT_BUFFER_VIEW_DESC *desc, D3D12_CPU_DESCRIPTOR_HANDLE descriptor)
{
struct d3d12_device *device = impl_from_ID3D12Device(iface);
vkd3d_cpu_descriptor_va_t src = 0, dst = 0;
TRACE("iface %p, desc %p, descriptor %#lx.\n", iface, desc, descriptor.ptr);
if (descriptor.ptr & VKD3D_RESOURCE_DESC_DEFER_COPY_MASK)
{
dst = descriptor.ptr;
descriptor.ptr = src = vkd3d_descriptor_update_ring_allocate_scratch(&device->descriptor_update_ring);
}
d3d12_desc_create_cbv(descriptor.ptr, device, desc);
if (dst)
vkd3d_descriptor_update_ring_push_copy(&device->descriptor_update_ring, device, src, dst);
}
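The shape of this change (repeated for SRVs and UAVs below) is: if the destination CPU VA carries the defer tag bit, the descriptor write is redirected into a scratch slot and a src → dst copy is queued for a submission thread to replay later. A self-contained sketch of that redirect-then-queue pattern; every name in it is hypothetical, and the ring/heap stand-ins are trivial:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical defer bit in the top of the VA, mirroring
 * VKD3D_RESOURCE_DESC_DEFER_COPY_MASK. */
#define DEFER_MASK ((uintptr_t)1 << (sizeof(uintptr_t) * 8 - 1))

/* Toy stand-ins: the real patch allocates from a staging descriptor heap
 * and pushes into a lock-free ring consumed by submission threads. */
static uint64_t scratch[4];
static unsigned scratch_next;

static uintptr_t allocate_scratch_slot(void)
{
    return (uintptr_t)&scratch[scratch_next++ % 4];
}

static void write_descriptor(uintptr_t va)
{
    *(uint64_t *)va = 0x1234; /* pretend descriptor payload */
}

static void queue_deferred_copy(uintptr_t src, uintptr_t dst)
{
    printf("queued copy %p -> %p\n", (void *)src, (void *)(dst & ~DEFER_MASK));
}

static void create_view(uintptr_t descriptor_va)
{
    uintptr_t src = 0, dst = 0;

    if (descriptor_va & DEFER_MASK)
    {
        /* Shader-visible destination: write to scratch now, copy later. */
        dst = descriptor_va;
        descriptor_va = src = allocate_scratch_slot();
    }

    write_descriptor(descriptor_va);

    if (dst)
        queue_deferred_copy(src, dst);
}

int main(void)
{
    static uint64_t cpu_only_slot;
    create_view((uintptr_t)&cpu_only_slot);              /* direct write */
    create_view((uintptr_t)&cpu_only_slot | DEFER_MASK); /* deferred path */
    return 0;
}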
static void STDMETHODCALLTYPE d3d12_device_CreateShaderResourceView(d3d12_device_iface *iface,
@@ -4019,11 +4031,21 @@ static void STDMETHODCALLTYPE d3d12_device_CreateShaderResourceView(d3d12_device
D3D12_CPU_DESCRIPTOR_HANDLE descriptor)
{
struct d3d12_device *device = impl_from_ID3D12Device(iface);
vkd3d_cpu_descriptor_va_t src = 0, dst = 0;
TRACE("iface %p, resource %p, desc %p, descriptor %#lx.\n",
iface, resource, desc, descriptor.ptr);
if (descriptor.ptr & VKD3D_RESOURCE_DESC_DEFER_COPY_MASK)
{
dst = descriptor.ptr;
descriptor.ptr = src = vkd3d_descriptor_update_ring_allocate_scratch(&device->descriptor_update_ring);
}
d3d12_desc_create_srv(descriptor.ptr, device, impl_from_ID3D12Resource(resource), desc);
if (dst)
vkd3d_descriptor_update_ring_push_copy(&device->descriptor_update_ring, device, src, dst);
}
VKD3D_THREAD_LOCAL struct D3D12_UAV_INFO *d3d12_uav_info = NULL;
@@ -4035,12 +4057,19 @@ static void STDMETHODCALLTYPE d3d12_device_CreateUnorderedAccessView(d3d12_devic
VkImageViewAddressPropertiesNVX out_info = { VK_STRUCTURE_TYPE_IMAGE_VIEW_ADDRESS_PROPERTIES_NVX };
VkImageViewHandleInfoNVX imageViewHandleInfo = { VK_STRUCTURE_TYPE_IMAGE_VIEW_HANDLE_INFO_NVX };
const struct vkd3d_vk_device_procs *vk_procs;
vkd3d_cpu_descriptor_va_t src = 0, dst = 0;
VkResult vr;
struct d3d12_resource *d3d12_resource_ = impl_from_ID3D12Resource(resource);
struct d3d12_device *device = impl_from_ID3D12Device(iface);
TRACE("iface %p, resource %p, counter_resource %p, desc %p, descriptor %#lx.\n",
iface, resource, counter_resource, desc, descriptor.ptr);
if (descriptor.ptr & VKD3D_RESOURCE_DESC_DEFER_COPY_MASK)
{
dst = descriptor.ptr;
descriptor.ptr = src = vkd3d_descriptor_update_ring_allocate_scratch(&device->descriptor_update_ring);
}
d3d12_desc_create_uav(descriptor.ptr,
device, d3d12_resource_,
impl_from_ID3D12Resource(counter_resource), desc);
@@ -4067,6 +4096,9 @@ static void STDMETHODCALLTYPE d3d12_device_CreateUnorderedAccessView(d3d12_devic
/* Set this to null so that subsequent calls to this API won't update the previous pointer. */
d3d12_uav_info = NULL;
}
if (dst)
vkd3d_descriptor_update_ring_push_copy(&device->descriptor_update_ring, device, src, dst);
}
static void STDMETHODCALLTYPE d3d12_device_CreateRenderTargetView(d3d12_device_iface *iface,
@@ -4216,7 +4248,10 @@ static void STDMETHODCALLTYPE d3d12_device_CopyDescriptorsSimple(d3d12_device_if
const D3D12_CPU_DESCRIPTOR_HANDLE src_descriptor_range_offset,
D3D12_DESCRIPTOR_HEAP_TYPE descriptor_heap_type)
{
vkd3d_cpu_descriptor_va_t src, dst;
struct d3d12_device *device;
UINT i;
TRACE("iface %p, descriptor_count %u, dst_descriptor_range_offset %#lx, "
"src_descriptor_range_offset %#lx, descriptor_heap_type %#x.\n",
iface, descriptor_count, dst_descriptor_range_offset.ptr, src_descriptor_range_offset.ptr,
@@ -4224,7 +4259,21 @@ static void STDMETHODCALLTYPE d3d12_device_CopyDescriptorsSimple(d3d12_device_if
device = unsafe_impl_from_ID3D12Device(iface);
-if (descriptor_heap_type == D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV ||
+if (descriptor_heap_type == D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV &&
(dst_descriptor_range_offset.ptr & VKD3D_RESOURCE_DESC_DEFER_COPY_MASK))
{
src = src_descriptor_range_offset.ptr;
dst = dst_descriptor_range_offset.ptr;
for (i = 0; i < descriptor_count; i++)
{
vkd3d_descriptor_update_ring_push_copy(&device->descriptor_update_ring,
device, src, dst);
src += VKD3D_RESOURCE_DESC_INCREMENT;
dst += VKD3D_RESOURCE_DESC_INCREMENT;
}
}
else if (descriptor_heap_type == D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV ||
descriptor_heap_type == D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER)
{
/* Fast and hot path. */
@@ -6229,10 +6278,13 @@ static HRESULT d3d12_device_init(struct d3d12_device *device,
if (FAILED(hr = vkd3d_shader_debug_ring_init(&device->debug_ring, device)))
goto out_cleanup_meta_ops;
if (FAILED(hr = vkd3d_descriptor_update_ring_init(&device->descriptor_update_ring, device)))
goto out_cleanup_debug_ring;
#ifdef VKD3D_ENABLE_BREADCRUMBS
if (vkd3d_config_flags & VKD3D_CONFIG_FLAG_BREADCRUMBS)
if (FAILED(hr = vkd3d_breadcrumb_tracer_init(&device->breadcrumb_tracer, device)))
-goto out_cleanup_debug_ring;
+goto out_cleanup_update_ring;
#endif
if (vkd3d_descriptor_debug_active_qa_checks())
@@ -6267,8 +6319,10 @@ out_cleanup_breadcrumb_tracer:
#ifdef VKD3D_ENABLE_BREADCRUMBS
if (vkd3d_config_flags & VKD3D_CONFIG_FLAG_BREADCRUMBS)
vkd3d_breadcrumb_tracer_cleanup(&device->breadcrumb_tracer, device);
-out_cleanup_debug_ring:
+out_cleanup_update_ring:
#endif
vkd3d_descriptor_update_ring_cleanup(&device->descriptor_update_ring, device);
out_cleanup_debug_ring:
vkd3d_shader_debug_ring_cleanup(&device->debug_ring, device);
out_cleanup_meta_ops:
vkd3d_meta_ops_cleanup(&device->meta_ops, device);

View File

@@ -190,6 +190,13 @@ static HRESULT STDMETHODCALLTYPE d3d12_device_vkd3d_ext_GetCudaSurfaceObject(ID3
return E_INVALIDARG;
device = d3d12_device_from_ID3D12DeviceExt(iface);
if (uav_handle.ptr & VKD3D_RESOURCE_DESC_DEFER_COPY_MASK)
{
INFO("Flushing copy queue in place due to weird GetCudaSurfaceObject call!\n");
vkd3d_descriptor_update_ring_flush(&device->descriptor_update_ring, device);
}
uav_desc = d3d12_desc_decode_va(uav_handle.ptr);
imageViewHandleInfo.imageView = uav_desc.view->info.view->vk_image_view;

View File

@@ -5295,6 +5295,22 @@ static ULONG STDMETHODCALLTYPE d3d12_descriptor_heap_AddRef(ID3D12DescriptorHeap
return refcount;
}
void d3d12_descriptor_heap_inc_ref(struct d3d12_descriptor_heap *heap)
{
InterlockedIncrement(&heap->internal_refcount);
}
void d3d12_descriptor_heap_dec_ref(struct d3d12_descriptor_heap *heap)
{
ULONG refcount = InterlockedDecrement(&heap->internal_refcount);
if (!refcount)
{
d3d12_descriptor_heap_cleanup(heap);
vkd3d_free_aligned(heap);
}
}
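The split between the public COM refcount and internal_refcount lets a heap outlive Release() while deferred copies may still read from it. A standalone sketch of the same pattern using C11 atomics (the code above uses InterlockedIncrement/InterlockedDecrement; all names here are invented):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* The object stays alive until both the public Release and every internal
 * user (e.g. a pending deferred copy) have let go. */
struct heap
{
    atomic_uint internal_refcount;
};

static struct heap *heap_create(void)
{
    struct heap *h = calloc(1, sizeof(*h));
    atomic_store(&h->internal_refcount, 1);
    return h;
}

static void heap_inc_ref(struct heap *h)
{
    atomic_fetch_add(&h->internal_refcount, 1);
}

static void heap_dec_ref(struct heap *h)
{
    /* fetch_sub returns the value before the decrement. */
    if (atomic_fetch_sub(&h->internal_refcount, 1) == 1)
    {
        puts("last reference gone, freeing heap");
        free(h);
    }
}

int main(void)
{
    struct heap *h = heap_create();
    heap_inc_ref(h); /* the update ring keeps the heap alive */
    heap_dec_ref(h); /* public Release drops its reference */
    heap_dec_ref(h); /* ring finished all copies; heap is freed here */
    return 0;
}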
static ULONG STDMETHODCALLTYPE d3d12_descriptor_heap_Release(ID3D12DescriptorHeap *iface)
{
struct d3d12_descriptor_heap *heap = impl_from_ID3D12DescriptorHeap(iface);
@@ -5305,11 +5321,12 @@ static ULONG STDMETHODCALLTYPE d3d12_descriptor_heap_Release(ID3D12DescriptorHea
if (!refcount)
{
struct d3d12_device *device = heap->device;
-d3d12_descriptor_heap_cleanup(heap);
/* There might be pending descriptor copies from our heap, so keep it alive
* until we have observed that all possible copies have completed using the current
* write_count timestamp. */
vkd3d_descriptor_update_ring_mark_heap_destruction(&device->descriptor_update_ring, heap);
vkd3d_private_store_destroy(&heap->private_store);
-vkd3d_free_aligned(heap);
+d3d12_descriptor_heap_dec_ref(heap);
d3d12_device_release(device);
}
@@ -5853,6 +5870,7 @@ static HRESULT d3d12_descriptor_heap_init(struct d3d12_descriptor_heap *descript
memset(descriptor_heap, 0, sizeof(*descriptor_heap));
descriptor_heap->ID3D12DescriptorHeap_iface.lpVtbl = &d3d12_descriptor_heap_vtbl;
descriptor_heap->refcount = 1;
descriptor_heap->internal_refcount = 1;
descriptor_heap->device = device;
descriptor_heap->desc = *desc;
@@ -6005,6 +6023,18 @@ HRESULT d3d12_descriptor_heap_create(struct d3d12_device *device,
{
/* See comments above on how this is supposed to work */
object->cpu_va.ptr = (SIZE_T)object + num_descriptor_bits;
/* FIXME: This is gross. We need to repurpose some of the lower order bits to make this robust. */
if (object->cpu_va.ptr & VKD3D_RESOURCE_DESC_DEFER_COPY_MASK)
{
FIXME("High bit in CPU va is set ...\n");
vkd3d_free_aligned(object);
return E_OUTOFMEMORY;
}
if (desc->Type == D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV &&
(desc->Flags & D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE))
object->cpu_va.ptr |= VKD3D_RESOURCE_DESC_DEFER_COPY_MASK;
}
else
{
@@ -6044,6 +6074,172 @@ void d3d12_descriptor_heap_cleanup(struct d3d12_descriptor_heap *descriptor_heap
vkd3d_descriptor_debug_unregister_heap(descriptor_heap->cookie);
}
HRESULT vkd3d_descriptor_update_ring_init(struct vkd3d_descriptor_update_ring *ring,
struct d3d12_device *device)
{
D3D12_DESCRIPTOR_HEAP_DESC desc;
HRESULT hr;
desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
desc.NumDescriptors = VKD3D_DESCRIPTOR_UPDATE_RING_SIZE;
desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE;
desc.NodeMask = 0;
if (FAILED(hr = d3d12_descriptor_heap_create(device, &desc, &ring->staging)))
return hr;
/* Hold private reference. */
d3d12_descriptor_heap_inc_ref(ring->staging);
d3d12_descriptor_heap_Release(&ring->staging->ID3D12DescriptorHeap_iface);
ring->staging_base = ring->staging->cpu_va.ptr;
ring->cbv_srv_uav_copies = vkd3d_calloc(VKD3D_DESCRIPTOR_UPDATE_RING_SIZE, sizeof(*ring->cbv_srv_uav_copies));
pthread_mutex_init(&ring->submission_lock, NULL);
return S_OK;
}
void vkd3d_descriptor_update_ring_cleanup(struct vkd3d_descriptor_update_ring *ring,
struct d3d12_device *device)
{
size_t i;
d3d12_descriptor_heap_dec_ref(ring->staging);
pthread_mutex_destroy(&ring->submission_lock);
vkd3d_free(ring->cbv_srv_uav_copies);
for (i = 0; i < ring->deferred_release_count; i++)
d3d12_descriptor_heap_dec_ref(ring->deferred_releases[i].heap);
vkd3d_free(ring->deferred_releases);
}
vkd3d_cpu_descriptor_va_t vkd3d_descriptor_update_ring_allocate_scratch(struct vkd3d_descriptor_update_ring *ring)
{
/* FIXME: This can in theory overflow before we've read all entries. If we start to fill the buffer,
* we can do an emergency stall, but it should basically never happen unless the application
* is specifically trying to break us. We can read the read_count. */
uint32_t offset = vkd3d_atomic_uint32_increment(&ring->staging_write_count, vkd3d_memory_order_relaxed) - 1;
offset &= VKD3D_DESCRIPTOR_UPDATE_RING_MASK;
return ring->staging_base + offset * VKD3D_RESOURCE_DESC_INCREMENT;
}
void vkd3d_descriptor_update_ring_push_copy(struct vkd3d_descriptor_update_ring *ring,
struct d3d12_device *device,
vkd3d_cpu_descriptor_va_t src_, vkd3d_cpu_descriptor_va_t dst_)
{
/* FIXME: Have to handle the case where we're overflowing the buffers. */
struct vkd3d_descriptor_update_ring_copy *copy;
void *src = (void*)src_;
void *dst = (void*)dst_;
uint32_t write_count;
uint32_t read_count;
uint32_t offset;
/* Detect internal overflow. When we've filled half the buffer, it's getting spicy,
* so perform an emergency flush.
* This should only happen in code that tries hard to break us. */
write_count = vkd3d_atomic_uint32_load_explicit(&ring->write_count, vkd3d_memory_order_relaxed);
read_count = vkd3d_atomic_uint32_load_explicit(&ring->read_count, vkd3d_memory_order_relaxed);
if ((write_count - read_count) > VKD3D_DESCRIPTOR_UPDATE_RING_SIZE / 2)
{
INFO("Emergency descriptor update ring flush in render threads!\n");
vkd3d_descriptor_update_ring_flush(ring, device);
}
write_count = vkd3d_atomic_uint32_increment(&ring->write_count, vkd3d_memory_order_relaxed) - 1;
offset = write_count & VKD3D_DESCRIPTOR_UPDATE_RING_MASK;
copy = &ring->cbv_srv_uav_copies[offset];
/* There is a race here where submission threads may observe that the write count has increased, but the application
* has not finished writing the descriptor copy yet.
* Submission stops processing in this case. On the next iteration, the read count resumes at the first descriptor that failed.
* All descriptor updates which happened before the ExecuteCommandLists
* are guaranteed to be completely written, so the only edge case occurs if ExecuteCommandLists is called
* concurrently with descriptor updates, but it's not guaranteed that ExecuteCommandLists would observe those descriptors
* in the first place. The only real memory order we need to consider is release semantics on the dst write.
* The submission thread reads dst first with acquire order, then src. */
vkd3d_atomic_ptr_store_explicit(&copy->src, src, vkd3d_memory_order_relaxed);
vkd3d_atomic_ptr_store_explicit(&copy->dst, dst, vkd3d_memory_order_release);
}
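The publish protocol described in that comment can be sketched standalone: src is stored relaxed, dst is stored with release semantics, and the consumer loads dst with acquire before trusting src. All names here are invented for illustration:

#include <stdatomic.h>
#include <stdio.h>

/* The release store to dst makes the earlier relaxed store to src visible
 * to any thread whose acquire load observes the new dst value. */
struct copy_slot
{
    void *_Atomic src;
    void *_Atomic dst;
};

static void publish(struct copy_slot *slot, void *src, void *dst)
{
    atomic_store_explicit(&slot->src, src, memory_order_relaxed);
    atomic_store_explicit(&slot->dst, dst, memory_order_release);
}

static int try_consume(struct copy_slot *slot)
{
    void *dst = atomic_load_explicit(&slot->dst, memory_order_acquire);
    void *src;

    if (!dst)
        return 0; /* not fully published yet; retry on a later flush */

    src = atomic_load_explicit(&slot->src, memory_order_relaxed);
    printf("copy %p -> %p\n", src, dst);

    /* Mark the slot consumed so it can be reused. */
    atomic_store_explicit(&slot->src, NULL, memory_order_relaxed);
    atomic_store_explicit(&slot->dst, NULL, memory_order_release);
    return 1;
}

int main(void)
{
    static int a, b;
    struct copy_slot slot = { NULL, NULL };

    publish(&slot, &a, &b);
    return !try_consume(&slot);
}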
void vkd3d_descriptor_update_ring_mark_heap_destruction(struct vkd3d_descriptor_update_ring *ring,
struct d3d12_descriptor_heap *heap)
{
uint32_t write_count;
if (heap->desc.Type != D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV)
return;
write_count = vkd3d_atomic_uint32_load_explicit(&ring->write_count, vkd3d_memory_order_relaxed);
d3d12_descriptor_heap_inc_ref(heap);
pthread_mutex_lock(&ring->submission_lock);
vkd3d_array_reserve((void**)&ring->deferred_releases, &ring->deferred_release_size,
ring->deferred_release_count + 1, sizeof(*ring->deferred_releases));
ring->deferred_releases[ring->deferred_release_count].heap = heap;
ring->deferred_releases[ring->deferred_release_count].write_count = write_count;
ring->deferred_release_count++;
pthread_mutex_unlock(&ring->submission_lock);
}
void vkd3d_descriptor_update_ring_flush(struct vkd3d_descriptor_update_ring *ring,
struct d3d12_device *device)
{
/* This is called from submission threads and performs the main "meat" of the descriptor management. */
uint32_t write_count = vkd3d_atomic_uint32_load_explicit(&ring->write_count, vkd3d_memory_order_acquire);
struct vkd3d_descriptor_update_ring_copy *copy;
vkd3d_cpu_descriptor_va_t src, dst;
uint32_t read_count;
size_t i, j;
pthread_mutex_lock(&ring->submission_lock);
read_count = ring->read_count;
/* Wrap arithmetic, so a != check is appropriate. */
while (read_count != write_count)
{
copy = &ring->cbv_srv_uav_copies[read_count & VKD3D_DESCRIPTOR_UPDATE_RING_MASK];
dst = (vkd3d_cpu_descriptor_va_t)vkd3d_atomic_ptr_load_explicit(&copy->dst, vkd3d_memory_order_acquire);
if (dst)
{
src = (vkd3d_cpu_descriptor_va_t) vkd3d_atomic_ptr_load_explicit(&copy->src, vkd3d_memory_order_relaxed);
d3d12_desc_copy_single(dst, src, device);
/* Consume the descriptor. */
vkd3d_atomic_ptr_store_explicit(&copy->dst, NULL, vkd3d_memory_order_release);
vkd3d_atomic_ptr_store_explicit(&copy->src, NULL, vkd3d_memory_order_release);
}
else
{
/* Race condition, but it's okay. We'll stop here and pick it up later.
* The write to dst will come in finite time. */
break;
}
read_count++;
}
vkd3d_atomic_uint32_store_explicit(&ring->read_count, read_count, vkd3d_memory_order_release);
/* If we're done copying descriptors from a heap, we can now release it safely. */
for (i = 0, j = 0; i < ring->deferred_release_count; i++)
{
/* With wrap arithmetic, test that (read_count - write_count) is non-negative. If the read counter hasn't caught up,
* the wrapped difference will be close to UINT_MAX instead. */
if ((read_count - ring->deferred_releases[i].write_count) <= VKD3D_DESCRIPTOR_UPDATE_RING_SIZE)
d3d12_descriptor_heap_dec_ref(ring->deferred_releases[i].heap);
else
ring->deferred_releases[j++] = ring->deferred_releases[i];
}
ring->deferred_release_count = j;
pthread_mutex_unlock(&ring->submission_lock);
}
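The deferred-release test relies on unsigned wrap arithmetic: once the reader has passed a heap's recorded write_count, the difference is a small non-negative number; while it still lags, the difference wraps to near UINT_MAX. A standalone check with example values (the helper name is invented):

#include <stdint.h>
#include <stdio.h>

#define RING_SIZE (256u * 1024u)

/* Mirrors the test above: safe to release once the reader has passed the
 * recorded write_count; a lagging reader makes the difference wrap high. */
static int safe_to_release(uint32_t read_count, uint32_t recorded_write)
{
    return (uint32_t)(read_count - recorded_write) <= RING_SIZE;
}

int main(void)
{
    printf("%d\n", safe_to_release(100u, 50u));        /* 1: reader passed */
    printf("%d\n", safe_to_release(50u, 100u));        /* 0: reader lags */
    printf("%d\n", safe_to_release(5u, 0xfffffff0u));  /* 1: passed across wrap */
    return 0;
}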
static void d3d12_query_heap_set_name(struct d3d12_query_heap *heap, const char *name)
{
if (heap->vk_query_pool)

View File

@@ -1037,6 +1037,9 @@ struct vkd3d_descriptor_binding
#define VKD3D_RESOURCE_DESC_INCREMENT_LOG2 5
#define VKD3D_RESOURCE_DESC_INCREMENT (1u << VKD3D_RESOURCE_DESC_INCREMENT_LOG2)
/* FIXME: Gross hack for now. */
#define VKD3D_RESOURCE_DESC_DEFER_COPY_MASK ((uintptr_t)INTPTR_MIN)
/* Arrange data so that it can pack as tightly as possible.
* When we copy descriptors, we must copy both structures.
* In copy_desc_range we scan through the entire metadata_binding, so
@@ -1167,6 +1170,7 @@ struct d3d12_descriptor_heap
{
ID3D12DescriptorHeap ID3D12DescriptorHeap_iface;
LONG refcount;
LONG internal_refcount;
uint64_t gpu_va;
D3D12_DESCRIPTOR_HEAP_DESC desc;
@@ -1200,6 +1204,8 @@ struct d3d12_descriptor_heap
HRESULT d3d12_descriptor_heap_create(struct d3d12_device *device,
const D3D12_DESCRIPTOR_HEAP_DESC *desc, struct d3d12_descriptor_heap **descriptor_heap);
void d3d12_descriptor_heap_dec_ref(struct d3d12_descriptor_heap *heap);
void d3d12_descriptor_heap_inc_ref(struct d3d12_descriptor_heap *heap);
void d3d12_descriptor_heap_cleanup(struct d3d12_descriptor_heap *descriptor_heap);
static inline struct d3d12_descriptor_heap *impl_from_ID3D12DescriptorHeap(ID3D12DescriptorHeap *iface)
@@ -1235,6 +1241,8 @@ static inline struct d3d12_desc_split d3d12_desc_decode_va(vkd3d_cpu_descriptor_
* Above that, we have the d3d12_descriptor_heap, which is allocated with enough alignment
* to contain these twiddle bits. */
va &= ~VKD3D_RESOURCE_DESC_DEFER_COPY_MASK;
num_bits_descriptors = va & (VKD3D_RESOURCE_DESC_INCREMENT - 1);
heap_offset = (va >> VKD3D_RESOURCE_DESC_INCREMENT_LOG2) & (((size_t)1 << num_bits_descriptors) - 1);
split.offset = (uint32_t)heap_offset;
@@ -1256,6 +1264,54 @@ static inline uint32_t d3d12_desc_heap_offset_from_gpu_handle(D3D12_GPU_DESCRIPT
return (uint32_t)handle.ptr / VKD3D_RESOURCE_DESC_INCREMENT;
}
struct vkd3d_descriptor_update_ring_copy
{
void *src;
void *dst;
};
struct vkd3d_descriptor_update_deferred_release
{
struct d3d12_descriptor_heap *heap;
uint32_t write_count;
};
#define VKD3D_DESCRIPTOR_UPDATE_RING_SIZE (256 * 1024)
#define VKD3D_DESCRIPTOR_UPDATE_RING_MASK (VKD3D_DESCRIPTOR_UPDATE_RING_SIZE - 1)
struct vkd3d_descriptor_update_ring
{
struct vkd3d_descriptor_update_ring_copy *cbv_srv_uav_copies;
uint32_t write_count;
uint32_t read_count;
vkd3d_cpu_descriptor_va_t staging_base;
struct d3d12_descriptor_heap *staging;
uint32_t staging_write_count;
struct vkd3d_descriptor_update_deferred_release *deferred_releases;
size_t deferred_release_count;
size_t deferred_release_size;
pthread_mutex_t submission_lock;
};
/* When writing shader visible descriptors, defer the write and perform the work
* last minute in submission queues. */
HRESULT vkd3d_descriptor_update_ring_init(struct vkd3d_descriptor_update_ring *ring,
struct d3d12_device *device);
void vkd3d_descriptor_update_ring_cleanup(struct vkd3d_descriptor_update_ring *ring,
struct d3d12_device *device);
void vkd3d_descriptor_update_ring_flush(struct vkd3d_descriptor_update_ring *ring,
struct d3d12_device *device);
vkd3d_cpu_descriptor_va_t vkd3d_descriptor_update_ring_allocate_scratch(struct vkd3d_descriptor_update_ring *ring);
void vkd3d_descriptor_update_ring_push_copy(struct vkd3d_descriptor_update_ring *ring,
struct d3d12_device *device,
vkd3d_cpu_descriptor_va_t src, vkd3d_cpu_descriptor_va_t dst);
void vkd3d_descriptor_update_ring_mark_heap_destruction(struct vkd3d_descriptor_update_ring *ring,
struct d3d12_descriptor_heap *heap);
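Taken together, the intended flow appears to be: render threads hitting Create*View or CopyDescriptorsSimple on a tagged VA call vkd3d_descriptor_update_ring_allocate_scratch and vkd3d_descriptor_update_ring_push_copy; submission threads call vkd3d_descriptor_update_ring_flush before processing each submission (and push_copy itself flushes as an emergency valve once the ring is half full); descriptor heap destruction is routed through vkd3d_descriptor_update_ring_mark_heap_destruction so that flush can drop the internal reference once read_count has passed the heap's recorded write_count.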
/* ID3D12QueryHeap */
struct d3d12_query_heap
{
@@ -3325,6 +3381,7 @@ struct d3d12_device
struct vkd3d_sampler_state sampler_state;
struct vkd3d_shader_debug_ring debug_ring;
struct vkd3d_pipeline_library_disk_cache disk_cache;
struct vkd3d_descriptor_update_ring descriptor_update_ring;
#ifdef VKD3D_ENABLE_BREADCRUMBS
struct vkd3d_breadcrumb_tracer breadcrumb_tracer;
#endif

View File

@@ -75,17 +75,35 @@ def main():
parser.add_argument('--per-iteration', action = 'store_true', help = 'Represent ticks in terms of ticks / iteration. Cannot be used with --divider.')
parser.add_argument('--name', nargs = '+', type = str, help = 'Only display data for certain counters.')
parser.add_argument('--sort', type = str, default = 'none', help = 'Sorts input data according to "iterations" or "ticks".')
parser.add_argument('--delta', type = str, help = 'Subtract iterations and timing from other profile blob.')
parser.add_argument('profile', help = 'The profile binary blob.')
args = parser.parse_args()
if not args.profile:
raise AssertionError('Need profile folder.')
delta_map = {}
if args.delta is not None:
with open(args.delta, 'rb') as f:
for block in iter(lambda: f.read(64), b''):
if is_valid_block(block):
b = parse_block(block)
delta_map[b.name] = b
blocks = []
with open(args.profile, 'rb') as f:
for block in iter(lambda: f.read(64), b''):
if is_valid_block(block):
-blocks.append(parse_block(block))
+b = parse_block(block)
if b.name in delta_map:
d = delta_map[b.name]
b = ProfileCase(ticks = b.ticks - d.ticks,
iterations = b.iterations - d.iterations,
name = b.name)
if b.iterations < 0 or b.ticks < 0:
raise AssertionError('After subtracting, iterations or ticks became negative.')
if b.iterations > 0:
blocks.append(b)
if args.divider is not None:
if args.per_iteration:
@@ -114,11 +132,11 @@ def main():
print(' Iterations:', block.iterations)
if args.divider is not None:
-print(' Time spent per iteration of {}: {:.3f}'.format(args.divider, block.ticks / 1000.0), "us")
+print(' Time spent per iteration of {}: {:.3f}'.format(args.divider, block.ticks / 1000.0), "Kcycles")
elif args.per_iteration:
-print(' Time spent per iteration: {:.3f}'.format(block.ticks / 1000.0), "us")
+print(' Time spent per iteration: {:.3f}'.format(block.ticks / 1000.0), "Kcycles")
else:
-print(' Total time spent: {:.3f}'.format(block.ticks / 1000.0), "us")
+print(' Total time spent: {:.3f}'.format(block.ticks / 1000.0), "Kcycles")
if __name__ == '__main__':
main()
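With the new flag, a typical comparison might look something like: profiler.py --delta before.bin after.bin (script and file names illustrative). Counters found in the baseline blob are subtracted from the new capture, a negative result aborts as a mismatched pair of blobs, and only counters with a positive remaining iteration count are reported.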