diff --git a/libs/vkd3d/memory.c b/libs/vkd3d/memory.c index 8301a287..93f9b4cb 100644 --- a/libs/vkd3d/memory.c +++ b/libs/vkd3d/memory.c @@ -637,8 +637,92 @@ static void vkd3d_memory_allocator_remove_chunk(struct vkd3d_memory_allocator *a vkd3d_memory_chunk_destroy(chunk, device, allocator); } +static void vkd3d_memory_allocator_cleanup_clear_queue(struct vkd3d_memory_allocator *allocator, struct d3d12_device *device) +{ + struct vkd3d_memory_clear_queue *clear_queue = &allocator->clear_queue; + const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; + + VK_CALL(vkDestroyCommandPool(device->vk_device, clear_queue->vk_command_pool, NULL)); + VK_CALL(vkDestroySemaphore(device->vk_device, clear_queue->vk_semaphore, NULL)); + + vkd3d_free(clear_queue->allocations); + pthread_mutex_destroy(&clear_queue->mutex); +} + +static HRESULT vkd3d_memory_allocator_init_clear_queue(struct vkd3d_memory_allocator *allocator, struct d3d12_device *device) +{ + struct vkd3d_memory_clear_queue *clear_queue = &allocator->clear_queue; + const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; + VkSemaphoreTypeCreateInfoKHR semaphore_type_info; + VkCommandBufferAllocateInfo command_buffer_info; + VkCommandPoolCreateInfo command_pool_info; + VkSemaphoreCreateInfo semaphore_info; + VkResult vr; + HRESULT hr; + int rc; + + /* vkd3d_memory_allocator_init will memset the entire + * clear_queue struct to zero prior to calling this */ + clear_queue->last_known_value = VKD3D_MEMORY_CLEAR_COMMAND_BUFFER_COUNT; + clear_queue->next_signal_value = VKD3D_MEMORY_CLEAR_COMMAND_BUFFER_COUNT + 1; + + if ((rc = pthread_mutex_init(&allocator->mutex, NULL))) + return hresult_from_errno(rc); + + command_pool_info.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + command_pool_info.pNext = NULL; + command_pool_info.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; + command_pool_info.queueFamilyIndex = device->queues[VKD3D_QUEUE_FAMILY_INTERNAL_COMPUTE]->vk_family_index; + + if ((vr = VK_CALL(vkCreateCommandPool(device->vk_device, &command_pool_info, + NULL, &clear_queue->vk_command_pool))) < 0) + { + ERR("Failed to create command pool, vr %d.\n", vr); + hr = hresult_from_vk_result(vr); + goto fail; + } + + command_buffer_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + command_buffer_info.pNext = NULL; + command_buffer_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + command_buffer_info.commandPool = clear_queue->vk_command_pool; + command_buffer_info.commandBufferCount = VKD3D_MEMORY_CLEAR_COMMAND_BUFFER_COUNT; + + if ((vr = VK_CALL(vkAllocateCommandBuffers(device->vk_device, + &command_buffer_info, clear_queue->vk_command_buffers))) < 0) + { + ERR("Failed to allocate command buffer, vr %d.\n", vr); + hr = hresult_from_vk_result(vr); + goto fail; + } + + semaphore_type_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR; + semaphore_type_info.pNext = NULL; + semaphore_type_info.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; + semaphore_type_info.initialValue = VKD3D_MEMORY_CLEAR_COMMAND_BUFFER_COUNT; + + semaphore_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + semaphore_info.pNext = &semaphore_type_info; + semaphore_info.flags = 0; + + if ((vr = VK_CALL(vkCreateSemaphore(device->vk_device, + &semaphore_info, NULL, &clear_queue->vk_semaphore))) < 0) + { + ERR("Failed to create semaphore, vr %d.\n", vr); + hr = hresult_from_vk_result(vr); + goto fail; + } + + return S_OK; + +fail: + vkd3d_memory_allocator_cleanup_clear_queue(allocator, device); + return hr; +} + HRESULT vkd3d_memory_allocator_init(struct vkd3d_memory_allocator *allocator, struct d3d12_device *device) { + HRESULT hr; int rc; memset(allocator, 0, sizeof(*allocator)); @@ -646,6 +730,12 @@ HRESULT vkd3d_memory_allocator_init(struct vkd3d_memory_allocator *allocator, st if ((rc = pthread_mutex_init(&allocator->mutex, NULL))) return hresult_from_errno(rc); + if (FAILED(hr = vkd3d_memory_allocator_init_clear_queue(allocator, device))) + { + pthread_mutex_destroy(&allocator->mutex); + return hr; + } + vkd3d_va_map_init(&allocator->va_map); return S_OK; } @@ -659,9 +749,288 @@ void vkd3d_memory_allocator_cleanup(struct vkd3d_memory_allocator *allocator, st vkd3d_free(allocator->chunks); vkd3d_va_map_cleanup(&allocator->va_map); + vkd3d_memory_allocator_cleanup_clear_queue(allocator, device); pthread_mutex_destroy(&allocator->mutex); } +static bool vkd3d_memory_allocator_wait_clear_semaphore(struct vkd3d_memory_allocator *allocator, + struct d3d12_device *device, uint64_t wait_value, uint64_t timeout) +{ + struct vkd3d_memory_clear_queue *clear_queue = &allocator->clear_queue; + const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; + VkSemaphoreWaitInfo wait_info; + uint64_t old_value, new_value; + VkResult vr; + + old_value = vkd3d_atomic_uint64_load_explicit(&clear_queue->last_known_value, vkd3d_memory_order_acquire); + + if (old_value >= wait_value) + return true; + + if (timeout) + { + wait_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR; + wait_info.pNext = NULL; + wait_info.flags = 0; + wait_info.semaphoreCount = 1; + wait_info.pSemaphores = &clear_queue->vk_semaphore; + wait_info.pValues = &wait_value; + + vr = VK_CALL(vkWaitSemaphoresKHR(device->vk_device, &wait_info, timeout)); + new_value = wait_value; + } + else + { + vr = VK_CALL(vkGetSemaphoreCounterValueKHR(device->vk_device, + clear_queue->vk_semaphore, &new_value)); + } + + if (vr < 0) + { + ERR("Failed to wait for timeline semaphore, vr %d.\n", vr); + return false; + } + + while (new_value > old_value) + { + uint64_t cur_value = vkd3d_atomic_uint64_compare_exchange(&clear_queue->last_known_value, + old_value, new_value, vkd3d_memory_order_release, vkd3d_memory_order_acquire); + + if (cur_value == old_value) + break; + + old_value = cur_value; + } + + return new_value >= wait_value; +} + +static HRESULT vkd3d_memory_allocator_flush_clears_locked(struct vkd3d_memory_allocator *allocator, + struct d3d12_device *device) +{ + const VkPipelineStageFlags vk_stage_mask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; + struct vkd3d_memory_clear_queue *clear_queue = &allocator->clear_queue; + const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs; + VkTimelineSemaphoreSubmitInfoKHR timeline_info; + struct vkd3d_queue *queue, *internal_queue; + VkCommandBufferBeginInfo begin_info; + uint32_t queue_mask, queue_index; + VkCommandBuffer vk_cmd_buffer; + VkSubmitInfo submit_info; + VkQueue vk_queue; + VkResult vr; + size_t i; + + if (!clear_queue->allocations_count) + return S_OK; + + /* Record commands late so that we can simply remove allocations from + * the queue if they got freed before the clear commands got dispatched, + * rather than rewriting the command buffer or dispatching the clear */ + internal_queue = device->queues[VKD3D_QUEUE_FAMILY_INTERNAL_COMPUTE]; + vk_cmd_buffer = clear_queue->vk_command_buffers[clear_queue->command_buffer_index]; + + vkd3d_memory_allocator_wait_clear_semaphore(allocator, device, + clear_queue->next_signal_value - VKD3D_MEMORY_CLEAR_COMMAND_BUFFER_COUNT, UINT64_MAX); + + if ((vr = VK_CALL(vkResetCommandBuffer(vk_cmd_buffer, 0)))) + { + ERR("Failed to reset command pool, vr %d.\n", vr); + return hresult_from_vk_result(vr); + } + + begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + begin_info.pNext = NULL; + begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + begin_info.pInheritanceInfo = NULL; + + if ((vr = VK_CALL(vkBeginCommandBuffer(vk_cmd_buffer, &begin_info))) < 0) + { + ERR("Failed to begin command buffer, vr %d.\n", vr); + return hresult_from_vk_result(vr); + } + + for (i = 0; i < clear_queue->allocations_count; i++) + { + const struct vkd3d_memory_allocation *allocation = clear_queue->allocations[i]; + + VK_CALL(vkCmdFillBuffer(vk_cmd_buffer, allocation->resource.vk_buffer, + allocation->offset, allocation->resource.size, 0)); + } + + if ((vr = VK_CALL(vkEndCommandBuffer(vk_cmd_buffer))) < 0) + { + ERR("Failed to end command buffer, vr %d.\n", vr); + return hresult_from_vk_result(vr); + } + + + if (!(vk_queue = vkd3d_queue_acquire(internal_queue))) + return E_FAIL; + + memset(&timeline_info, 0, sizeof(timeline_info)); + timeline_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR; + timeline_info.signalSemaphoreValueCount = 1; + timeline_info.pSignalSemaphoreValues = &clear_queue->next_signal_value; + + memset(&submit_info, 0, sizeof(submit_info)); + submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit_info.pNext = &timeline_info; + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &vk_cmd_buffer; + submit_info.signalSemaphoreCount = 1; + submit_info.pSignalSemaphores = &clear_queue->vk_semaphore; + + vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_info, VK_NULL_HANDLE)); + vkd3d_queue_release(internal_queue); + + if (vr < 0) + { + ERR("Failed to submit command buffer, vr %d.\n", vr); + return hresult_from_vk_result(vr); + } + + /* Stall future submissions on other queues until the clear has finished */ + memset(&timeline_info, 0, sizeof(timeline_info)); + timeline_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR; + timeline_info.waitSemaphoreValueCount = 1; + timeline_info.pWaitSemaphoreValues = &clear_queue->next_signal_value; + + memset(&submit_info, 0, sizeof(submit_info)); + submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit_info.pNext = &timeline_info; + submit_info.waitSemaphoreCount = 1; + submit_info.pWaitSemaphores = &clear_queue->vk_semaphore; + submit_info.pWaitDstStageMask = &vk_stage_mask; + + queue_mask = device->unique_queue_mask; + + while (queue_mask) + { + queue_index = vkd3d_bitmask_iter32(&queue_mask); + queue = device->queues[queue_index]; + + if (!(vk_queue = vkd3d_queue_acquire(queue))) + return E_FAIL; + + vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_info, VK_NULL_HANDLE)); + vkd3d_queue_release(queue); + + if (vr < 0) + { + ERR("Failed to submit semaphore wait, vr %d.\n", vr); + return hresult_from_vk_result(vr); + } + } + + /* Keep next_signal always one ahead of the last signaled value */ + clear_queue->next_signal_value += 1; + clear_queue->num_bytes_pending = 0; + clear_queue->allocations_count = 0; + clear_queue->command_buffer_index += 1; + clear_queue->command_buffer_index %= VKD3D_MEMORY_CLEAR_COMMAND_BUFFER_COUNT; + return S_OK; +} + +HRESULT vkd3d_memory_allocator_flush_clears(struct vkd3d_memory_allocator *allocator, struct d3d12_device *device) +{ + struct vkd3d_memory_clear_queue *clear_queue = &allocator->clear_queue; + HRESULT hr; + + pthread_mutex_lock(&clear_queue->mutex); + hr = vkd3d_memory_allocator_flush_clears_locked(allocator, device); + pthread_mutex_unlock(&clear_queue->mutex); + return hr; +} + +#define VKD3D_MEMORY_CLEAR_QUEUE_MAX_PENDING_BYTES (256ull << 20) /* 256 MiB */ + +static void vkd3d_memory_allocator_clear_allocation(struct vkd3d_memory_allocator *allocator, + struct d3d12_device *device, struct vkd3d_memory_allocation *allocation) +{ + struct vkd3d_memory_clear_queue *clear_queue = &allocator->clear_queue; + + if (allocation->cpu_address) + { + /* Probably faster than doing this on the GPU + * and having to worry about synchronization */ + memset(allocation->cpu_address, 0, allocation->resource.size); + } + else if (allocation->resource.vk_buffer) + { + pthread_mutex_lock(&clear_queue->mutex); + + if (!vkd3d_array_reserve((void**)&clear_queue->allocations, &clear_queue->allocations_size, + clear_queue->allocations_count + 1, sizeof(*clear_queue->allocations))) + { + ERR("Failed to insert free range.\n"); + pthread_mutex_unlock(&clear_queue->mutex); + return; + } + + allocation->clear_semaphore_value = clear_queue->next_signal_value; + + if (allocation->chunk) + allocation->chunk->allocation.clear_semaphore_value = clear_queue->next_signal_value; + + clear_queue->allocations[clear_queue->allocations_count++] = allocation; + clear_queue->num_bytes_pending += allocation->resource.size; + + if (clear_queue->num_bytes_pending >= VKD3D_MEMORY_CLEAR_QUEUE_MAX_PENDING_BYTES) + vkd3d_memory_allocator_flush_clears_locked(allocator, device); + + pthread_mutex_unlock(&clear_queue->mutex); + } +} + +static void vkd3d_memory_allocator_wait_allocation(struct vkd3d_memory_allocator *allocator, + struct d3d12_device *device, const struct vkd3d_memory_allocation *allocation) +{ + struct vkd3d_memory_clear_queue *clear_queue = &allocator->clear_queue; + uint64_t wait_value = allocation->clear_semaphore_value; + size_t i; + + /* If the clear semaphore has been signaled to the expected value, + * the GPU is already done clearing the allocation, and it cannot + * be in the clear queue either, so there is nothing to do. */ + if (vkd3d_memory_allocator_wait_clear_semaphore(allocator, device, wait_value, 0)) + return; + + /* If the allocation is still in the queue, the GPU has not started + * using it yet so we can remove it from the queue and exit. */ + pthread_mutex_lock(&clear_queue->mutex); + + for (i = 0; i < clear_queue->allocations_count; i++) + { + if (clear_queue->allocations[i] == allocation) + { + clear_queue->allocations[i] = clear_queue->allocations[--clear_queue->allocations_count]; + clear_queue->num_bytes_pending -= allocation->resource.size; + pthread_mutex_unlock(&clear_queue->mutex); + return; + } + } + + /* If this is a chunk and a suballocation from it had been immediately + * freed, it is possible that the suballocation got removed from the + * clear queue so that the chunk's wait value never gets signaled. Wait + * for the last signaled value in that case. */ + if (wait_value == clear_queue->next_signal_value) + wait_value = clear_queue->next_signal_value - 1; + + pthread_mutex_unlock(&clear_queue->mutex); + + /* If this allocation was suballocated from a chunk, we will wait + * on the semaphore when the parent chunk itself gets destroyed. */ + if (allocation->chunk) + return; + + /* Otherwise, we actually have to wait for the GPU. */ + WARN("Waiting for GPU to clear allocation %p.\n", allocation); + + vkd3d_memory_allocator_wait_clear_semaphore(allocator, device, wait_value, UINT64_MAX); +} + static HRESULT vkd3d_memory_allocator_add_chunk(struct vkd3d_memory_allocator *allocator, struct d3d12_device *device, const D3D12_HEAP_PROPERTIES *heap_properties, D3D12_HEAP_FLAGS heap_flags, uint32_t type_mask, struct vkd3d_memory_chunk **chunk) { diff --git a/libs/vkd3d/vkd3d_private.h b/libs/vkd3d/vkd3d_private.h index c685438b..bb9b56dc 100644 --- a/libs/vkd3d/vkd3d_private.h +++ b/libs/vkd3d/vkd3d_private.h @@ -538,6 +538,8 @@ struct vkd3d_memory_allocation uint32_t vk_memory_type; uint32_t flags; + uint64_t clear_semaphore_value; + struct vkd3d_memory_chunk *chunk; }; @@ -567,6 +569,27 @@ struct vkd3d_memory_chunk size_t free_ranges_count; }; +#define VKD3D_MEMORY_CLEAR_COMMAND_BUFFER_COUNT (16u) + +struct vkd3d_memory_clear_queue +{ + pthread_mutex_t mutex; + + VkCommandBuffer vk_command_buffers[VKD3D_MEMORY_CLEAR_COMMAND_BUFFER_COUNT]; + VkCommandPool vk_command_pool; + VkSemaphore vk_semaphore; + + UINT64 last_known_value; + UINT64 next_signal_value; + + VkDeviceSize num_bytes_pending; + uint32_t command_buffer_index; + + struct vkd3d_memory_allocation **allocations; + size_t allocations_size; + size_t allocations_count; +}; + struct vkd3d_memory_allocator { pthread_mutex_t mutex; @@ -576,6 +599,8 @@ struct vkd3d_memory_allocator size_t chunks_count; struct vkd3d_va_map va_map; + + struct vkd3d_memory_clear_queue clear_queue; }; void vkd3d_free_memory_2(struct d3d12_device *device, struct vkd3d_memory_allocator *allocator, @@ -587,6 +612,7 @@ HRESULT vkd3d_allocate_resource_memory_2(struct d3d12_device *device, struct vkd HRESULT vkd3d_memory_allocator_init(struct vkd3d_memory_allocator *allocator, struct d3d12_device *device); void vkd3d_memory_allocator_cleanup(struct vkd3d_memory_allocator *allocator, struct d3d12_device *device); +HRESULT vkd3d_memory_allocator_flush_clears(struct vkd3d_memory_allocator *allocator, struct d3d12_device *device); /* ID3D12Heap */ typedef ID3D12Heap1 d3d12_heap_iface;