/*
 * Copyright 2021 Philip Rebohle for Valve Corporation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
 */

#define VKD3D_DBG_CHANNEL VKD3D_DBG_CHANNEL_API

#include "vkd3d_private.h"
#include "vkd3d_descriptor_debug.h"

static void vkd3d_memory_allocator_wait_allocation(struct vkd3d_memory_allocator *allocator,
        struct d3d12_device *device, const struct vkd3d_memory_allocation *allocation);

static uint32_t vkd3d_select_memory_types(struct d3d12_device *device,
        const D3D12_HEAP_PROPERTIES *heap_properties, D3D12_HEAP_FLAGS heap_flags)
{
    const VkPhysicalDeviceMemoryProperties *memory_info = &device->memory_properties;
    uint32_t type_mask = (1 << memory_info->memoryTypeCount) - 1;
    const struct vkd3d_memory_info_domain *domain_info;

    domain_info = d3d12_device_get_memory_info_domain(device, heap_properties);

    if (!(heap_flags & D3D12_HEAP_FLAG_DENY_BUFFERS))
        type_mask &= domain_info->buffer_type_mask;

    if (!(heap_flags & D3D12_HEAP_FLAG_DENY_NON_RT_DS_TEXTURES))
        type_mask &= domain_info->sampled_type_mask;

    /* Render targets are not allowed on UPLOAD and READBACK heaps */
    if (!(heap_flags & D3D12_HEAP_FLAG_DENY_RT_DS_TEXTURES) &&
            heap_properties->Type != D3D12_HEAP_TYPE_UPLOAD &&
            heap_properties->Type != D3D12_HEAP_TYPE_READBACK)
        type_mask &= domain_info->rt_ds_type_mask;

    if (!type_mask)
        ERR("No memory type found for heap flags %#x.\n", heap_flags);

    return type_mask;
}

static uint32_t vkd3d_find_memory_types_with_flags(struct d3d12_device *device, VkMemoryPropertyFlags type_flags)
{
    const VkPhysicalDeviceMemoryProperties *memory_info = &device->memory_properties;
    uint32_t i, mask = 0;

    for (i = 0; i < memory_info->memoryTypeCount; i++)
    {
        if ((memory_info->memoryTypes[i].propertyFlags & type_flags) == type_flags)
            mask |= 1u << i;
    }

    return mask;
}

static HRESULT vkd3d_select_memory_flags(struct d3d12_device *device,
        const D3D12_HEAP_PROPERTIES *heap_properties, VkMemoryPropertyFlags *type_flags)
{
    HRESULT hr;

    switch (heap_properties->Type)
    {
        case D3D12_HEAP_TYPE_DEFAULT:
            *type_flags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
            break;

        case D3D12_HEAP_TYPE_UPLOAD:
            *type_flags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;

            if (vkd3d_config_flags & VKD3D_CONFIG_FLAG_FORCE_HOST_CACHED)
                *type_flags |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
            else if (!(vkd3d_config_flags & VKD3D_CONFIG_FLAG_NO_UPLOAD_HVV))
                *type_flags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
            break;

        case D3D12_HEAP_TYPE_READBACK:
            *type_flags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
            break;

        case D3D12_HEAP_TYPE_CUSTOM:
            if (FAILED(hr = d3d12_device_validate_custom_heap_type(device, heap_properties)))
                return hr;

            switch (heap_properties->CPUPageProperty)
            {
                case D3D12_CPU_PAGE_PROPERTY_WRITE_BACK:
                    *type_flags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
                    break;

                case D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE:
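                    /* Write-combined pages want coherent, uncached memory;
                     * HOST_CACHED is only added when explicitly forced via config. */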
                    *type_flags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;

                    if (vkd3d_config_flags & VKD3D_CONFIG_FLAG_FORCE_HOST_CACHED)
                        *type_flags |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
                    break;

                case D3D12_CPU_PAGE_PROPERTY_NOT_AVAILABLE:
                    *type_flags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
                    break;

                default:
                    return E_INVALIDARG;
            }
            break;

        default:
            WARN("Invalid heap type %#x.\n", heap_properties->Type);
            return E_INVALIDARG;
    }

    return S_OK;
}

static HRESULT vkd3d_create_global_buffer(struct d3d12_device *device, VkDeviceSize size,
        const D3D12_HEAP_PROPERTIES *heap_properties, D3D12_HEAP_FLAGS heap_flags, VkBuffer *vk_buffer)
{
    D3D12_RESOURCE_DESC1 resource_desc;

    memset(&resource_desc, 0, sizeof(resource_desc));
    resource_desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
    resource_desc.Width = size;
    resource_desc.Height = 1;
    resource_desc.DepthOrArraySize = 1;
    resource_desc.MipLevels = 1;
    resource_desc.SampleDesc.Count = 1;
    resource_desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;

    if (heap_flags & D3D12_HEAP_FLAG_SHARED_CROSS_ADAPTER)
        resource_desc.Flags |= D3D12_RESOURCE_FLAG_ALLOW_CROSS_ADAPTER;

    if (heap_properties->Type != D3D12_HEAP_TYPE_UPLOAD &&
            heap_properties->Type != D3D12_HEAP_TYPE_READBACK)
        resource_desc.Flags |= D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;

    return vkd3d_create_buffer(device, heap_properties, heap_flags, &resource_desc, vk_buffer);
}

void vkd3d_free_device_memory(struct d3d12_device *device, const struct vkd3d_device_memory_allocation *allocation)
{
    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
    VkDeviceSize *type_current;
    bool budget_sensitive;

    if (allocation->vk_memory == VK_NULL_HANDLE)
    {
        /* Deferred heap. Return early to skip confusing log messages. */
        return;
    }

    VK_CALL(vkFreeMemory(device->vk_device, allocation->vk_memory, NULL));

    budget_sensitive = !!(device->memory_info.budget_sensitive_mask & (1u << allocation->vk_memory_type));
    if (budget_sensitive)
    {
        type_current = &device->memory_info.type_current[allocation->vk_memory_type];
        pthread_mutex_lock(&device->memory_info.budget_lock);
        assert(*type_current >= allocation->size);
        *type_current -= allocation->size;
        if (vkd3d_config_flags & VKD3D_CONFIG_FLAG_LOG_MEMORY_BUDGET)
        {
            INFO("Freeing memory of type %u, new total allocated size %"PRIu64" MiB.\n",
                    allocation->vk_memory_type, *type_current / (1024 * 1024));
        }
        pthread_mutex_unlock(&device->memory_info.budget_lock);
    }
    else if (vkd3d_config_flags & VKD3D_CONFIG_FLAG_LOG_MEMORY_BUDGET)
    {
        INFO("Freeing memory of type %u, %"PRIu64" KiB.\n",
                allocation->vk_memory_type, allocation->size / 1024);
    }
}

static HRESULT vkd3d_try_allocate_device_memory(struct d3d12_device *device,
        VkDeviceSize size, VkMemoryPropertyFlags type_flags, uint32_t type_mask,
        void *pNext, struct vkd3d_device_memory_allocation *allocation)
{
    const VkPhysicalDeviceMemoryProperties *memory_props = &device->memory_properties;
    const VkMemoryPropertyFlags optional_flags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
    struct vkd3d_memory_info *memory_info = &device->memory_info;
    VkMemoryAllocateInfo allocate_info;
    VkDeviceSize *type_current;
    VkDeviceSize *type_budget;
    bool budget_sensitive;
    VkResult vr;

    /* buffer_mask / sampled_mask etc will generally take care of this,
     * but for certain fallback scenarios where we select other memory
     * types, we need to mask here as well.
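     * An example is the system memory fallback in vkd3d_allocate_device_memory(),
     * which retries with the caller's unmodified type_mask.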
     */
    type_mask &= device->memory_info.global_mask;

    allocate_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    allocate_info.pNext = pNext;
    allocate_info.allocationSize = size;

    while (type_mask)
    {
        uint32_t type_index = vkd3d_bitmask_iter32(&type_mask);

        if ((memory_props->memoryTypes[type_index].propertyFlags & type_flags) != type_flags)
            continue;

        allocate_info.memoryTypeIndex = type_index;

        budget_sensitive = !!(device->memory_info.budget_sensitive_mask & (1u << type_index));
        if (budget_sensitive)
        {
            type_budget = &memory_info->type_budget[type_index];
            type_current = &memory_info->type_current[type_index];
            pthread_mutex_lock(&memory_info->budget_lock);
            if (*type_current + size > *type_budget)
            {
                if (vkd3d_config_flags & VKD3D_CONFIG_FLAG_LOG_MEMORY_BUDGET)
                {
                    INFO("Attempting to allocate from memory type %u, but exceeding fixed budget: %"PRIu64" + %"PRIu64" > %"PRIu64".\n",
                            type_index, *type_current, size, *type_budget);
                }
                pthread_mutex_unlock(&memory_info->budget_lock);
                /* If we're out of DEVICE budget, don't try other types. */
                if (type_flags & optional_flags)
                    return E_OUTOFMEMORY;
                else
                    continue;
            }
        }

        vr = VK_CALL(vkAllocateMemory(device->vk_device, &allocate_info, NULL, &allocation->vk_memory));

        if (budget_sensitive)
        {
            if (vr == VK_SUCCESS)
            {
                *type_current += size;
                if (vkd3d_config_flags & VKD3D_CONFIG_FLAG_LOG_MEMORY_BUDGET)
                {
                    INFO("Allocated memory of type %u, new total allocated size %"PRIu64" MiB.\n",
                            type_index, *type_current / (1024 * 1024));
                }
            }
            pthread_mutex_unlock(&memory_info->budget_lock);
        }
        else if (vkd3d_config_flags & VKD3D_CONFIG_FLAG_LOG_MEMORY_BUDGET)
        {
            INFO("%s memory of type #%u, size %"PRIu64" KiB.\n",
                    (vr == VK_SUCCESS ? "Allocated" : "Failed to allocate"),
                    type_index, allocate_info.allocationSize / 1024);
        }

        if (vr == VK_SUCCESS)
        {
            allocation->vk_memory_type = type_index;
            allocation->size = size;
            return S_OK;
        }
        else if (type_flags & optional_flags)
        {
            /* If we fail to allocate DEVICE_LOCAL memory, immediately fail the call.
             * This way we avoid any attempt to fall back to PCI-e BAR memory types
             * which are also DEVICE_LOCAL.
             * After failure, the calling code removes the DEVICE_LOCAL_BIT flag and tries again,
             * where we will fall back to system memory instead. */
            return E_OUTOFMEMORY;
        }
    }

    return E_OUTOFMEMORY;
}

static bool vkd3d_memory_info_type_mask_covers_multiple_memory_heaps(
        const struct VkPhysicalDeviceMemoryProperties *props, uint32_t type_mask)
{
    uint32_t heap_mask = 0;

    if (!type_mask)
        return false;

    while (type_mask)
        heap_mask |= 1u << props->memoryTypes[vkd3d_bitmask_iter32(&type_mask)].heapIndex;

    return !!(heap_mask & (heap_mask - 1u));
}

HRESULT vkd3d_allocate_device_memory(struct d3d12_device *device,
        VkDeviceSize size, VkMemoryPropertyFlags type_flags, uint32_t type_mask,
        void *pNext, struct vkd3d_device_memory_allocation *allocation)
{
    const VkMemoryPropertyFlags optional_flags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
    HRESULT hr;

    hr = vkd3d_try_allocate_device_memory(device, size, type_flags,
            type_mask, pNext, allocation);

    if (FAILED(hr) && (type_flags & optional_flags))
    {
        if (vkd3d_memory_info_type_mask_covers_multiple_memory_heaps(&device->memory_properties, type_mask))
        {
            WARN("Memory allocation failed, falling back to system memory.\n");
            hr = vkd3d_try_allocate_device_memory(device, size,
                    type_flags & ~optional_flags, type_mask, pNext, allocation);
        }
        else if (device->memory_properties.memoryHeapCount > 1)
        {
            /* It might be the case (NV with RT/DS heap) that we just cannot fall back in any meaningful way.
             * E.g. there exists no memory type that is not DEVICE_LOCAL and covers both RT and DS.
             * For this case, we have no choice but to not allocate,
             * and defer actual memory allocation to CreatePlacedResource() time.
             * NVIDIA bug reference for fixing this case: 2175829. */
            WARN("Memory allocation failed, but it is not possible to fall back to system memory here. Deferring allocation.\n");
            return hr;
        }
        /* If we fail to allocate, and only have one heap to work with (iGPU),
         * falling back is meaningless, just fail. */
    }

    if (FAILED(hr))
    {
        ERR("Failed to allocate device memory (size %"PRIu64", type_flags %#x, type_mask %#x).\n",
                size, type_flags, type_mask);
    }

    return hr;
}

static HRESULT vkd3d_import_host_memory(struct d3d12_device *device, void *host_address,
        VkDeviceSize size, VkMemoryPropertyFlags type_flags, uint32_t type_mask,
        void *pNext, struct vkd3d_device_memory_allocation *allocation)
{
    VkImportMemoryHostPointerInfoEXT import_info;
    HRESULT hr;

    import_info.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT;
    import_info.pNext = pNext;
    import_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
    import_info.pHostPointer = host_address;

    if (FAILED(hr = vkd3d_try_allocate_device_memory(device, size,
            type_flags, type_mask, &import_info, allocation)))
    {
        WARN("Failed to import host memory, hr %#x.\n", hr);
        /* If we failed, fall back to a host-visible allocation. Generally
         * the app will access the memory through the main host pointer,
         * so it's fine. */
        hr = vkd3d_try_allocate_device_memory(device, size,
                VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
                type_mask, &import_info, allocation);
    }

    return hr;
}

static HRESULT vkd3d_allocation_assign_gpu_address(struct vkd3d_memory_allocation *allocation,
        struct d3d12_device *device, struct vkd3d_memory_allocator *allocator)
{
    if (device->device_info.buffer_device_address_features.bufferDeviceAddress)
        allocation->resource.va = vkd3d_get_buffer_device_address(device, allocation->resource.vk_buffer);
    else if (!(allocation->flags & VKD3D_ALLOCATION_FLAG_INTERNAL_SCRATCH))
        allocation->resource.va = vkd3d_va_map_alloc_fake_va(&allocator->va_map, allocation->resource.size);
    else
        allocation->resource.va = 0xdeadbeef;

    if (!allocation->resource.va)
    {
        ERR("Failed to get GPU address for allocation.\n");
        return E_OUTOFMEMORY;
    }

    /* Internal scratch buffers are not visible to the application, so we
     * never have to map them back to a VkBuffer. */
    if (!(allocation->flags & VKD3D_ALLOCATION_FLAG_INTERNAL_SCRATCH))
        vkd3d_va_map_insert(&allocator->va_map, &allocation->resource);

    return S_OK;
}

static void *vkd3d_allocate_write_watch_pointer(const D3D12_HEAP_PROPERTIES *properties, VkDeviceSize size)
{
#ifdef _WIN32
    DWORD protect;
    void *ptr;

    switch (properties->Type)
    {
        case D3D12_HEAP_TYPE_DEFAULT:
            return NULL;

        case D3D12_HEAP_TYPE_UPLOAD:
            protect = PAGE_READWRITE | PAGE_WRITECOMBINE;
            break;

        case D3D12_HEAP_TYPE_READBACK:
            /* WRITE_WATCH fails for this type in native D3D12,
             * otherwise it would be PAGE_READWRITE.
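             * We match that behaviour by reporting no write watch support here.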
             */
            return NULL;

        case D3D12_HEAP_TYPE_CUSTOM:
            switch (properties->CPUPageProperty)
            {
                case D3D12_CPU_PAGE_PROPERTY_UNKNOWN:
                    return NULL;

                case D3D12_CPU_PAGE_PROPERTY_NOT_AVAILABLE:
                    return NULL;

                case D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE:
                    protect = PAGE_READWRITE | PAGE_WRITECOMBINE;
                    break;

                case D3D12_CPU_PAGE_PROPERTY_WRITE_BACK:
                    protect = PAGE_READWRITE;
                    break;

                default:
                    ERR("Invalid CPU page property %#x.\n", properties->CPUPageProperty);
                    return NULL;
            }
            break;

        default:
            ERR("Invalid heap type %#x.\n", properties->Type);
            return NULL;
    }

    if (!(ptr = VirtualAlloc(NULL, (SIZE_T)size,
            MEM_COMMIT | MEM_RESERVE | MEM_WRITE_WATCH, protect)))
    {
        ERR("Failed to allocate write watch pointer %#x.\n", GetLastError());
        return NULL;
    }

    return ptr;
#else
    (void)properties;
    (void)size;

    ERR("WRITE_WATCH not supported on this platform.\n");
    return NULL;
#endif
}

static void vkd3d_free_write_watch_pointer(void *pointer)
{
#ifdef _WIN32
    if (!VirtualFree(pointer, 0, MEM_RELEASE))
        ERR("Failed to free write watch pointer %#x.\n", GetLastError());
#else
    /* Not supported on other platforms. */
    (void)pointer;
#endif
}

static void vkd3d_memory_allocation_free(const struct vkd3d_memory_allocation *allocation,
        struct d3d12_device *device, struct vkd3d_memory_allocator *allocator)
{
    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;

    TRACE("allocation %p, device %p, allocator %p.\n", allocation, device, allocator);

    vkd3d_descriptor_debug_unregister_cookie(device->descriptor_qa_global_info, allocation->resource.cookie);

    if (allocation->flags & VKD3D_ALLOCATION_FLAG_ALLOW_WRITE_WATCH)
        vkd3d_free_write_watch_pointer(allocation->cpu_address);

    if ((allocation->flags & VKD3D_ALLOCATION_FLAG_GPU_ADDRESS) && allocation->resource.va)
    {
        if (!(allocation->flags & VKD3D_ALLOCATION_FLAG_INTERNAL_SCRATCH))
        {
            vkd3d_va_map_remove(&allocator->va_map, &allocation->resource);

            if (!device->device_info.buffer_device_address_features.bufferDeviceAddress)
                vkd3d_va_map_free_fake_va(&allocator->va_map, allocation->resource.va, allocation->resource.size);
        }
    }

    if (allocation->resource.view_map)
    {
        vkd3d_view_map_destroy(allocation->resource.view_map, device);
        vkd3d_free(allocation->resource.view_map);
    }

    if (allocation->flags & VKD3D_ALLOCATION_FLAG_GLOBAL_BUFFER)
        VK_CALL(vkDestroyBuffer(device->vk_device, allocation->resource.vk_buffer, NULL));

    vkd3d_free_device_memory(device, &allocation->device_allocation);
}

static HRESULT vkd3d_memory_allocation_init(struct vkd3d_memory_allocation *allocation,
        struct d3d12_device *device, struct vkd3d_memory_allocator *allocator,
        const struct vkd3d_allocate_memory_info *info)
{
    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
    VkMemoryRequirements memory_requirements;
    VkMemoryAllocateFlagsInfo flags_info;
    VkMemoryPropertyFlags type_flags;
    VkBindBufferMemoryInfo bind_info;
    void *host_ptr = info->host_ptr;
    uint32_t type_mask;
    VkResult vr;
    HRESULT hr;

    TRACE("allocation %p, device %p, allocator %p, info %p.\n", allocation, device, allocator, info);

    memset(allocation, 0, sizeof(*allocation));
    allocation->heap_type = info->heap_properties.Type;
    allocation->heap_flags = info->heap_flags;
    allocation->flags = info->flags;

    /* This also sort of validates the heap description,
     * so we want to do this before creating any objects */
    if (FAILED(hr = vkd3d_select_memory_flags(device, &info->heap_properties, &type_flags)))
        return hr;

    /* Mask out optional memory properties as needed.
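     * (In practice this strips DEVICE_LOCAL_BIT again after a failed device-local attempt.)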
     * This is relevant for chunk allocator fallbacks
     * since the info->memory_requirements already encodes
     * only HOST_VISIBLE types and we use NO_FALLBACK allocation mode. */
    type_flags &= ~info->optional_memory_properties;

    if (allocation->flags & VKD3D_ALLOCATION_FLAG_GLOBAL_BUFFER)
    {
        /* If requested, create a buffer covering the entire allocation
         * and derive the exact memory requirements from that. Any buffer
         * resources are just going to use this buffer with an offset. */
        if (FAILED(hr = vkd3d_create_global_buffer(device, info->memory_requirements.size,
                &info->heap_properties, info->heap_flags, &allocation->resource.vk_buffer)))
            return hr;

        VK_CALL(vkGetBufferMemoryRequirements(device->vk_device,
                allocation->resource.vk_buffer, &memory_requirements));
        memory_requirements.memoryTypeBits &= info->memory_requirements.memoryTypeBits;
    }
    else
    {
        /* Respect existing memory requirements since there may not
         * be any buffer resource to get memory requirements from. */
        memory_requirements = info->memory_requirements;
    }

    /* If an allocation is a dedicated fallback allocation,
     * we must not look at heap_flags, since we might end up noping out
     * the memory types we want to allocate with. */
    type_mask = memory_requirements.memoryTypeBits;
    if (info->flags & VKD3D_ALLOCATION_FLAG_DEDICATED)
        type_mask &= device->memory_info.global_mask;
    else
        type_mask &= vkd3d_select_memory_types(device, &info->heap_properties, info->heap_flags);

    /* Allocate actual backing storage */
    flags_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO;
    flags_info.pNext = info->pNext;
    flags_info.flags = 0;

    if (allocation->resource.vk_buffer)
    {
        allocation->flags |= VKD3D_ALLOCATION_FLAG_GPU_ADDRESS;

        if (device->device_info.buffer_device_address_features.bufferDeviceAddress)
            flags_info.flags |= VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR;
    }

    allocation->resource.size = info->memory_requirements.size;

    if (info->heap_flags & D3D12_HEAP_FLAG_ALLOW_WRITE_WATCH)
    {
        assert(!host_ptr);
        allocation->flags |= VKD3D_ALLOCATION_FLAG_ALLOW_WRITE_WATCH;

        if (!(host_ptr = vkd3d_allocate_write_watch_pointer(&info->heap_properties, memory_requirements.size)))
        {
            VK_CALL(vkDestroyBuffer(device->vk_device, allocation->resource.vk_buffer, NULL));
            return E_INVALIDARG;
        }
    }

    if (host_ptr)
    {
        hr = vkd3d_import_host_memory(device, host_ptr, memory_requirements.size,
                type_flags, type_mask, &flags_info, &allocation->device_allocation);
    }
    else if (info->flags & VKD3D_ALLOCATION_FLAG_NO_FALLBACK)
    {
        hr = vkd3d_try_allocate_device_memory(device, memory_requirements.size,
                type_flags, type_mask, &flags_info, &allocation->device_allocation);
    }
    else
    {
        hr = vkd3d_allocate_device_memory(device, memory_requirements.size,
                type_flags, type_mask, &flags_info, &allocation->device_allocation);
    }

    if (FAILED(hr))
    {
        VK_CALL(vkDestroyBuffer(device->vk_device, allocation->resource.vk_buffer, NULL));
        return hr;
    }

    /* Map memory if the allocation was requested to be host-visible,
     * but do not map if the allocation was meant to be device-local
     * since that may negatively impact performance. */
    if (host_ptr)
    {
        allocation->flags |= VKD3D_ALLOCATION_FLAG_CPU_ACCESS;
        /* No need to call map here, we already know the pointer.
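         * Imported and write-watch host memory is host-visible by construction.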
         */
        allocation->cpu_address = host_ptr;
    }
    else if (type_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)
    {
        allocation->flags |= VKD3D_ALLOCATION_FLAG_CPU_ACCESS;

        if ((vr = VK_CALL(vkMapMemory(device->vk_device, allocation->device_allocation.vk_memory,
                0, VK_WHOLE_SIZE, 0, &allocation->cpu_address))))
        {
            ERR("Failed to map memory, vr %d.\n", vr);
            vkd3d_memory_allocation_free(allocation, device, allocator);
            return hresult_from_vk_result(vr);
        }
    }

    /* Bind memory to global or dedicated buffer as needed */
    if (allocation->resource.vk_buffer)
    {
        bind_info.sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO;
        bind_info.pNext = NULL;
        bind_info.buffer = allocation->resource.vk_buffer;
        bind_info.memory = allocation->device_allocation.vk_memory;
        bind_info.memoryOffset = 0;

        if ((vr = VK_CALL(vkBindBufferMemory2KHR(device->vk_device, 1, &bind_info))) < 0)
        {
            ERR("Failed to bind buffer memory, vr %d.\n", vr);
            vkd3d_memory_allocation_free(allocation, device, allocator);
            return hresult_from_vk_result(vr);
        }

        /* Assign GPU address as necessary. */
        if (allocation->flags & VKD3D_ALLOCATION_FLAG_GPU_ADDRESS)
        {
            if (FAILED(hr = vkd3d_allocation_assign_gpu_address(allocation, device, allocator)))
            {
                vkd3d_memory_allocation_free(allocation, device, allocator);
                return hr;
            }
        }
    }

    allocation->resource.cookie = vkd3d_allocate_cookie();
    vkd3d_descriptor_debug_register_allocation_cookie(device->descriptor_qa_global_info,
            allocation->resource.cookie, info);

    TRACE("Created allocation %p on memory type %u (%"PRIu64" bytes).\n",
            allocation, allocation->device_allocation.vk_memory_type, allocation->resource.size);
    return S_OK;
}

static void vkd3d_memory_chunk_insert_range(struct vkd3d_memory_chunk *chunk,
        size_t index, VkDeviceSize offset, VkDeviceSize length)
{
    if (!vkd3d_array_reserve((void**)&chunk->free_ranges, &chunk->free_ranges_size,
            chunk->free_ranges_count + 1, sizeof(*chunk->free_ranges)))
    {
        ERR("Failed to insert free range.\n");
        return;
    }

    memmove(&chunk->free_ranges[index + 1], &chunk->free_ranges[index],
            sizeof(*chunk->free_ranges) * (chunk->free_ranges_count - index));

    chunk->free_ranges[index].offset = offset;
    chunk->free_ranges[index].length = length;
    chunk->free_ranges_count++;
}

static void vkd3d_memory_chunk_remove_range(struct vkd3d_memory_chunk *chunk, size_t index)
{
    chunk->free_ranges_count--;

    memmove(&chunk->free_ranges[index], &chunk->free_ranges[index + 1],
            sizeof(*chunk->free_ranges) * (chunk->free_ranges_count - index));
}

static HRESULT vkd3d_memory_chunk_allocate_range(struct vkd3d_memory_chunk *chunk,
        const VkMemoryRequirements *memory_requirements, struct vkd3d_memory_allocation *allocation)
{
    struct vkd3d_memory_free_range *pick_range;
    VkDeviceSize l_length, r_length;
    size_t i, pick_index;

    if (!chunk->free_ranges_count)
        return E_OUTOFMEMORY;

    pick_index = chunk->free_ranges_count;
    pick_range = NULL;

    for (i = 0; i < chunk->free_ranges_count; i++)
    {
        struct vkd3d_memory_free_range *range = &chunk->free_ranges[i];

        if (range->offset + range->length < align(range->offset, memory_requirements->alignment) + memory_requirements->size)
            continue;

        /* Exact fit leaving no gaps */
        if (range->length == memory_requirements->size)
        {
            pick_index = i;
            pick_range = range;
            break;
        }

        /* Alignment is almost always going to be 64 KiB, so
         * don't worry too much about misalignment gaps here */
        if (!pick_range || range->length > pick_range->length)
        {
            pick_index = i;
            pick_range = range;
        }
    }

    if (!pick_range)
        return E_OUTOFMEMORY;

    /* Adjust offsets and addresses of the base allocation */
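    /* The slice below aligns the offset inside the picked range; the remainder
     * logic that follows splits the range into up to two smaller free ranges. */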
    vkd3d_memory_allocation_slice(allocation, &chunk->allocation,
            align(pick_range->offset, memory_requirements->alignment), memory_requirements->size);
    allocation->chunk = chunk;

    /* Remove allocated range from the free list */
    l_length = allocation->offset - pick_range->offset;
    r_length = pick_range->offset + pick_range->length
            - allocation->offset - allocation->resource.size;

    if (l_length)
    {
        pick_range->length = l_length;

        if (r_length)
        {
            vkd3d_memory_chunk_insert_range(chunk, pick_index + 1,
                    allocation->offset + allocation->resource.size, r_length);
        }
    }
    else if (r_length)
    {
        pick_range->offset = allocation->offset + allocation->resource.size;
        pick_range->length = r_length;
    }
    else
    {
        vkd3d_memory_chunk_remove_range(chunk, pick_index);
    }

    return S_OK;
}

static size_t vkd3d_memory_chunk_find_range(struct vkd3d_memory_chunk *chunk, VkDeviceSize offset)
{
    struct vkd3d_memory_free_range *range;
    size_t index, hi, lo;

    lo = 0;
    hi = chunk->free_ranges_count;

    while (lo < hi)
    {
        index = lo + (hi - lo) / 2;
        range = &chunk->free_ranges[index];

        if (range->offset > offset)
            hi = index;
        else
            lo = index + 1;
    }

    return lo;
}

static void vkd3d_memory_chunk_free_range(struct vkd3d_memory_chunk *chunk,
        const struct vkd3d_memory_allocation *allocation)
{
    struct vkd3d_memory_free_range *range;
    bool adjacent_l, adjacent_r;
    size_t index;

    index = vkd3d_memory_chunk_find_range(chunk, allocation->offset);

    adjacent_l = false;
    adjacent_r = false;

    if (index > 0)
    {
        range = &chunk->free_ranges[index - 1];
        adjacent_l = range->offset + range->length == allocation->offset;
    }

    if (index < chunk->free_ranges_count)
    {
        range = &chunk->free_ranges[index];
        adjacent_r = range->offset == allocation->offset + allocation->resource.size;
    }

    if (adjacent_l)
    {
        range = &chunk->free_ranges[index - 1];
        range->length += allocation->resource.size;

        if (adjacent_r)
        {
            range->length += chunk->free_ranges[index].length;
            vkd3d_memory_chunk_remove_range(chunk, index);
        }
    }
    else if (adjacent_r)
    {
        range = &chunk->free_ranges[index];
        range->offset = allocation->offset;
        range->length += allocation->resource.size;
    }
    else
    {
        vkd3d_memory_chunk_insert_range(chunk, index,
                allocation->offset, allocation->resource.size);
    }
}

static bool vkd3d_memory_chunk_is_free(struct vkd3d_memory_chunk *chunk)
{
    return chunk->free_ranges_count == 1
            && chunk->free_ranges[0].length == chunk->allocation.resource.size;
}

static HRESULT vkd3d_memory_chunk_create(struct d3d12_device *device, struct vkd3d_memory_allocator *allocator,
        const struct vkd3d_allocate_memory_info *info, struct vkd3d_memory_chunk **chunk)
{
    struct vkd3d_memory_chunk *object;
    HRESULT hr;

    TRACE("device %p, allocator %p, info %p, chunk %p.\n", device, allocator, info, chunk);

    if (!(object = vkd3d_malloc(sizeof(*object))))
        return E_OUTOFMEMORY;

    memset(object, 0, sizeof(*object));

    if (FAILED(hr = vkd3d_memory_allocation_init(&object->allocation, device, allocator, info)))
    {
        vkd3d_free(object);
        return hr;
    }

    vkd3d_memory_chunk_insert_range(object, 0, 0, object->allocation.resource.size);

    *chunk = object;

    TRACE("Created chunk %p (allocation %p).\n", object, &object->allocation);
    return S_OK;
}

static void vkd3d_memory_chunk_destroy(struct vkd3d_memory_chunk *chunk,
        struct d3d12_device *device, struct vkd3d_memory_allocator *allocator)
{
    TRACE("chunk %p, device %p, allocator %p.\n", chunk, device, allocator);

    if (chunk->allocation.clear_semaphore_value)
        vkd3d_memory_allocator_wait_allocation(allocator, device, &chunk->allocation);

    vkd3d_memory_allocation_free(&chunk->allocation, device, allocator);
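
    /* The free range array is owned by the chunk and is released together with it. */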
    vkd3d_free(chunk->free_ranges);
    vkd3d_free(chunk);
}

static void vkd3d_memory_allocator_remove_chunk(struct vkd3d_memory_allocator *allocator,
        struct d3d12_device *device, struct vkd3d_memory_chunk *chunk)
{
    size_t i;

    for (i = 0; i < allocator->chunks_count; i++)
    {
        if (allocator->chunks[i] == chunk)
        {
            allocator->chunks[i] = allocator->chunks[--allocator->chunks_count];
            break;
        }
    }

    vkd3d_memory_chunk_destroy(chunk, device, allocator);
}

static void vkd3d_memory_allocator_cleanup_clear_queue(struct vkd3d_memory_allocator *allocator,
        struct d3d12_device *device)
{
    struct vkd3d_memory_clear_queue *clear_queue = &allocator->clear_queue;
    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;

    VK_CALL(vkDestroyCommandPool(device->vk_device, clear_queue->vk_command_pool, NULL));
    VK_CALL(vkDestroySemaphore(device->vk_device, clear_queue->vk_semaphore, NULL));

    vkd3d_free(clear_queue->allocations);
    pthread_mutex_destroy(&clear_queue->mutex);
}

static HRESULT vkd3d_memory_allocator_init_clear_queue(struct vkd3d_memory_allocator *allocator,
        struct d3d12_device *device)
{
    struct vkd3d_memory_clear_queue *clear_queue = &allocator->clear_queue;
    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
    VkSemaphoreTypeCreateInfoKHR semaphore_type_info;
    VkCommandBufferAllocateInfo command_buffer_info;
    VkCommandPoolCreateInfo command_pool_info;
    VkSemaphoreCreateInfo semaphore_info;
    VkResult vr;
    HRESULT hr;
    int rc;

    /* vkd3d_memory_allocator_init will memset the entire
     * clear_queue struct to zero prior to calling this */
    clear_queue->last_known_value = VKD3D_MEMORY_CLEAR_COMMAND_BUFFER_COUNT;
    clear_queue->next_signal_value = VKD3D_MEMORY_CLEAR_COMMAND_BUFFER_COUNT + 1;

    if ((rc = pthread_mutex_init(&clear_queue->mutex, NULL)))
        return hresult_from_errno(rc);

    command_pool_info.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
    command_pool_info.pNext = NULL;
    command_pool_info.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
    command_pool_info.queueFamilyIndex = device->queue_families[VKD3D_QUEUE_FAMILY_INTERNAL_COMPUTE]->vk_family_index;

    if ((vr = VK_CALL(vkCreateCommandPool(device->vk_device, &command_pool_info,
            NULL, &clear_queue->vk_command_pool))) < 0)
    {
        ERR("Failed to create command pool, vr %d.\n", vr);
        hr = hresult_from_vk_result(vr);
        goto fail;
    }

    command_buffer_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
    command_buffer_info.pNext = NULL;
    command_buffer_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
    command_buffer_info.commandPool = clear_queue->vk_command_pool;
    command_buffer_info.commandBufferCount = VKD3D_MEMORY_CLEAR_COMMAND_BUFFER_COUNT;

    if ((vr = VK_CALL(vkAllocateCommandBuffers(device->vk_device, &command_buffer_info,
            clear_queue->vk_command_buffers))) < 0)
    {
        ERR("Failed to allocate command buffer, vr %d.\n", vr);
        hr = hresult_from_vk_result(vr);
        goto fail;
    }

    semaphore_type_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR;
    semaphore_type_info.pNext = NULL;
    semaphore_type_info.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR;
    semaphore_type_info.initialValue = VKD3D_MEMORY_CLEAR_COMMAND_BUFFER_COUNT;

    semaphore_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
    semaphore_info.pNext = &semaphore_type_info;
    semaphore_info.flags = 0;

    if ((vr = VK_CALL(vkCreateSemaphore(device->vk_device, &semaphore_info,
            NULL, &clear_queue->vk_semaphore))) < 0)
    {
        ERR("Failed to create semaphore, vr %d.\n", vr);
        hr = hresult_from_vk_result(vr);
        goto fail;
    }

    return S_OK;

fail:
    vkd3d_memory_allocator_cleanup_clear_queue(allocator, device);
    return hr;
}

HRESULT vkd3d_memory_allocator_init(struct vkd3d_memory_allocator *allocator, struct d3d12_device *device)
{
    HRESULT hr;
    int rc;

    memset(allocator, 0, sizeof(*allocator));

    if ((rc = pthread_mutex_init(&allocator->mutex, NULL)))
        return hresult_from_errno(rc);

    if (FAILED(hr = vkd3d_memory_allocator_init_clear_queue(allocator, device)))
    {
        pthread_mutex_destroy(&allocator->mutex);
        return hr;
    }

    vkd3d_va_map_init(&allocator->va_map);

    allocator->vkd3d_queue = d3d12_device_allocate_vkd3d_queue(device,
            device->queue_families[VKD3D_QUEUE_FAMILY_INTERNAL_COMPUTE]);
    return S_OK;
}

void vkd3d_memory_allocator_cleanup(struct vkd3d_memory_allocator *allocator, struct d3d12_device *device)
{
    size_t i;

    for (i = 0; i < allocator->chunks_count; i++)
        vkd3d_memory_chunk_destroy(allocator->chunks[i], device, allocator);

    vkd3d_free(allocator->chunks);
    vkd3d_va_map_cleanup(&allocator->va_map);

    vkd3d_memory_allocator_cleanup_clear_queue(allocator, device);
    pthread_mutex_destroy(&allocator->mutex);
}

static bool vkd3d_memory_allocator_wait_clear_semaphore(struct vkd3d_memory_allocator *allocator,
        struct d3d12_device *device, uint64_t wait_value, uint64_t timeout)
{
    struct vkd3d_memory_clear_queue *clear_queue = &allocator->clear_queue;
    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
    VkSemaphoreWaitInfo wait_info;
    uint64_t old_value, new_value;
    VkResult vr;

    old_value = vkd3d_atomic_uint64_load_explicit(&clear_queue->last_known_value, vkd3d_memory_order_acquire);

    if (old_value >= wait_value)
        return true;

    if (timeout)
    {
        wait_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR;
        wait_info.pNext = NULL;
        wait_info.flags = 0;
        wait_info.semaphoreCount = 1;
        wait_info.pSemaphores = &clear_queue->vk_semaphore;
        wait_info.pValues = &wait_value;

        vr = VK_CALL(vkWaitSemaphoresKHR(device->vk_device, &wait_info, timeout));
        new_value = wait_value;
    }
    else
    {
        vr = VK_CALL(vkGetSemaphoreCounterValueKHR(device->vk_device,
                clear_queue->vk_semaphore, &new_value));
    }

    if (vr < 0)
    {
        ERR("Failed to wait for timeline semaphore, vr %d.\n", vr);
        return false;
    }

    while (new_value > old_value)
    {
        uint64_t cur_value = vkd3d_atomic_uint64_compare_exchange(&clear_queue->last_known_value,
                old_value, new_value, vkd3d_memory_order_release, vkd3d_memory_order_acquire);

        if (cur_value == old_value)
            break;

        old_value = cur_value;
    }

    return new_value >= wait_value;
}

static HRESULT vkd3d_memory_allocator_flush_clears_locked(struct vkd3d_memory_allocator *allocator,
        struct d3d12_device *device)
{
    const VkPipelineStageFlags vk_stage_mask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
    struct vkd3d_memory_clear_queue *clear_queue = &allocator->clear_queue;
    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
    VkTimelineSemaphoreSubmitInfoKHR timeline_info;
    struct vkd3d_queue_family_info *queue_family;
    VkCommandBufferBeginInfo begin_info;
    uint32_t queue_mask, queue_index;
    VkCommandBuffer vk_cmd_buffer;
    VkSubmitInfo submit_info;
    VkQueue vk_queue;
    VkResult vr;
    size_t i;

    if (!clear_queue->allocations_count)
        return S_OK;

    /* Record commands late so that we can simply remove allocations from
     * the queue if they got freed before the clear commands got dispatched,
     * rather than rewriting the command buffer or dispatching the clear */
    vk_cmd_buffer = clear_queue->vk_command_buffers[clear_queue->command_buffer_index];

    if (vkd3d_config_flags & VKD3D_CONFIG_FLAG_LOG_MEMORY_BUDGET)
    {
        INFO("Submitting clear command list.\n");
        for (i = 0; i < clear_queue->allocations_count; i++)
        {
            INFO("Clearing allocation %zu: %"PRIu64".\n", i,
                    clear_queue->allocations[i]->resource.size);
        }
    }

    vkd3d_memory_allocator_wait_clear_semaphore(allocator, device,
            clear_queue->next_signal_value - VKD3D_MEMORY_CLEAR_COMMAND_BUFFER_COUNT, UINT64_MAX);

    if ((vr = VK_CALL(vkResetCommandBuffer(vk_cmd_buffer, 0))))
    {
        ERR("Failed to reset command buffer, vr %d.\n", vr);
        return hresult_from_vk_result(vr);
    }

    begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
    begin_info.pNext = NULL;
    begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
    begin_info.pInheritanceInfo = NULL;

    if ((vr = VK_CALL(vkBeginCommandBuffer(vk_cmd_buffer, &begin_info))) < 0)
    {
        ERR("Failed to begin command buffer, vr %d.\n", vr);
        return hresult_from_vk_result(vr);
    }

    for (i = 0; i < clear_queue->allocations_count; i++)
    {
        const struct vkd3d_memory_allocation *allocation = clear_queue->allocations[i];

        VK_CALL(vkCmdFillBuffer(vk_cmd_buffer, allocation->resource.vk_buffer,
                allocation->offset, allocation->resource.size, 0));
    }

    if ((vr = VK_CALL(vkEndCommandBuffer(vk_cmd_buffer))) < 0)
    {
        ERR("Failed to end command buffer, vr %d.\n", vr);
        return hresult_from_vk_result(vr);
    }

    if (!(vk_queue = vkd3d_queue_acquire(allocator->vkd3d_queue)))
        return E_FAIL;

    memset(&timeline_info, 0, sizeof(timeline_info));
    timeline_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR;
    timeline_info.signalSemaphoreValueCount = 1;
    timeline_info.pSignalSemaphoreValues = &clear_queue->next_signal_value;

    memset(&submit_info, 0, sizeof(submit_info));
    submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    submit_info.pNext = &timeline_info;
    submit_info.commandBufferCount = 1;
    submit_info.pCommandBuffers = &vk_cmd_buffer;
    submit_info.signalSemaphoreCount = 1;
    submit_info.pSignalSemaphores = &clear_queue->vk_semaphore;

    vr = VK_CALL(vkQueueSubmit(vk_queue, 1, &submit_info, VK_NULL_HANDLE));
    vkd3d_queue_release(allocator->vkd3d_queue);

    VKD3D_DEVICE_REPORT_BREADCRUMB_IF(device, vr == VK_ERROR_DEVICE_LOST);

    if (vr < 0)
    {
        ERR("Failed to submit command buffer, vr %d.\n", vr);
        return hresult_from_vk_result(vr);
    }

    /* Stall future submissions on other queues until the clear has finished */
    memset(&timeline_info, 0, sizeof(timeline_info));
    timeline_info.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR;
    timeline_info.waitSemaphoreValueCount = 1;
    timeline_info.pWaitSemaphoreValues = &clear_queue->next_signal_value;

    memset(&submit_info, 0, sizeof(submit_info));
    submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
    submit_info.pNext = &timeline_info;
    submit_info.waitSemaphoreCount = 1;
    submit_info.pWaitSemaphores = &clear_queue->vk_semaphore;
    submit_info.pWaitDstStageMask = &vk_stage_mask;

    queue_mask = device->unique_queue_mask;

    while (queue_mask)
    {
        queue_index = vkd3d_bitmask_iter32(&queue_mask);
        queue_family = device->queue_families[queue_index];

        for (i = 0; i < queue_family->queue_count; i++)
        {
            vkd3d_queue_add_wait(queue_family->queues[i], NULL,
                    clear_queue->vk_semaphore, clear_queue->next_signal_value);
        }
    }

    /* Keep next_signal always one ahead of the last signaled value */
    clear_queue->next_signal_value += 1;
    clear_queue->num_bytes_pending = 0;
    clear_queue->allocations_count = 0;
    clear_queue->command_buffer_index += 1;
    clear_queue->command_buffer_index %= VKD3D_MEMORY_CLEAR_COMMAND_BUFFER_COUNT;
    return S_OK;
}

HRESULT vkd3d_memory_allocator_flush_clears(struct vkd3d_memory_allocator *allocator,
        struct d3d12_device *device)
{
    struct vkd3d_memory_clear_queue *clear_queue = &allocator->clear_queue;
    HRESULT hr;

    pthread_mutex_lock(&clear_queue->mutex);
    hr = vkd3d_memory_allocator_flush_clears_locked(allocator, device);
    pthread_mutex_unlock(&clear_queue->mutex);
    return hr;
}

#define VKD3D_MEMORY_CLEAR_QUEUE_MAX_PENDING_BYTES (256ull << 20) /* 256 MiB */

static void vkd3d_memory_allocator_clear_allocation(struct vkd3d_memory_allocator *allocator,
        struct d3d12_device *device, struct vkd3d_memory_allocation *allocation)
{
    struct vkd3d_memory_clear_queue *clear_queue = &allocator->clear_queue;

    if (allocation->cpu_address)
    {
        /* Probably faster than doing this on the GPU
         * and having to worry about synchronization */
        memset(allocation->cpu_address, 0, allocation->resource.size);
    }
    else if (allocation->resource.vk_buffer)
    {
        pthread_mutex_lock(&clear_queue->mutex);

        if (!vkd3d_array_reserve((void**)&clear_queue->allocations, &clear_queue->allocations_size,
                clear_queue->allocations_count + 1, sizeof(*clear_queue->allocations)))
        {
            ERR("Failed to add allocation to clear queue.\n");
            pthread_mutex_unlock(&clear_queue->mutex);
            return;
        }

        allocation->clear_semaphore_value = clear_queue->next_signal_value;

        if (allocation->chunk)
            allocation->chunk->allocation.clear_semaphore_value = clear_queue->next_signal_value;

        clear_queue->allocations[clear_queue->allocations_count++] = allocation;
        clear_queue->num_bytes_pending += allocation->resource.size;

        if (clear_queue->num_bytes_pending >= VKD3D_MEMORY_CLEAR_QUEUE_MAX_PENDING_BYTES)
            vkd3d_memory_allocator_flush_clears_locked(allocator, device);

        pthread_mutex_unlock(&clear_queue->mutex);
    }
}

static void vkd3d_memory_allocator_wait_allocation(struct vkd3d_memory_allocator *allocator,
        struct d3d12_device *device, const struct vkd3d_memory_allocation *allocation)
{
    struct vkd3d_memory_clear_queue *clear_queue = &allocator->clear_queue;
    uint64_t wait_value = allocation->clear_semaphore_value;
    size_t i;

    /* If the clear semaphore has been signaled to the expected value,
     * the GPU is already done clearing the allocation, and it cannot
     * be in the clear queue either, so there is nothing to do. */
    if (vkd3d_memory_allocator_wait_clear_semaphore(allocator, device, wait_value, 0))
        return;

    /* If the allocation is still in the queue, the GPU has not started
     * using it yet so we can remove it from the queue and exit. */
    pthread_mutex_lock(&clear_queue->mutex);

    for (i = 0; i < clear_queue->allocations_count; i++)
    {
        if (clear_queue->allocations[i] == allocation)
        {
            clear_queue->allocations[i] = clear_queue->allocations[--clear_queue->allocations_count];
            clear_queue->num_bytes_pending -= allocation->resource.size;
            pthread_mutex_unlock(&clear_queue->mutex);
            return;
        }
    }

    /* If this is a chunk and a suballocation from it had been immediately
     * freed, it is possible that the suballocation got removed from the
     * clear queue so that the chunk's wait value never gets signaled. Wait
     * for the last signaled value in that case. */
    if (wait_value == clear_queue->next_signal_value)
        wait_value = clear_queue->next_signal_value - 1;

    pthread_mutex_unlock(&clear_queue->mutex);

    /* If this allocation was suballocated from a chunk, we will wait
     * on the semaphore when the parent chunk itself gets destroyed. */
    if (allocation->chunk)
        return;

    /* Otherwise, we actually have to wait for the GPU.
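     * This should be rare: it only happens when a standalone allocation is
     * freed while its initial clear is still in flight.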
     */
    WARN("Waiting for GPU to clear allocation %p.\n", allocation);
    vkd3d_memory_allocator_wait_clear_semaphore(allocator, device, wait_value, UINT64_MAX);
}

static HRESULT vkd3d_memory_allocator_try_add_chunk(struct vkd3d_memory_allocator *allocator,
        struct d3d12_device *device, const D3D12_HEAP_PROPERTIES *heap_properties, D3D12_HEAP_FLAGS heap_flags,
        uint32_t type_mask, VkMemoryPropertyFlags optional_properties, struct vkd3d_memory_chunk **chunk)
{
    struct vkd3d_allocate_memory_info alloc_info;
    struct vkd3d_memory_chunk *object;
    HRESULT hr;

    memset(&alloc_info, 0, sizeof(alloc_info));
    alloc_info.memory_requirements.size = VKD3D_MEMORY_CHUNK_SIZE;
    alloc_info.memory_requirements.alignment = 0;
    alloc_info.memory_requirements.memoryTypeBits = type_mask;
    alloc_info.heap_properties = *heap_properties;
    alloc_info.heap_flags = heap_flags;
    alloc_info.flags = VKD3D_ALLOCATION_FLAG_NO_FALLBACK;
    alloc_info.optional_memory_properties = optional_properties;

    if (!(heap_flags & D3D12_HEAP_FLAG_DENY_BUFFERS))
        alloc_info.flags |= VKD3D_ALLOCATION_FLAG_GLOBAL_BUFFER;

    if (!vkd3d_array_reserve((void**)&allocator->chunks, &allocator->chunks_size,
            allocator->chunks_count + 1, sizeof(*allocator->chunks)))
    {
        ERR("Failed to allocate space for new chunk.\n");
        return E_OUTOFMEMORY;
    }

    if (FAILED(hr = vkd3d_memory_chunk_create(device, allocator, &alloc_info, &object)))
        return hr;

    allocator->chunks[allocator->chunks_count++] = *chunk = object;
    return S_OK;
}

static HRESULT vkd3d_memory_allocator_try_suballocate_memory(struct vkd3d_memory_allocator *allocator,
        struct d3d12_device *device, const VkMemoryRequirements *memory_requirements, uint32_t type_mask,
        VkMemoryPropertyFlags optional_properties, const D3D12_HEAP_PROPERTIES *heap_properties,
        D3D12_HEAP_FLAGS heap_flags, struct vkd3d_memory_allocation *allocation)
{
    const D3D12_HEAP_FLAGS heap_flag_mask = ~(D3D12_HEAP_FLAG_CREATE_NOT_ZEROED | D3D12_HEAP_FLAG_CREATE_NOT_RESIDENT);
    struct vkd3d_memory_chunk *chunk;
    HRESULT hr;
    size_t i;

    type_mask &= device->memory_info.global_mask;
    type_mask &= memory_requirements->memoryTypeBits;

    for (i = 0; i < allocator->chunks_count; i++)
    {
        chunk = allocator->chunks[i];

        /* Match flags since otherwise the backing buffer
         * may not support our required usage flags */
        if (chunk->allocation.heap_type != heap_properties->Type ||
                chunk->allocation.heap_flags != (heap_flags & heap_flag_mask))
            continue;

        /* Filter out unsupported memory types */
        if (!(type_mask & (1u << chunk->allocation.device_allocation.vk_memory_type)))
            continue;

        if (SUCCEEDED(hr = vkd3d_memory_chunk_allocate_range(chunk, memory_requirements, allocation)))
            return hr;
    }

    /* Try allocating a new chunk on one of the supported memory types
     * before the caller falls back to potentially slower memory */
    if (FAILED(hr = vkd3d_memory_allocator_try_add_chunk(allocator, device, heap_properties,
            heap_flags & heap_flag_mask, type_mask, optional_properties, &chunk)))
        return hr;

    return vkd3d_memory_chunk_allocate_range(chunk, memory_requirements, allocation);
}

void vkd3d_free_memory(struct d3d12_device *device, struct vkd3d_memory_allocator *allocator,
        const struct vkd3d_memory_allocation *allocation)
{
    if (allocation->device_allocation.vk_memory == VK_NULL_HANDLE)
        return;

    if (allocation->clear_semaphore_value)
        vkd3d_memory_allocator_wait_allocation(allocator, device, allocation);

    if (allocation->chunk)
    {
        pthread_mutex_lock(&allocator->mutex);
        vkd3d_memory_chunk_free_range(allocation->chunk, allocation);

        if (vkd3d_memory_chunk_is_free(allocation->chunk))
            vkd3d_memory_allocator_remove_chunk(allocator, device, allocation->chunk);

        pthread_mutex_unlock(&allocator->mutex);
    }
    else
        vkd3d_memory_allocation_free(allocation, device, allocator);
}

static HRESULT vkd3d_suballocate_memory(struct d3d12_device *device, struct vkd3d_memory_allocator *allocator,
        const struct vkd3d_allocate_memory_info *info, struct vkd3d_memory_allocation *allocation)
{
    const VkMemoryPropertyFlags optional_flags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
    VkMemoryRequirements memory_requirements = info->memory_requirements;
    uint32_t required_mask, optional_mask;
    VkMemoryPropertyFlags type_flags;
    HRESULT hr;

    if (FAILED(hr = vkd3d_select_memory_flags(device, &info->heap_properties, &type_flags)))
        return hr;

    /* Prefer device-local memory if allowed for this allocation */
    required_mask = vkd3d_find_memory_types_with_flags(device, type_flags & ~optional_flags);
    optional_mask = vkd3d_find_memory_types_with_flags(device, type_flags);

    pthread_mutex_lock(&allocator->mutex);

    hr = vkd3d_memory_allocator_try_suballocate_memory(allocator, device,
            &memory_requirements, optional_mask, 0,
            &info->heap_properties, info->heap_flags, allocation);

    if (FAILED(hr) && (required_mask & ~optional_mask))
    {
        hr = vkd3d_memory_allocator_try_suballocate_memory(allocator, device,
                &memory_requirements, required_mask & ~optional_mask, optional_flags,
                &info->heap_properties, info->heap_flags, allocation);
    }

    pthread_mutex_unlock(&allocator->mutex);
    return hr;
}

static inline bool vkd3d_driver_implicitly_clears(VkDriverId driver_id)
{
    switch (driver_id)
    {
        /* Known to pass test_stress_suballocation which hits this path. */
        case VK_DRIVER_ID_MESA_RADV:
        case VK_DRIVER_ID_NVIDIA_PROPRIETARY:
        case VK_DRIVER_ID_INTEL_OPEN_SOURCE_MESA:
            return true;

        default:
            return false;
    }
}

HRESULT vkd3d_allocate_memory(struct d3d12_device *device, struct vkd3d_memory_allocator *allocator,
        const struct vkd3d_allocate_memory_info *info, struct vkd3d_memory_allocation *allocation)
{
    bool implementation_implicitly_clears;
    bool needs_clear;
    bool suballocate;
    HRESULT hr;

    suballocate = !info->pNext && !info->host_ptr &&
            info->memory_requirements.size < VKD3D_VA_BLOCK_SIZE &&
            !(info->heap_flags & (D3D12_HEAP_FLAG_DENY_BUFFERS | D3D12_HEAP_FLAG_ALLOW_WRITE_WATCH)) &&
            !(info->flags & VKD3D_ALLOCATION_FLAG_INTERNAL_SCRATCH);

    if (suballocate)
        hr = vkd3d_suballocate_memory(device, allocator, info, allocation);
    else
        hr = vkd3d_memory_allocation_init(allocation, device, allocator, info);

    if (FAILED(hr))
        return hr;

    /* If we're allocating Vulkan memory directly,
     * we can rely on the driver zeroing it for us.
     * This is relying on implementation details.
     * RADV definitely does this, and it seems like NV also does it.
     * TODO: an extension for this would be nice.
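     * Suballocations are recycled from live chunks and therefore always
     * have to be cleared manually, regardless of driver behaviour.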
     */
    implementation_implicitly_clears =
            vkd3d_driver_implicitly_clears(device->device_info.driver_properties.driverID) &&
            !suballocate;

    needs_clear = !implementation_implicitly_clears &&
            !(info->heap_flags & D3D12_HEAP_FLAG_CREATE_NOT_ZEROED) &&
            !(vkd3d_config_flags & VKD3D_CONFIG_FLAG_MEMORY_ALLOCATOR_SKIP_CLEAR);

    if (needs_clear)
        vkd3d_memory_allocator_clear_allocation(allocator, device, allocation);

    return hr;
}

static bool vkd3d_heap_allocation_accept_deferred_resource_placements(struct d3d12_device *device,
        const D3D12_HEAP_PROPERTIES *heap_properties, D3D12_HEAP_FLAGS heap_flags)
{
    uint32_t type_mask;

    /* Normally, if a memory allocation fails, we consider it an error, but there are some exceptions
     * where we can defer memory allocation, like CreateHeap where a fallback system memory type is not available.
     * In this case, we will defer memory allocation until CreatePlacedResource() time, and we should
     * accept that a memory allocation failed. */

    /* Only accept deferrals for DEFAULT / CPU_NOT_AVAILABLE heaps.
     * If we're going for host memory, we have nowhere left to fall back to either way. */
    if (is_cpu_accessible_heap(heap_properties))
        return false;

    type_mask = vkd3d_select_memory_types(device, heap_properties, heap_flags);
    return device->memory_properties.memoryHeapCount > 1 &&
            !vkd3d_memory_info_type_mask_covers_multiple_memory_heaps(&device->memory_properties, type_mask);
}

HRESULT vkd3d_allocate_heap_memory(struct d3d12_device *device, struct vkd3d_memory_allocator *allocator,
        const struct vkd3d_allocate_heap_memory_info *info, struct vkd3d_memory_allocation *allocation)
{
    struct vkd3d_allocate_heap_memory_info heap_info;
    struct vkd3d_allocate_memory_info alloc_info;
    HRESULT hr;

    if (is_cpu_accessible_heap(&info->heap_desc.Properties))
    {
        if (info->heap_desc.Flags & D3D12_HEAP_FLAG_DENY_BUFFERS)
        {
            /* If the heap was only designed to handle images, the heap is useless,
             * and we can force everything to go through the committed path. */
            memset(allocation, 0, sizeof(*allocation));
            return S_OK;
        }
        else
        {
            /* CPU visible textures are never placed on a heap directly,
             * since LINEAR images have alignment / size requirements
             * that are vastly different from OPTIMAL ones.
             * We can place buffers, however. Rewrite the heap flags up front
             * so the buffer-only restriction also reaches alloc_info below. */
            heap_info = *info;
            info = &heap_info;
            heap_info.heap_desc.Flags |= D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS;
        }
    }

    memset(&alloc_info, 0, sizeof(alloc_info));
    alloc_info.memory_requirements.memoryTypeBits = ~0u;
    alloc_info.memory_requirements.alignment = info->heap_desc.Alignment;
    alloc_info.memory_requirements.size = info->heap_desc.SizeInBytes;
    alloc_info.heap_properties = info->heap_desc.Properties;
    alloc_info.heap_flags = info->heap_desc.Flags;
    alloc_info.host_ptr = info->host_ptr;
    alloc_info.flags |= info->extra_allocation_flags;

    if (!(info->heap_desc.Flags & D3D12_HEAP_FLAG_DENY_BUFFERS))
        alloc_info.flags |= VKD3D_ALLOCATION_FLAG_GLOBAL_BUFFER;

    hr = vkd3d_allocate_memory(device, allocator, &alloc_info, allocation);

    if (hr == E_OUTOFMEMORY && vkd3d_heap_allocation_accept_deferred_resource_placements(device,
            &info->heap_desc.Properties, info->heap_desc.Flags))
    {
        /* It's okay and sometimes expected that we fail here.
         * Defer allocation until CreatePlacedResource().
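         * A zeroed allocation marks the heap as deferred; vkd3d_free_device_memory()
         * recognizes the VK_NULL_HANDLE and skips the free.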
         */
        memset(allocation, 0, sizeof(*allocation));
        hr = S_OK;
    }

    return hr;
}

HRESULT vkd3d_allocate_buffer_memory(struct d3d12_device *device, VkBuffer vk_buffer,
        VkMemoryPropertyFlags type_flags, struct vkd3d_device_memory_allocation *allocation)
{
    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
    VkMemoryRequirements memory_requirements;
    VkMemoryAllocateFlagsInfo flags_info;
    VkBindBufferMemoryInfo bind_info;
    VkResult vr;
    HRESULT hr;

    flags_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO;
    flags_info.pNext = NULL;
    flags_info.flags = 0;

    if (device->device_info.buffer_device_address_features.bufferDeviceAddress)
        flags_info.flags |= VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR;

    VK_CALL(vkGetBufferMemoryRequirements(device->vk_device, vk_buffer, &memory_requirements));

    if (FAILED(hr = vkd3d_allocate_device_memory(device, memory_requirements.size,
            type_flags, memory_requirements.memoryTypeBits, &flags_info, allocation)))
        return hr;

    bind_info.sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO;
    bind_info.pNext = NULL;
    bind_info.buffer = vk_buffer;
    bind_info.memory = allocation->vk_memory;
    bind_info.memoryOffset = 0;

    if ((vr = VK_CALL(vkBindBufferMemory2KHR(device->vk_device, 1, &bind_info))) < 0)
        return hresult_from_vk_result(vr);

    return hr;
}

HRESULT vkd3d_allocate_image_memory(struct d3d12_device *device, VkImage vk_image,
        VkMemoryPropertyFlags type_flags, struct vkd3d_device_memory_allocation *allocation)
{
    const struct vkd3d_vk_device_procs *vk_procs = &device->vk_procs;
    VkMemoryRequirements memory_requirements;
    VkBindImageMemoryInfo bind_info;
    VkResult vr;
    HRESULT hr;

    VK_CALL(vkGetImageMemoryRequirements(device->vk_device, vk_image, &memory_requirements));

    if (FAILED(hr = vkd3d_allocate_device_memory(device, memory_requirements.size,
            type_flags, memory_requirements.memoryTypeBits, NULL, allocation)))
        return hr;

    bind_info.sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO;
    bind_info.pNext = NULL;
    bind_info.image = vk_image;
    bind_info.memory = allocation->vk_memory;
    bind_info.memoryOffset = 0;

    if ((vr = VK_CALL(vkBindImageMemory2KHR(device->vk_device, 1, &bind_info))) < 0)
        return hresult_from_vk_result(vr);

    return hr;
}