radv: Keep a global BO list for VkMemory.

With update-after-bind we can no longer attach BOs to the command buffer
from the descriptor set, so we have to keep a global BO list instead.

I am somewhat surprised this works as well as it does: the WSI relies on
implicit synchronization derived from the BO list associations, and with
the new behavior every command buffer is associated with every swapchain
image. Still, I could not find any slowdowns in games because of it.

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Bas Nieuwenhuizen 2018-04-09 12:46:49 +02:00
parent 22d6b89e39
commit 4b13fe55a4
4 changed files with 149 additions and 42 deletions
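In outline, the patch tracks every VkDeviceMemory BO in a device-global, mutex-protected array and hands that array to the winsys on every submission. The following stand-alone sketch illustrates the technique used (geometric growth, removal by swapping with the last element, reading the list under the lock while "submitting"); all names here are illustrative and are not the radv identifiers used in the diff below.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct bo { int id; };            /* stand-in for struct radeon_winsys_bo */

struct bo_list {
	pthread_mutex_t mutex;
	struct bo **bos;
	unsigned count, capacity;
};

static void bo_list_init(struct bo_list *l)
{
	pthread_mutex_init(&l->mutex, NULL);
	l->bos = NULL;
	l->count = l->capacity = 0;
}

/* Grow geometrically so repeated vkAllocateMemory calls stay amortized O(1). */
static int bo_list_add(struct bo_list *l, struct bo *bo)
{
	int ret = 0;
	pthread_mutex_lock(&l->mutex);
	if (l->count == l->capacity) {
		unsigned capacity = l->capacity ? l->capacity * 2 : 4;
		void *data = realloc(l->bos, capacity * sizeof(*l->bos));
		if (!data) {
			ret = -1; /* radv maps this to VK_ERROR_OUT_OF_HOST_MEMORY */
		} else {
			l->bos = data;
			l->capacity = capacity;
		}
	}
	if (ret == 0)
		l->bos[l->count++] = bo;
	pthread_mutex_unlock(&l->mutex);
	return ret;
}

/* Order is irrelevant, so removal swaps the entry with the last one. */
static void bo_list_remove(struct bo_list *l, struct bo *bo)
{
	pthread_mutex_lock(&l->mutex);
	for (unsigned i = 0; i < l->count; ++i) {
		if (l->bos[i] == bo) {
			l->bos[i] = l->bos[--l->count];
			break;
		}
	}
	pthread_mutex_unlock(&l->mutex);
}

/* The driver holds the mutex across cs_submit() so the kernel BO list it
 * builds covers every VkDeviceMemory allocation alive at submit time. */
static void submit(struct bo_list *l)
{
	pthread_mutex_lock(&l->mutex);
	printf("submitting with %u resident BOs\n", l->count);
	pthread_mutex_unlock(&l->mutex);
}

int main(void)
{
	struct bo a = {1}, b = {2};
	struct bo_list list;

	bo_list_init(&list);
	bo_list_add(&list, &a);    /* vkAllocateMemory path */
	bo_list_add(&list, &b);
	submit(&list);             /* vkQueueSubmit path */
	bo_list_remove(&list, &a); /* vkFreeMemory path */
	submit(&list);
	return 0;
}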


@ -1220,6 +1220,55 @@ radv_queue_finish(struct radv_queue *queue)
queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
}
static void
radv_bo_list_init(struct radv_bo_list *bo_list)
{
pthread_mutex_init(&bo_list->mutex, NULL);
bo_list->list.count = bo_list->capacity = 0;
bo_list->list.bos = NULL;
}
static void
radv_bo_list_finish(struct radv_bo_list *bo_list)
{
free(bo_list->list.bos);
pthread_mutex_destroy(&bo_list->mutex);
}
static VkResult radv_bo_list_add(struct radv_bo_list *bo_list, struct radeon_winsys_bo *bo)
{
pthread_mutex_lock(&bo_list->mutex);
if (bo_list->list.count == bo_list->capacity) {
unsigned capacity = MAX2(4, bo_list->capacity * 2);
void *data = realloc(bo_list->list.bos, capacity * sizeof(struct radeon_winsys_bo*));
if (!data) {
pthread_mutex_unlock(&bo_list->mutex);
return VK_ERROR_OUT_OF_HOST_MEMORY;
}
bo_list->list.bos = (struct radeon_winsys_bo**)data;
bo_list->capacity = capacity;
}
bo_list->list.bos[bo_list->list.count++] = bo;
pthread_mutex_unlock(&bo_list->mutex);
return VK_SUCCESS;
}
static void radv_bo_list_remove(struct radv_bo_list *bo_list, struct radeon_winsys_bo *bo)
{
pthread_mutex_lock(&bo_list->mutex);
for(unsigned i = 0; i < bo_list->list.count; ++i) {
if (bo_list->list.bos[i] == bo) {
bo_list->list.bos[i] = bo_list->list.bos[bo_list->list.count - 1];
--bo_list->list.count;
break;
}
}
pthread_mutex_unlock(&bo_list->mutex);
}
static void
radv_device_init_gs_info(struct radv_device *device)
{
@ -1320,6 +1369,8 @@ VkResult radv_CreateDevice(
mtx_init(&device->shader_slab_mutex, mtx_plain);
list_inithead(&device->shader_slabs);
radv_bo_list_init(&device->bo_list);
for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i];
uint32_t qfi = queue_create->queueFamilyIndex;
@ -1452,6 +1503,8 @@ VkResult radv_CreateDevice(
fail_meta:
radv_device_finish_meta(device);
fail:
radv_bo_list_finish(&device->bo_list);
if (device->trace_bo)
device->ws->buffer_destroy(device->trace_bo);
@ -1499,6 +1552,7 @@ void radv_DestroyDevice(
radv_destroy_shader_slabs(device);
radv_bo_list_finish(&device->bo_list);
vk_free(&device->alloc, device);
}
@ -2269,7 +2323,7 @@ static VkResult radv_signal_fence(struct radv_queue *queue,
ret = queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
&queue->device->empty_cs[queue->queue_family_index],
- 1, NULL, NULL, &sem_info,
+ 1, NULL, NULL, &sem_info, NULL,
false, fence->fence);
radv_free_sem_info(&sem_info);
@ -2346,7 +2400,7 @@ VkResult radv_QueueSubmit(
ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
&queue->device->empty_cs[queue->queue_family_index],
1, NULL, NULL,
- &sem_info,
+ &sem_info, NULL,
false, base_fence);
if (ret) {
radv_loge("failed to submit CS %d\n", i);
@ -2384,11 +2438,15 @@ VkResult radv_QueueSubmit(
sem_info.cs_emit_wait = j == 0;
sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount;
+ pthread_mutex_lock(&queue->device->bo_list.mutex);
ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
advance, initial_preamble, continue_preamble_cs,
- &sem_info,
+ &sem_info, &queue->device->bo_list.list,
can_patch, base_fence);
+ pthread_mutex_unlock(&queue->device->bo_list.mutex);
if (ret) {
radv_loge("failed to submit CS %d\n", i);
abort();
@ -2594,11 +2652,8 @@ static VkResult radv_alloc_memory(struct radv_device *device,
goto fail;
} else {
close(import_info->fd);
- goto out_success;
}
- }
- if (host_ptr_info) {
+ } else if (host_ptr_info) {
assert(host_ptr_info->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT);
assert(mem_type_index == RADV_MEM_TYPE_GTT_CACHED);
mem->bo = device->ws->buffer_from_ptr(device->ws, host_ptr_info->pHostPointer,
@ -2608,41 +2663,46 @@ static VkResult radv_alloc_memory(struct radv_device *device,
goto fail;
} else {
mem->user_ptr = host_ptr_info->pHostPointer;
- goto out_success;
}
+ } else {
+ uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096);
+ if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE ||
+ mem_type_index == RADV_MEM_TYPE_GTT_CACHED)
+ domain = RADEON_DOMAIN_GTT;
+ else
+ domain = RADEON_DOMAIN_VRAM;
+ if (mem_type_index == RADV_MEM_TYPE_VRAM)
+ flags |= RADEON_FLAG_NO_CPU_ACCESS;
+ else
+ flags |= RADEON_FLAG_CPU_ACCESS;
+ if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
+ flags |= RADEON_FLAG_GTT_WC;
+ if (!dedicate_info && !import_info && (!export_info || !export_info->handleTypes))
+ flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
+ mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment,
+ domain, flags);
+ if (!mem->bo) {
+ result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
+ goto fail;
+ }
+ mem->type_index = mem_type_index;
+ }
- uint64_t alloc_size = align_u64(pAllocateInfo->allocationSize, 4096);
- if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE ||
- mem_type_index == RADV_MEM_TYPE_GTT_CACHED)
- domain = RADEON_DOMAIN_GTT;
- else
- domain = RADEON_DOMAIN_VRAM;
+ result = radv_bo_list_add(&device->bo_list, mem->bo);
+ if (result != VK_SUCCESS)
+ goto fail_bo;
- if (mem_type_index == RADV_MEM_TYPE_VRAM)
- flags |= RADEON_FLAG_NO_CPU_ACCESS;
- else
- flags |= RADEON_FLAG_CPU_ACCESS;
- if (mem_type_index == RADV_MEM_TYPE_GTT_WRITE_COMBINE)
- flags |= RADEON_FLAG_GTT_WC;
- if (!dedicate_info && !import_info && (!export_info || !export_info->handleTypes))
- flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
- mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment,
- domain, flags);
- if (!mem->bo) {
- result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
- goto fail;
- }
- mem->type_index = mem_type_index;
- out_success:
*pMem = radv_device_memory_to_handle(mem);
return VK_SUCCESS;
+ fail_bo:
+ device->ws->buffer_destroy(mem->bo);
fail:
vk_free2(&device->alloc, pAllocator, mem);
@ -2670,6 +2730,7 @@ void radv_FreeMemory(
if (mem == NULL)
return;
radv_bo_list_remove(&device->bo_list, mem->bo);
device->ws->buffer_destroy(mem->bo);
mem->bo = NULL;
@ -2989,7 +3050,7 @@ radv_sparse_image_opaque_bind_memory(struct radv_device *device,
queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
&queue->device->empty_cs[queue->queue_family_index],
1, NULL, NULL,
- &sem_info,
+ &sem_info, NULL,
false, base_fence);
fence_emitted = true;
if (fence)


@ -598,6 +598,12 @@ struct radv_queue {
struct radeon_winsys_cs *continue_preamble_cs;
};
struct radv_bo_list {
struct radv_winsys_bo_list list;
unsigned capacity;
pthread_mutex_t mutex;
};
struct radv_device {
VK_LOADER_DATA _loader_data;
@ -660,6 +666,8 @@ struct radv_device {
uint64_t dmesg_timestamp;
struct radv_device_extension_table enabled_extensions;
struct radv_bo_list bo_list;
};
struct radv_device_memory {


@ -178,6 +178,11 @@ struct radv_winsys_sem_info {
struct radv_winsys_sem_counts signal;
};
struct radv_winsys_bo_list {
struct radeon_winsys_bo **bos;
unsigned count;
};
struct radeon_winsys {
void (*destroy)(struct radeon_winsys *ws);
@ -246,6 +251,7 @@ struct radeon_winsys {
struct radeon_winsys_cs *initial_preamble_cs,
struct radeon_winsys_cs *continue_preamble_cs,
struct radv_winsys_sem_info *sem_info,
const struct radv_winsys_bo_list *bo_list, /* optional */
bool can_patch,
struct radeon_winsys_fence *fence);


@ -552,6 +552,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
unsigned count,
struct radv_amdgpu_winsys_bo *extra_bo,
struct radeon_winsys_cs *extra_cs,
const struct radv_winsys_bo_list *radv_bo_list,
amdgpu_bo_list_handle *bo_list)
{
int r = 0;
@ -579,7 +580,7 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
bo_list);
free(handles);
pthread_mutex_unlock(&ws->global_bo_list_lock);
- } else if (count == 1 && !extra_bo && !extra_cs &&
+ } else if (count == 1 && !extra_bo && !extra_cs && !radv_bo_list &&
!radv_amdgpu_cs(cs_array[0])->num_virtual_buffers) {
struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[0];
if (cs->num_buffers == 0) {
@ -601,6 +602,11 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
if (extra_cs) {
total_buffer_count += ((struct radv_amdgpu_cs*)extra_cs)->num_buffers;
}
if (radv_bo_list) {
total_buffer_count += radv_bo_list->count;
}
if (total_buffer_count == 0) {
*bo_list = 0;
return 0;
@ -674,6 +680,27 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
}
}
if (radv_bo_list) {
unsigned unique_bo_so_far = unique_bo_count;
const unsigned default_bo_priority = 7;
for (unsigned i = 0; i < radv_bo_list->count; ++i) {
struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(radv_bo_list->bos[i]);
bool found = false;
for (unsigned j = 0; j < unique_bo_so_far; ++j) {
if (bo->bo == handles[j]) {
found = true;
priorities[j] = MAX2(priorities[j], default_bo_priority);
break;
}
}
if (!found) {
handles[unique_bo_count] = bo->bo;
priorities[unique_bo_count] = default_bo_priority;
++unique_bo_count;
}
}
}
if (unique_bo_count > 0) {
r = amdgpu_bo_list_create(ws->dev, unique_bo_count, handles,
priorities, bo_list);
@ -709,6 +736,7 @@ static void radv_assign_last_submit(struct radv_amdgpu_ctx *ctx,
static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
int queue_idx,
struct radv_winsys_sem_info *sem_info,
const struct radv_winsys_bo_list *radv_bo_list,
struct radeon_winsys_cs **cs_array,
unsigned cs_count,
struct radeon_winsys_cs *initial_preamble_cs,
@ -745,7 +773,8 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
}
}
- r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, initial_preamble_cs, &bo_list);
+ r = radv_amdgpu_create_bo_list(cs0->ws, cs_array, cs_count, NULL, initial_preamble_cs,
+ radv_bo_list, &bo_list);
if (r) {
fprintf(stderr, "amdgpu: buffer list creation failed for the "
"chained submission(%d)\n", r);
@ -789,6 +818,7 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
int queue_idx,
struct radv_winsys_sem_info *sem_info,
const struct radv_winsys_bo_list *radv_bo_list,
struct radeon_winsys_cs **cs_array,
unsigned cs_count,
struct radeon_winsys_cs *initial_preamble_cs,
@ -813,7 +843,7 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
memset(&request, 0, sizeof(request));
r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt, NULL,
- preamble_cs, &bo_list);
+ preamble_cs, radv_bo_list, &bo_list);
if (r) {
fprintf(stderr, "amdgpu: buffer list creation failed "
"for the fallback submission (%d)\n", r);
@ -870,6 +900,7 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
int queue_idx,
struct radv_winsys_sem_info *sem_info,
const struct radv_winsys_bo_list *radv_bo_list,
struct radeon_winsys_cs **cs_array,
unsigned cs_count,
struct radeon_winsys_cs *initial_preamble_cs,
@ -939,7 +970,7 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
r = radv_amdgpu_create_bo_list(cs0->ws, &cs_array[i], cnt,
(struct radv_amdgpu_winsys_bo*)bo,
- preamble_cs, &bo_list);
+ preamble_cs, radv_bo_list, &bo_list);
if (r) {
fprintf(stderr, "amdgpu: buffer list creation failed "
"for the sysmem submission (%d)\n", r);
@ -990,6 +1021,7 @@ static int radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx,
struct radeon_winsys_cs *initial_preamble_cs,
struct radeon_winsys_cs *continue_preamble_cs,
struct radv_winsys_sem_info *sem_info,
const struct radv_winsys_bo_list *bo_list,
bool can_patch,
struct radeon_winsys_fence *_fence)
{
@ -999,13 +1031,13 @@ static int radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx,
assert(sem_info);
if (!cs->ws->use_ib_bos) {
- ret = radv_amdgpu_winsys_cs_submit_sysmem(_ctx, queue_idx, sem_info, cs_array,
+ ret = radv_amdgpu_winsys_cs_submit_sysmem(_ctx, queue_idx, sem_info, bo_list, cs_array,
cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
} else if (can_patch && cs_count > AMDGPU_CS_MAX_IBS_PER_SUBMIT && cs->ws->batchchain) {
- ret = radv_amdgpu_winsys_cs_submit_chained(_ctx, queue_idx, sem_info, cs_array,
+ ret = radv_amdgpu_winsys_cs_submit_chained(_ctx, queue_idx, sem_info, bo_list, cs_array,
cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
} else {
- ret = radv_amdgpu_winsys_cs_submit_fallback(_ctx, queue_idx, sem_info, cs_array,
+ ret = radv_amdgpu_winsys_cs_submit_fallback(_ctx, queue_idx, sem_info, bo_list, cs_array,
cs_count, initial_preamble_cs, continue_preamble_cs, _fence);
}