radv: add support for local bos. (v3)
This uses the new kernel interfaces for reduced cs overhead. We only set the local flag for memory allocations that don't have a dedicated allocation and that aren't imports. v2: add to all the internal buffer creation paths. v3: missed some command submission paths; handle 0/empty bo lists. Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl> Signed-off-by: Dave Airlie <airlied@redhat.com>
This commit is contained in:
parent
39c5c12f8f
commit
a639d40f13
|
@ -313,7 +313,8 @@ radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer,
|
||||||
bo = device->ws->buffer_create(device->ws,
|
bo = device->ws->buffer_create(device->ws,
|
||||||
new_size, 4096,
|
new_size, 4096,
|
||||||
RADEON_DOMAIN_GTT,
|
RADEON_DOMAIN_GTT,
|
||||||
RADEON_FLAG_CPU_ACCESS);
|
RADEON_FLAG_CPU_ACCESS|
|
||||||
|
RADEON_FLAG_NO_INTERPROCESS_SHARING);
|
||||||
|
|
||||||
if (!bo) {
|
if (!bo) {
|
||||||
cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
|
cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
|
||||||
|
|
|
@ -61,7 +61,8 @@ radv_init_trace(struct radv_device *device)
|
||||||
|
|
||||||
device->trace_bo = ws->buffer_create(ws, TRACE_BO_SIZE, 8,
|
device->trace_bo = ws->buffer_create(ws, TRACE_BO_SIZE, 8,
|
||||||
RADEON_DOMAIN_VRAM,
|
RADEON_DOMAIN_VRAM,
|
||||||
RADEON_FLAG_CPU_ACCESS);
|
RADEON_FLAG_CPU_ACCESS|
|
||||||
|
RADEON_FLAG_NO_INTERPROCESS_SHARING);
|
||||||
if (!device->trace_bo)
|
if (!device->trace_bo)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
|
|
@ -431,7 +431,7 @@ VkResult radv_CreateDescriptorPool(
|
||||||
|
|
||||||
if (bo_size) {
|
if (bo_size) {
|
||||||
pool->bo = device->ws->buffer_create(device->ws, bo_size,
|
pool->bo = device->ws->buffer_create(device->ws, bo_size,
|
||||||
32, RADEON_DOMAIN_VRAM, 0);
|
32, RADEON_DOMAIN_VRAM, RADEON_FLAG_NO_INTERPROCESS_SHARING);
|
||||||
pool->mapped_ptr = (uint8_t*)device->ws->buffer_map(pool->bo);
|
pool->mapped_ptr = (uint8_t*)device->ws->buffer_map(pool->bo);
|
||||||
}
|
}
|
||||||
pool->size = bo_size;
|
pool->size = bo_size;
|
||||||
|
|
|
@ -1394,6 +1394,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
|
||||||
unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0;
|
unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0;
|
||||||
unsigned max_offchip_buffers;
|
unsigned max_offchip_buffers;
|
||||||
unsigned hs_offchip_param = 0;
|
unsigned hs_offchip_param = 0;
|
||||||
|
uint32_t ring_bo_flags = RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING;
|
||||||
if (!queue->has_tess_rings) {
|
if (!queue->has_tess_rings) {
|
||||||
if (needs_tess_rings)
|
if (needs_tess_rings)
|
||||||
add_tess_rings = true;
|
add_tess_rings = true;
|
||||||
|
@ -1427,7 +1428,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
|
||||||
scratch_size,
|
scratch_size,
|
||||||
4096,
|
4096,
|
||||||
RADEON_DOMAIN_VRAM,
|
RADEON_DOMAIN_VRAM,
|
||||||
RADEON_FLAG_NO_CPU_ACCESS);
|
ring_bo_flags);
|
||||||
if (!scratch_bo)
|
if (!scratch_bo)
|
||||||
goto fail;
|
goto fail;
|
||||||
} else
|
} else
|
||||||
|
@ -1438,7 +1439,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
|
||||||
compute_scratch_size,
|
compute_scratch_size,
|
||||||
4096,
|
4096,
|
||||||
RADEON_DOMAIN_VRAM,
|
RADEON_DOMAIN_VRAM,
|
||||||
RADEON_FLAG_NO_CPU_ACCESS);
|
ring_bo_flags);
|
||||||
if (!compute_scratch_bo)
|
if (!compute_scratch_bo)
|
||||||
goto fail;
|
goto fail;
|
||||||
|
|
||||||
|
@ -1450,7 +1451,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
|
||||||
esgs_ring_size,
|
esgs_ring_size,
|
||||||
4096,
|
4096,
|
||||||
RADEON_DOMAIN_VRAM,
|
RADEON_DOMAIN_VRAM,
|
||||||
RADEON_FLAG_NO_CPU_ACCESS);
|
ring_bo_flags);
|
||||||
if (!esgs_ring_bo)
|
if (!esgs_ring_bo)
|
||||||
goto fail;
|
goto fail;
|
||||||
} else {
|
} else {
|
||||||
|
@ -1463,7 +1464,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
|
||||||
gsvs_ring_size,
|
gsvs_ring_size,
|
||||||
4096,
|
4096,
|
||||||
RADEON_DOMAIN_VRAM,
|
RADEON_DOMAIN_VRAM,
|
||||||
RADEON_FLAG_NO_CPU_ACCESS);
|
ring_bo_flags);
|
||||||
if (!gsvs_ring_bo)
|
if (!gsvs_ring_bo)
|
||||||
goto fail;
|
goto fail;
|
||||||
} else {
|
} else {
|
||||||
|
@ -1476,14 +1477,14 @@ radv_get_preamble_cs(struct radv_queue *queue,
|
||||||
tess_factor_ring_size,
|
tess_factor_ring_size,
|
||||||
256,
|
256,
|
||||||
RADEON_DOMAIN_VRAM,
|
RADEON_DOMAIN_VRAM,
|
||||||
RADEON_FLAG_NO_CPU_ACCESS);
|
ring_bo_flags);
|
||||||
if (!tess_factor_ring_bo)
|
if (!tess_factor_ring_bo)
|
||||||
goto fail;
|
goto fail;
|
||||||
tess_offchip_ring_bo = queue->device->ws->buffer_create(queue->device->ws,
|
tess_offchip_ring_bo = queue->device->ws->buffer_create(queue->device->ws,
|
||||||
tess_offchip_ring_size,
|
tess_offchip_ring_size,
|
||||||
256,
|
256,
|
||||||
RADEON_DOMAIN_VRAM,
|
RADEON_DOMAIN_VRAM,
|
||||||
RADEON_FLAG_NO_CPU_ACCESS);
|
ring_bo_flags);
|
||||||
if (!tess_offchip_ring_bo)
|
if (!tess_offchip_ring_bo)
|
||||||
goto fail;
|
goto fail;
|
||||||
} else {
|
} else {
|
||||||
|
@ -1510,7 +1511,7 @@ radv_get_preamble_cs(struct radv_queue *queue,
|
||||||
size,
|
size,
|
||||||
4096,
|
4096,
|
||||||
RADEON_DOMAIN_VRAM,
|
RADEON_DOMAIN_VRAM,
|
||||||
RADEON_FLAG_CPU_ACCESS);
|
RADEON_FLAG_CPU_ACCESS|RADEON_FLAG_NO_INTERPROCESS_SHARING);
|
||||||
if (!descriptor_bo)
|
if (!descriptor_bo)
|
||||||
goto fail;
|
goto fail;
|
||||||
} else
|
} else
|
||||||
|
@ -2119,6 +2120,9 @@ VkResult radv_alloc_memory(VkDevice _device,
|
||||||
if (mem_flags & RADV_MEM_IMPLICIT_SYNC)
|
if (mem_flags & RADV_MEM_IMPLICIT_SYNC)
|
||||||
flags |= RADEON_FLAG_IMPLICIT_SYNC;
|
flags |= RADEON_FLAG_IMPLICIT_SYNC;
|
||||||
|
|
||||||
|
if (!dedicate_info && !import_info)
|
||||||
|
flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
|
||||||
|
|
||||||
mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment,
|
mem->bo = device->ws->buffer_create(device->ws, alloc_size, device->physical_device->rad_info.max_alignment,
|
||||||
domain, flags);
|
domain, flags);
|
||||||
|
|
||||||
|
@ -2682,7 +2686,7 @@ VkResult radv_CreateEvent(
|
||||||
|
|
||||||
event->bo = device->ws->buffer_create(device->ws, 8, 8,
|
event->bo = device->ws->buffer_create(device->ws, 8, 8,
|
||||||
RADEON_DOMAIN_GTT,
|
RADEON_DOMAIN_GTT,
|
||||||
RADEON_FLAG_VA_UNCACHED | RADEON_FLAG_CPU_ACCESS);
|
RADEON_FLAG_VA_UNCACHED | RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING);
|
||||||
if (!event->bo) {
|
if (!event->bo) {
|
||||||
vk_free2(&device->alloc, pAllocator, event);
|
vk_free2(&device->alloc, pAllocator, event);
|
||||||
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
|
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
|
||||||
|
|
|
@ -780,7 +780,7 @@ VkResult radv_CreateQueryPool(
|
||||||
size += 4 * pCreateInfo->queryCount;
|
size += 4 * pCreateInfo->queryCount;
|
||||||
|
|
||||||
pool->bo = device->ws->buffer_create(device->ws, size,
|
pool->bo = device->ws->buffer_create(device->ws, size,
|
||||||
64, RADEON_DOMAIN_GTT, 0);
|
64, RADEON_DOMAIN_GTT, RADEON_FLAG_NO_INTERPROCESS_SHARING);
|
||||||
|
|
||||||
if (!pool->bo) {
|
if (!pool->bo) {
|
||||||
vk_free2(&device->alloc, pAllocator, pool);
|
vk_free2(&device->alloc, pAllocator, pool);
|
||||||
|
|
|
@ -54,6 +54,7 @@ enum radeon_bo_flag { /* bitfield */
|
||||||
RADEON_FLAG_VIRTUAL = (1 << 3),
|
RADEON_FLAG_VIRTUAL = (1 << 3),
|
||||||
RADEON_FLAG_VA_UNCACHED = (1 << 4),
|
RADEON_FLAG_VA_UNCACHED = (1 << 4),
|
||||||
RADEON_FLAG_IMPLICIT_SYNC = (1 << 5),
|
RADEON_FLAG_IMPLICIT_SYNC = (1 << 5),
|
||||||
|
RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 6),
|
||||||
};
|
};
|
||||||
|
|
||||||
enum radeon_bo_usage { /* bitfield */
|
enum radeon_bo_usage { /* bitfield */
|
||||||
|
|
|
@ -325,7 +325,7 @@ radv_alloc_shader_memory(struct radv_device *device,
|
||||||
|
|
||||||
slab->size = 256 * 1024;
|
slab->size = 256 * 1024;
|
||||||
slab->bo = device->ws->buffer_create(device->ws, slab->size, 256,
|
slab->bo = device->ws->buffer_create(device->ws, slab->size, 256,
|
||||||
RADEON_DOMAIN_VRAM, 0);
|
RADEON_DOMAIN_VRAM, RADEON_FLAG_NO_INTERPROCESS_SHARING);
|
||||||
slab->ptr = (char*)device->ws->buffer_map(slab->bo);
|
slab->ptr = (char*)device->ws->buffer_map(slab->bo);
|
||||||
list_inithead(&slab->shaders);
|
list_inithead(&slab->shaders);
|
||||||
|
|
||||||
|
|
|
@ -571,7 +571,8 @@ cik_create_gfx_config(struct radv_device *device)
|
||||||
device->gfx_init = device->ws->buffer_create(device->ws,
|
device->gfx_init = device->ws->buffer_create(device->ws,
|
||||||
cs->cdw * 4, 4096,
|
cs->cdw * 4, 4096,
|
||||||
RADEON_DOMAIN_GTT,
|
RADEON_DOMAIN_GTT,
|
||||||
RADEON_FLAG_CPU_ACCESS);
|
RADEON_FLAG_CPU_ACCESS|
|
||||||
|
RADEON_FLAG_NO_INTERPROCESS_SHARING);
|
||||||
if (!device->gfx_init)
|
if (!device->gfx_init)
|
||||||
goto fail;
|
goto fail;
|
||||||
|
|
||||||
|
|
|
@ -332,6 +332,10 @@ radv_amdgpu_winsys_bo_create(struct radeon_winsys *_ws,
|
||||||
request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
|
request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
|
||||||
if (!(flags & RADEON_FLAG_IMPLICIT_SYNC) && ws->info.drm_minor >= 22)
|
if (!(flags & RADEON_FLAG_IMPLICIT_SYNC) && ws->info.drm_minor >= 22)
|
||||||
request.flags |= AMDGPU_GEM_CREATE_EXPLICIT_SYNC;
|
request.flags |= AMDGPU_GEM_CREATE_EXPLICIT_SYNC;
|
||||||
|
if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING && ws->info.drm_minor >= 20) {
|
||||||
|
bo->is_local = true;
|
||||||
|
request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID;
|
||||||
|
}
|
||||||
|
|
||||||
/* this won't do anything on pre 4.9 kernels */
|
/* this won't do anything on pre 4.9 kernels */
|
||||||
if (ws->zero_all_vram_allocs && (initial_domain & RADEON_DOMAIN_VRAM))
|
if (ws->zero_all_vram_allocs && (initial_domain & RADEON_DOMAIN_VRAM))
|
||||||
|
|
|
@ -45,6 +45,7 @@ struct radv_amdgpu_winsys_bo {
|
||||||
uint64_t size;
|
uint64_t size;
|
||||||
struct radv_amdgpu_winsys *ws;
|
struct radv_amdgpu_winsys *ws;
|
||||||
bool is_virtual;
|
bool is_virtual;
|
||||||
|
bool is_local;
|
||||||
int ref_count;
|
int ref_count;
|
||||||
|
|
||||||
union {
|
union {
|
||||||
|
|
|
@ -202,7 +202,8 @@ radv_amdgpu_cs_create(struct radeon_winsys *ws,
|
||||||
if (cs->ws->use_ib_bos) {
|
if (cs->ws->use_ib_bos) {
|
||||||
cs->ib_buffer = ws->buffer_create(ws, ib_size, 0,
|
cs->ib_buffer = ws->buffer_create(ws, ib_size, 0,
|
||||||
RADEON_DOMAIN_GTT,
|
RADEON_DOMAIN_GTT,
|
||||||
RADEON_FLAG_CPU_ACCESS);
|
RADEON_FLAG_CPU_ACCESS|
|
||||||
|
RADEON_FLAG_NO_INTERPROCESS_SHARING);
|
||||||
if (!cs->ib_buffer) {
|
if (!cs->ib_buffer) {
|
||||||
free(cs);
|
free(cs);
|
||||||
return NULL;
|
return NULL;
|
||||||
|
@ -287,7 +288,8 @@ static void radv_amdgpu_cs_grow(struct radeon_winsys_cs *_cs, size_t min_size)
|
||||||
|
|
||||||
cs->ib_buffer = cs->ws->base.buffer_create(&cs->ws->base, ib_size, 0,
|
cs->ib_buffer = cs->ws->base.buffer_create(&cs->ws->base, ib_size, 0,
|
||||||
RADEON_DOMAIN_GTT,
|
RADEON_DOMAIN_GTT,
|
||||||
RADEON_FLAG_CPU_ACCESS);
|
RADEON_FLAG_CPU_ACCESS|
|
||||||
|
RADEON_FLAG_NO_INTERPROCESS_SHARING);
|
||||||
|
|
||||||
if (!cs->ib_buffer) {
|
if (!cs->ib_buffer) {
|
||||||
cs->base.cdw = 0;
|
cs->base.cdw = 0;
|
||||||
|
@ -471,6 +473,9 @@ static void radv_amdgpu_cs_add_buffer(struct radeon_winsys_cs *_cs,
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (bo->is_local)
|
||||||
|
return;
|
||||||
|
|
||||||
radv_amdgpu_cs_add_buffer_internal(cs, bo->bo, priority);
|
radv_amdgpu_cs_add_buffer_internal(cs, bo->bo, priority);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -541,6 +546,10 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
|
||||||
} else if (count == 1 && !extra_bo && !extra_cs &&
|
} else if (count == 1 && !extra_bo && !extra_cs &&
|
||||||
!radv_amdgpu_cs(cs_array[0])->num_virtual_buffers) {
|
!radv_amdgpu_cs(cs_array[0])->num_virtual_buffers) {
|
||||||
struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[0];
|
struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[0];
|
||||||
|
if (cs->num_buffers == 0) {
|
||||||
|
*bo_list = 0;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
r = amdgpu_bo_list_create(ws->dev, cs->num_buffers, cs->handles,
|
r = amdgpu_bo_list_create(ws->dev, cs->num_buffers, cs->handles,
|
||||||
cs->priorities, bo_list);
|
cs->priorities, bo_list);
|
||||||
} else {
|
} else {
|
||||||
|
@ -556,7 +565,10 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
|
||||||
if (extra_cs) {
|
if (extra_cs) {
|
||||||
total_buffer_count += ((struct radv_amdgpu_cs*)extra_cs)->num_buffers;
|
total_buffer_count += ((struct radv_amdgpu_cs*)extra_cs)->num_buffers;
|
||||||
}
|
}
|
||||||
|
if (total_buffer_count == 0) {
|
||||||
|
*bo_list = 0;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
amdgpu_bo_handle *handles = malloc(sizeof(amdgpu_bo_handle) * total_buffer_count);
|
amdgpu_bo_handle *handles = malloc(sizeof(amdgpu_bo_handle) * total_buffer_count);
|
||||||
uint8_t *priorities = malloc(sizeof(uint8_t) * total_buffer_count);
|
uint8_t *priorities = malloc(sizeof(uint8_t) * total_buffer_count);
|
||||||
if (!handles || !priorities) {
|
if (!handles || !priorities) {
|
||||||
|
@ -721,7 +733,8 @@ static int radv_amdgpu_winsys_cs_submit_chained(struct radeon_winsys_ctx *_ctx,
|
||||||
"see dmesg for more information.\n");
|
"see dmesg for more information.\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
amdgpu_bo_list_destroy(bo_list);
|
if (bo_list)
|
||||||
|
amdgpu_bo_list_destroy(bo_list);
|
||||||
|
|
||||||
if (fence)
|
if (fence)
|
||||||
radv_amdgpu_request_to_fence(ctx, fence, &request);
|
radv_amdgpu_request_to_fence(ctx, fence, &request);
|
||||||
|
@ -795,7 +808,8 @@ static int radv_amdgpu_winsys_cs_submit_fallback(struct radeon_winsys_ctx *_ctx,
|
||||||
"see dmesg for more information.\n");
|
"see dmesg for more information.\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
amdgpu_bo_list_destroy(bo_list);
|
if (bo_list)
|
||||||
|
amdgpu_bo_list_destroy(bo_list);
|
||||||
|
|
||||||
if (r)
|
if (r)
|
||||||
return r;
|
return r;
|
||||||
|
@ -856,7 +870,7 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
|
||||||
}
|
}
|
||||||
assert(cnt);
|
assert(cnt);
|
||||||
|
|
||||||
bo = ws->buffer_create(ws, 4 * size, 4096, RADEON_DOMAIN_GTT, RADEON_FLAG_CPU_ACCESS);
|
bo = ws->buffer_create(ws, 4 * size, 4096, RADEON_DOMAIN_GTT, RADEON_FLAG_CPU_ACCESS|RADEON_FLAG_NO_INTERPROCESS_SHARING);
|
||||||
ptr = ws->buffer_map(bo);
|
ptr = ws->buffer_map(bo);
|
||||||
|
|
||||||
if (preamble_cs) {
|
if (preamble_cs) {
|
||||||
|
@ -905,7 +919,8 @@ static int radv_amdgpu_winsys_cs_submit_sysmem(struct radeon_winsys_ctx *_ctx,
|
||||||
"see dmesg for more information.\n");
|
"see dmesg for more information.\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
amdgpu_bo_list_destroy(bo_list);
|
if (bo_list)
|
||||||
|
amdgpu_bo_list_destroy(bo_list);
|
||||||
|
|
||||||
ws->buffer_destroy(bo);
|
ws->buffer_destroy(bo);
|
||||||
if (r)
|
if (r)
|
||||||
|
@ -1038,7 +1053,8 @@ static struct radeon_winsys_ctx *radv_amdgpu_ctx_create(struct radeon_winsys *_w
|
||||||
assert(AMDGPU_HW_IP_NUM * MAX_RINGS_PER_TYPE * sizeof(uint64_t) <= 4096);
|
assert(AMDGPU_HW_IP_NUM * MAX_RINGS_PER_TYPE * sizeof(uint64_t) <= 4096);
|
||||||
ctx->fence_bo = ws->base.buffer_create(&ws->base, 4096, 8,
|
ctx->fence_bo = ws->base.buffer_create(&ws->base, 4096, 8,
|
||||||
RADEON_DOMAIN_GTT,
|
RADEON_DOMAIN_GTT,
|
||||||
RADEON_FLAG_CPU_ACCESS);
|
RADEON_FLAG_CPU_ACCESS|
|
||||||
|
RADEON_FLAG_NO_INTERPROCESS_SHARING);
|
||||||
if (ctx->fence_bo)
|
if (ctx->fence_bo)
|
||||||
ctx->fence_map = (uint64_t*)ws->base.buffer_map(ctx->fence_bo);
|
ctx->fence_map = (uint64_t*)ws->base.buffer_map(ctx->fence_bo);
|
||||||
if (ctx->fence_map)
|
if (ctx->fence_map)
|
||||||
|
|
Loading…
Reference in New Issue