From c03d258046c09a6e951dd08b22ed2aef8d36b4a9 Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen
Date: Mon, 6 Dec 2021 23:03:52 +0100
Subject: [PATCH] radv/amdgpu: Add a syncobj per queue.

For merging our own dependencies in without submitting.

Reviewed-by: Samuel Pitoiset
Part-of:
---
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 67 ++++++++++++++-----
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h |  2 +
 2 files changed, 52 insertions(+), 17 deletions(-)

diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index a0778d29ccc..2fbef1e0eb8 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -78,6 +78,9 @@ struct radv_amdgpu_cs {
    unsigned num_old_cs_buffers;
 };
 
+static uint32_t radv_amdgpu_ctx_queue_syncobj(struct radv_amdgpu_ctx *ctx, unsigned ip,
+                                              unsigned ring);
+
 static inline struct radv_amdgpu_cs *
 radv_amdgpu_cs(struct radeon_cmdbuf *base)
 {
@@ -1303,11 +1306,29 @@ static void
 radv_amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
 {
    struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
+
+   for (unsigned ip = 0; ip <= AMDGPU_HW_IP_DMA; ++ip) {
+      for (unsigned ring = 0; ring < MAX_RINGS_PER_TYPE; ++ring) {
+         if (ctx->queue_syncobj[ip][ring])
+            amdgpu_cs_destroy_syncobj(ctx->ws->dev, ctx->queue_syncobj[ip][ring]);
+      }
+   }
+
    ctx->ws->base.buffer_destroy(&ctx->ws->base, ctx->fence_bo);
    amdgpu_cs_ctx_free(ctx->ctx);
    FREE(ctx);
 }
 
+static uint32_t
+radv_amdgpu_ctx_queue_syncobj(struct radv_amdgpu_ctx *ctx, unsigned ip, unsigned ring)
+{
+   uint32_t *syncobj = &ctx->queue_syncobj[ip][ring];
+   if (!*syncobj) {
+      amdgpu_cs_create_syncobj2(ctx->ws->dev, DRM_SYNCOBJ_CREATE_SIGNALED, syncobj);
+   }
+   return *syncobj;
+}
+
 static bool
 radv_amdgpu_ctx_wait_idle(struct radeon_winsys_ctx *rwctx, enum ring_type ring_type, int ring_index)
 {
@@ -1328,12 +1349,13 @@ radv_amdgpu_ctx_wait_idle(struct radeon_winsys_ctx *rwctx, enum ring_type ring_t
 
 static void *
 radv_amdgpu_cs_alloc_syncobj_chunk(struct radv_winsys_sem_counts *counts,
-                                   const uint32_t *syncobj_override,
+                                   const uint32_t *syncobj_override, uint32_t queue_syncobj,
                                    struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
 {
    const uint32_t *src = syncobj_override ? syncobj_override : counts->syncobj;
+   unsigned count = counts->syncobj_count + 1;
    struct drm_amdgpu_cs_chunk_sem *syncobj =
-      malloc(sizeof(struct drm_amdgpu_cs_chunk_sem) * counts->syncobj_count);
+      malloc(sizeof(struct drm_amdgpu_cs_chunk_sem) * count);
    if (!syncobj)
       return NULL;
 
@@ -1342,8 +1364,10 @@ radv_amdgpu_cs_alloc_syncobj_chunk(struct radv_winsys_sem_counts *counts,
       sem->handle = src[i];
    }
 
+   syncobj[counts->syncobj_count].handle = queue_syncobj;
+
    chunk->chunk_id = chunk_id;
-   chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_sem) / 4 * counts->syncobj_count;
+   chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_sem) / 4 * count;
    chunk->chunk_data = (uint64_t)(uintptr_t)syncobj;
    return syncobj;
 }
@@ -1351,12 +1375,13 @@ radv_amdgpu_cs_alloc_syncobj_chunk(struct radv_winsys_sem_counts *counts,
 static void *
 radv_amdgpu_cs_alloc_timeline_syncobj_chunk(struct radv_winsys_sem_counts *counts,
                                             const uint32_t *syncobj_override,
+                                            uint32_t queue_syncobj,
                                             struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
 {
    const uint32_t *src = syncobj_override ? syncobj_override : counts->syncobj;
+   uint32_t count = counts->syncobj_count + counts->timeline_syncobj_count + 1;
    struct drm_amdgpu_cs_chunk_syncobj *syncobj =
-      malloc(sizeof(struct drm_amdgpu_cs_chunk_syncobj) *
-             (counts->syncobj_count + counts->timeline_syncobj_count));
+      malloc(sizeof(struct drm_amdgpu_cs_chunk_syncobj) * count);
    if (!syncobj)
       return NULL;
 
@@ -1374,9 +1399,12 @@ radv_amdgpu_cs_alloc_timeline_syncobj_chunk(struct radv_winsys_sem_counts *count
       sem->point = counts->points[i];
    }
 
+   syncobj[count - 1].handle = queue_syncobj;
+   syncobj[count - 1].flags = 0;
+   syncobj[count - 1].point = 0;
+
    chunk->chunk_id = chunk_id;
-   chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_syncobj) / 4 *
-                      (counts->syncobj_count + counts->timeline_syncobj_count);
+   chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_syncobj) / 4 * count;
    chunk->chunk_data = (uint64_t)(uintptr_t)syncobj;
    return syncobj;
 }
@@ -1494,6 +1522,10 @@ radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request
    int i;
    uint32_t bo_list = 0;
    VkResult result = VK_SUCCESS;
+   uint32_t queue_syncobj = radv_amdgpu_ctx_queue_syncobj(ctx, request->ip_type, request->ring);
+
+   if (!queue_syncobj)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
 
    size = request->number_of_ibs + 2 /* user fence */ + (!use_bo_list_create ? 1 : 0) + 3;
 
@@ -1537,19 +1569,19 @@ radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request
    fence_info.offset = (request->ip_type * MAX_RINGS_PER_TYPE + request->ring) * sizeof(uint64_t);
    amdgpu_cs_chunk_fence_info_to_data(&fence_info, &chunk_data[i]);
 
-   if ((sem_info->wait.syncobj_count || sem_info->wait.timeline_syncobj_count) &&
-       sem_info->cs_emit_wait) {
+   if (sem_info->cs_emit_wait) {
       r = radv_amdgpu_cs_prepare_syncobjs(ctx->ws, &sem_info->wait, &in_syncobjs);
       if (r)
          goto error_out;
 
       if (ctx->ws->info.has_timeline_syncobj) {
         wait_syncobj = radv_amdgpu_cs_alloc_timeline_syncobj_chunk(
-           &sem_info->wait, in_syncobjs, &chunks[num_chunks],
+           &sem_info->wait, in_syncobjs, queue_syncobj, &chunks[num_chunks],
            AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT);
      } else {
-        wait_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(
-           &sem_info->wait, in_syncobjs, &chunks[num_chunks], AMDGPU_CHUNK_ID_SYNCOBJ_IN);
+        wait_syncobj =
+           radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->wait, in_syncobjs, queue_syncobj,
+                                              &chunks[num_chunks], AMDGPU_CHUNK_ID_SYNCOBJ_IN);
      }
      if (!wait_syncobj) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
@@ -1560,14 +1592,15 @@ radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request
       sem_info->cs_emit_wait = false;
    }
 
-   if ((sem_info->signal.syncobj_count || sem_info->signal.timeline_syncobj_count) &&
-       sem_info->cs_emit_signal) {
+   if (sem_info->cs_emit_signal) {
       if (ctx->ws->info.has_timeline_syncobj) {
         signal_syncobj = radv_amdgpu_cs_alloc_timeline_syncobj_chunk(
-           &sem_info->signal, NULL, &chunks[num_chunks], AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL);
+           &sem_info->signal, NULL, queue_syncobj, &chunks[num_chunks],
+           AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL);
      } else {
-        signal_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(
-           &sem_info->signal, NULL, &chunks[num_chunks], AMDGPU_CHUNK_ID_SYNCOBJ_OUT);
+        signal_syncobj =
+           radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->signal, NULL, queue_syncobj,
+                                              &chunks[num_chunks], AMDGPU_CHUNK_ID_SYNCOBJ_OUT);
      }
      if (!signal_syncobj) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h
index e66ae3e8e4e..749a0e4caa9 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.h
@@ -49,6 +49,8 @@ struct radv_amdgpu_ctx {
    struct radv_amdgpu_fence last_submission[AMDGPU_HW_IP_DMA + 1][MAX_RINGS_PER_TYPE];
 
    struct radeon_winsys_bo *fence_bo;
+
+   uint32_t queue_syncobj[AMDGPU_HW_IP_DMA + 1][MAX_RINGS_PER_TYPE];
 };
 
 static inline struct radv_amdgpu_ctx *
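
Note (commentary, not part of the patch above): with every submission now waiting on and signalling the per-queue syncobj, later code can merge a queue's outstanding dependencies into other fences without issuing a CS ioctl, which is what the commit message alludes to. The sketch below is a rough illustration of that idea, not the actual follow-up implementation in RADV: the helper name radv_amdgpu_queue_signal_without_submit is hypothetical, it assumes the radv_amdgpu_ctx fields added by this patch and the usual radv_amdgpu_cs.c includes (<errno.h>, <amdgpu.h>), and it uses libdrm's amdgpu_cs_syncobj_transfer() to copy syncobj state in userspace.

/* Hypothetical sketch: resolve a "submission" that carries no IBs purely in
 * userspace by copying the queue syncobj's current fence into each binary
 * signal syncobj. Returns 0 on success or a negative errno from libdrm. */
static int
radv_amdgpu_queue_signal_without_submit(struct radv_amdgpu_ctx *ctx, unsigned ip, unsigned ring,
                                        const uint32_t *signal_syncobjs, unsigned count)
{
   /* Lazily created and initially signalled, see radv_amdgpu_ctx_queue_syncobj(). */
   uint32_t queue_syncobj = radv_amdgpu_ctx_queue_syncobj(ctx, ip, ring);
   if (!queue_syncobj)
      return -ENOMEM;

   for (unsigned i = 0; i < count; ++i) {
      /* dst_point/src_point of 0 mean both handles are treated as binary syncobjs. */
      int r = amdgpu_cs_syncobj_transfer(ctx->ws->dev, signal_syncobjs[i], 0, queue_syncobj, 0, 0);
      if (r)
         return r;
   }
   return 0;
}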