From 0db95de577bca823109773d758990d1caff5f4ac Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga
Date: Wed, 8 Jul 2020 09:56:44 +0200
Subject: [PATCH] v3dv: improve pipeline barrier handling

So far we have been getting away with finishing the current job in the
presence of a pipeline barrier and relying on RCL serialization, but
this is not always enough.

This patch addresses synchronization across different GPU units (e.g.
draw indirect after compute), as well as cases where we need to sync
before binning.

Fixes CTS failures in:
dEQP-VK.synchronization.op.single_queue.barrier.*

Part-of:
---
 src/broadcom/vulkan/v3dv_cmd_buffer.c | 81 ++++++++++++++++++++++++---
 src/broadcom/vulkan/v3dv_private.h    | 12 ++++
 src/broadcom/vulkan/v3dv_queue.c      | 17 +++++-
 3 files changed, 99 insertions(+), 11 deletions(-)

diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index a0da96b648e..4f2e0b8db9c 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -727,6 +727,46 @@ v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
    }
 }
 
+static bool
+job_type_is_gpu(struct v3dv_job *job)
+{
+   switch (job->type) {
+   case V3DV_JOB_TYPE_GPU_CL:
+   case V3DV_JOB_TYPE_GPU_CL_SECONDARY:
+   case V3DV_JOB_TYPE_GPU_TFU:
+   case V3DV_JOB_TYPE_GPU_CSD:
+      return true;
+   default:
+      return false;
+   }
+}
+
+static void
+cmd_buffer_serialize_job_if_needed(struct v3dv_cmd_buffer *cmd_buffer,
+                                   struct v3dv_job *job)
+{
+   assert(cmd_buffer && job);
+
+   if (!cmd_buffer->state.has_barrier)
+      return;
+
+   /* Serialization only affects GPU jobs, CPU jobs are always automatically
+    * serialized.
+    */
+   if (!job_type_is_gpu(job))
+      return;
+
+   job->serialize = true;
+   if (cmd_buffer->state.has_bcl_barrier &&
+       (job->type == V3DV_JOB_TYPE_GPU_CL ||
+        job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY)) {
+      job->needs_bcl_sync = true;
+   }
+
+   cmd_buffer->state.has_barrier = false;
+   cmd_buffer->state.has_bcl_barrier = false;
+}
+
 void
 v3dv_job_init(struct v3dv_job *job,
               enum v3dv_job_type type,
@@ -736,6 +776,9 @@ v3dv_job_init(struct v3dv_job *job,
 {
    assert(job);
 
+   /* Make sure we haven't made this new job current before calling here */
+   assert(!cmd_buffer || cmd_buffer->state.job != job);
+
    job->type = type;
 
    job->device = device;
@@ -777,6 +820,8 @@ v3dv_job_init(struct v3dv_job *job,
        */
       if (cmd_buffer->state.pass)
          job->first_subpass = subpass_idx;
+
+      cmd_buffer_serialize_job_if_needed(cmd_buffer, job);
    }
 }
 
@@ -804,8 +849,6 @@ v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
                           sizeof(struct v3dv_job), 8,
                           VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
 
-   cmd_buffer->state.job = job;
-
    if (!job) {
       fprintf(stderr, "Error: failed to allocate CPU memory for job\n");
       v3dv_flag_oom(cmd_buffer, NULL);
@@ -813,6 +856,7 @@
    }
 
    v3dv_job_init(job, type, cmd_buffer->device, cmd_buffer, subpass_idx);
+   cmd_buffer->state.job = job;
 
    return job;
 }
@@ -2350,6 +2394,16 @@ cmd_buffer_execute_inside_pass(struct v3dv_cmd_buffer *primary,
          cl_emit(&primary_job->bcl, BRANCH_TO_SUB_LIST, branch) {
             branch.address = v3dv_cl_address(secondary_job->bcl.bo, 0);
          }
+
+         /* If this secondary has barriers, we need to flag them in the
+          * primary job.
+          *
+          * FIXME: This might be moving the sync point too early though,
+          * maybe we would need to split the primary in this case to ensure
+          * that barriers execute right before the secondary.
+          */
+         primary_job->serialize |= secondary_job->serialize;
+         primary_job->needs_bcl_sync |= secondary_job->needs_bcl_sync;
       } else if (secondary_job->type == V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS) {
          const struct v3dv_clear_attachments_cpu_job_info *info =
             &secondary_job->cpu.clear_attachments;
@@ -4038,18 +4092,27 @@ v3dv_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
                         VkDependencyFlags dependencyFlags,
                         uint32_t memoryBarrierCount,
                         const VkMemoryBarrier *pMemoryBarriers,
-                        uint32_t bufferMemoryBarrierCount,
-                        const VkBufferMemoryBarrier *pBufferMemoryBarriers,
-                        uint32_t imageMemoryBarrierCount,
-                        const VkImageMemoryBarrier *pImageMemoryBarriers)
+                        uint32_t bufferBarrierCount,
+                        const VkBufferMemoryBarrier *pBufferBarriers,
+                        uint32_t imageBarrierCount,
+                        const VkImageMemoryBarrier *pImageBarriers)
 {
    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
 
+   /* If we have a recording job, finish it here */
    struct v3dv_job *job = cmd_buffer->state.job;
-   if (!job)
-      return;
+   if (job)
+      v3dv_cmd_buffer_finish_job(cmd_buffer);
 
-   v3dv_cmd_buffer_finish_job(cmd_buffer);
+   cmd_buffer->state.has_barrier = true;
+   if (dstStageMask & (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
+                       VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
+                       VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
+                       VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
+                       VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
+                       VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT)) {
+      cmd_buffer->state.has_bcl_barrier = true;
+   }
 }
 
 void
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 71c25090671..3a17130c75c 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -805,6 +805,12 @@ struct v3dv_job {
     */
    bool always_flush;
 
+   /* Whether we need to serialize this job in our command stream */
+   bool serialize;
+
+   /* If this is a CL job, whether we should sync before binning */
+   bool needs_bcl_sync;
+
    /* Job specs for CPU jobs */
    union {
       struct v3dv_reset_query_cpu_job_info query_reset;
@@ -903,6 +909,12 @@ struct v3dv_cmd_buffer_state {
    /* Used to flag OOM conditions during command buffer recording */
    bool oom;
 
+   /* Whether we have recorded a pipeline barrier that we still need to
+    * process.
+    */
+   bool has_barrier;
+   bool has_bcl_barrier;
+
    /* Command buffer state saved during a meta operation */
    struct {
       uint32_t subpass_idx;
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index 6a9cca8a91c..ec5d304a742 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -530,6 +530,12 @@ handle_cl_job(struct v3dv_queue *queue,
 
    struct drm_v3d_submit_cl submit;
 
+   /* Sanity check: we should only flag a bcl sync on a job that needs to be
+    * serialized.
+    */
+   assert(job->serialize || !job->needs_bcl_sync);
+   do_wait |= job->serialize;
+
    /* We expect to have just one RCL per job which should fit in just one BO.
     * Our BCL, could chain multiple BOS together though.
     */
@@ -575,9 +581,12 @@ handle_cl_job(struct v3dv_queue *queue,
     * we would have to extend our kernel interface to support the case where
     * we have more than one semaphore to wait on.
     */
+   const bool needs_bcl_sync = do_wait && job->needs_bcl_sync;
+   const bool needs_rcl_sync = do_wait && !needs_bcl_sync;
+
    mtx_lock(&queue->device->mutex);
-   submit.in_sync_bcl = 0;
-   submit.in_sync_rcl = do_wait ? device->last_job_sync : 0;
+   submit.in_sync_bcl = needs_bcl_sync ? device->last_job_sync : 0;
+   submit.in_sync_rcl = needs_rcl_sync ? device->last_job_sync : 0;
    submit.out_sync = device->last_job_sync;
    v3dv_clif_dump(device, job, &submit);
    int ret = v3dv_ioctl(device->render_fd, DRM_IOCTL_V3D_SUBMIT_CL, &submit);
@@ -605,6 +614,8 @@ handle_tfu_job(struct v3dv_queue *queue,
 {
    struct v3dv_device *device = queue->device;
 
+   do_wait |= job->serialize;
+
    mtx_lock(&device->mutex);
    job->tfu.in_sync = do_wait ? device->last_job_sync : 0;
    job->tfu.out_sync = device->last_job_sync;
@@ -628,6 +639,8 @@ handle_csd_job(struct v3dv_queue *queue,
 
    struct drm_v3d_submit_csd *submit = &job->csd.submit;
 
+   do_wait |= job->serialize;
+
    submit->bo_handle_count = job->bo_count;
    uint32_t *bo_handles = (uint32_t *)
      malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));
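
For reference, a minimal sketch (not part of the patch) of the kind of dependency the commit message calls out, a draw indirect consuming the results of a compute dispatch: because dstStageMask includes VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, v3dv_CmdPipelineBarrier records a BCL barrier and the next CL job waits on the compute job before binning. The command buffer, pipelines, indirect buffer, render pass begin info and descriptor set bindings (cmd_buf, compute_pipeline, gfx_pipeline, indirect_buf, rp_begin) are assumed to be set up elsewhere.

#include <vulkan/vulkan.h>

/* Illustrative sketch only; resources and descriptor sets are assumed to be
 * created and bound elsewhere.
 */
static void
record_indirect_draw_after_compute(VkCommandBuffer cmd_buf,
                                   VkPipeline compute_pipeline,
                                   VkPipeline gfx_pipeline,
                                   VkBuffer indirect_buf,
                                   const VkRenderPassBeginInfo *rp_begin)
{
   /* Compute shader writes a VkDrawIndirectCommand into indirect_buf */
   vkCmdBindPipeline(cmd_buf, VK_PIPELINE_BIND_POINT_COMPUTE, compute_pipeline);
   vkCmdDispatch(cmd_buf, 1, 1, 1);

   /* Make the compute write visible to the indirect draw. The destination
    * stage is DRAW_INDIRECT, so the barrier must be honored before binning,
    * which is what the new has_bcl_barrier/needs_bcl_sync path tracks.
    */
   const VkMemoryBarrier barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
      .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
      .dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT,
   };
   vkCmdPipelineBarrier(cmd_buf,
                        VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                        VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,
                        0, 1, &barrier, 0, NULL, 0, NULL);

   /* The draw consumes the GPU-written parameters */
   vkCmdBeginRenderPass(cmd_buf, rp_begin, VK_SUBPASS_CONTENTS_INLINE);
   vkCmdBindPipeline(cmd_buf, VK_PIPELINE_BIND_POINT_GRAPHICS, gfx_pipeline);
   vkCmdDrawIndirect(cmd_buf, indirect_buf, 0, 1, sizeof(VkDrawIndirectCommand));
   vkCmdEndRenderPass(cmd_buf);
}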