freedreno/drm: Userspace fences

Add a per-fd_pipe fence "timeline" so we can detect cases where we don't
need to call into the kernel to determine if an fd_bo is still busy.

This reuses table_lock, rather than introducing a per-bo lock, to protect
fence state updates because (a) the common/hotpath pattern is to update
fences on many objects, while checking the fence state of a single object
is less common, and (b) we already hold the table lock in the common spots
where we need to check a bo's fence state (i.e. allocations from the
bo-cache).

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10444>
Authored by Rob Clark, 2021-04-21 09:37:05 -07:00; committed by Marge Bot
parent df78934cdf
commit 7dabd62464
8 changed files with 236 additions and 16 deletions

@@ -294,6 +294,35 @@ fd_bo_del(struct fd_bo *bo)
   simple_mtx_unlock(&table_lock);
}

/**
 * Cleanup fences, dropping pipe references. If 'expired' is true, only
 * clean up expired fences.
 *
 * Normally we expect at most a single fence, the exception being bo's
 * shared between contexts.
 */
static void
cleanup_fences(struct fd_bo *bo, bool expired)
{
   simple_mtx_assert_locked(&table_lock);

   for (int i = 0; i < bo->nr_fences; i++) {
      struct fd_bo_fence *f = &bo->fences[i];

      if (expired && fd_fence_before(f->pipe->control->fence, f->fence))
         continue;

      fd_pipe_del_locked(f->pipe);
      bo->nr_fences--;

      if (bo->nr_fences > 0) {
         /* Shuffle up the last entry to replace the current slot: */
         bo->fences[i] = bo->fences[bo->nr_fences];
         i--;
      }
   }
}

/* Called under table_lock */
void
bo_del(struct fd_bo *bo)
@@ -302,6 +331,9 @@ bo_del(struct fd_bo *bo)
   simple_mtx_assert_locked(&table_lock);

   cleanup_fences(bo, false);
   free(bo->fences);

   if (bo->map)
      os_munmap(bo->map, bo->size);
@@ -340,6 +372,7 @@ fd_bo_get_name(struct fd_bo *bo, uint32_t *name)
      set_name(bo, req.name);
      simple_mtx_unlock(&table_lock);
      bo->bo_reuse = NO_CACHE;
      bo->shared = true;
   }

   *name = bo->name;
@@ -351,6 +384,7 @@ uint32_t
fd_bo_handle(struct fd_bo *bo)
{
   bo->bo_reuse = NO_CACHE;
   bo->shared = true;
   return bo->handle;
}
@@ -366,6 +400,7 @@ fd_bo_dmabuf(struct fd_bo *bo)
   }

   bo->bo_reuse = NO_CACHE;
   bo->shared = true;

   return prime_fd;
}
@@ -402,11 +437,73 @@ fd_bo_map(struct fd_bo *bo)
int
fd_bo_cpu_prep(struct fd_bo *bo, struct fd_pipe *pipe, uint32_t op)
{
   if (op & FD_BO_PREP_NOSYNC) {
      simple_mtx_lock(&table_lock);
      enum fd_bo_state state = fd_bo_state(bo);
      simple_mtx_unlock(&table_lock);

      switch (state) {
      case FD_BO_STATE_IDLE:
         return 0;
      case FD_BO_STATE_BUSY:
         return -EBUSY;
      case FD_BO_STATE_UNKNOWN:
         break;
      }
   }

   return bo->funcs->cpu_prep(bo, pipe, op);
}

void
fd_bo_cpu_fini(struct fd_bo *bo)
{
   bo->funcs->cpu_fini(bo);
   // TODO until we have cached buffers, the kernel side ioctl does nothing,
   // so just skip it. When we have cached buffers, we can make the
   // ioctl conditional
   //   bo->funcs->cpu_fini(bo);
}

void
fd_bo_add_fence(struct fd_bo *bo, struct fd_pipe *pipe, uint32_t fence)
{
   simple_mtx_assert_locked(&table_lock);

   if (bo->nosync)
      return;

   /* The common case is bo re-used on the same pipe it had previously
    * been used on:
    */
   for (int i = 0; i < bo->nr_fences; i++) {
      struct fd_bo_fence *f = &bo->fences[i];

      if (f->pipe == pipe) {
         assert(fd_fence_before(f->fence, fence));
         f->fence = fence;
         return;
      }
   }

   cleanup_fences(bo, true);

   APPEND(bo, fences, (struct fd_bo_fence){
      .pipe = fd_pipe_ref_locked(pipe),
      .fence = fence,
   });
}

enum fd_bo_state
fd_bo_state(struct fd_bo *bo)
{
   simple_mtx_assert_locked(&table_lock);

   cleanup_fences(bo, true);

   if (bo->shared || bo->nosync)
      return FD_BO_STATE_UNKNOWN;

   if (!bo->nr_fences)
      return FD_BO_STATE_IDLE;

   return FD_BO_STATE_BUSY;
}

@@ -122,14 +122,6 @@ get_bucket(struct fd_bo_cache *cache, uint32_t size)
   return NULL;
}

static int
is_idle(struct fd_bo *bo)
{
   return fd_bo_cpu_prep(bo, NULL,
                         FD_BO_PREP_READ | FD_BO_PREP_WRITE |
                         FD_BO_PREP_NOSYNC) == 0;
}

static struct fd_bo *
find_in_bucket(struct fd_bo_bucket *bucket, uint32_t flags)
{
@@ -146,7 +138,7 @@ find_in_bucket(struct fd_bo_bucket *bucket, uint32_t flags)
   if (!list_is_empty(&bucket->list)) {
      bo = LIST_ENTRY(struct fd_bo, bucket->list.next, list);
      /* TODO check for compatible flags? */
      if (is_idle(bo)) {
      if (fd_bo_state(bo) == FD_BO_STATE_IDLE) {
         list_del(&bo->list);
      } else {
         bo = NULL;

@@ -63,6 +63,22 @@ enum fd_param_id {
   FD_GLOBAL_FAULTS, /* # of global (all context) faults */
};

/**
 * Helper for fence/seqno comparisons which deals properly with rollover.
 * Returns true if fence 'a' is before fence 'b'.
 */
static inline bool
fd_fence_before(uint32_t a, uint32_t b)
{
   return (int32_t)(a - b) < 0;
}

static inline bool
fd_fence_after(uint32_t a, uint32_t b)
{
   return (int32_t)(a - b) > 0;
}

/* bo flags: */
#define FD_BO_GPUREADONLY BITSET_BIT(1)
#define FD_BO_SCANOUT BITSET_BIT(2)
@@ -106,6 +122,7 @@ struct fd_pipe *fd_pipe_new(struct fd_device *dev, enum fd_pipe_id id);
struct fd_pipe *fd_pipe_new2(struct fd_device *dev, enum fd_pipe_id id,
                             uint32_t prio);
struct fd_pipe *fd_pipe_ref(struct fd_pipe *pipe);
struct fd_pipe *fd_pipe_ref_locked(struct fd_pipe *pipe);
void fd_pipe_del(struct fd_pipe *pipe);
int fd_pipe_get_param(struct fd_pipe *pipe, enum fd_param_id param,
                      uint64_t *value);
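
A small standalone illustration (not part of the change) of why these helpers use a signed delta rather than a plain '<': after seqno rollover, the numerically smaller value can be the newer fence. The values below are arbitrary, and the comparison only stays meaningful while the two seqnos are within 2^31 of each other:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static inline bool
fence_before(uint32_t a, uint32_t b)   /* same expression as fd_fence_before() */
{
   return (int32_t)(a - b) < 0;
}

int
main(void)
{
   /* Ordinary case: seqno 100 was emitted before seqno 200. */
   assert(fence_before(100, 200));

   /* Rollover case: the counter wrapped from 0xfffffffe to 2.  A naive
    * 'a < b' would call 2 the older fence, but the signed delta
    * (int32_t)(0xfffffffeu - 2u) == -4 correctly orders it first.
    */
   assert(fence_before(0xfffffffeu, 2));
   assert(!fence_before(2, 0xfffffffeu));

   return 0;
}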

@@ -60,6 +60,19 @@ fd_pipe_new2(struct fd_device *dev, enum fd_pipe_id id, uint32_t prio)
   fd_pipe_get_param(pipe, FD_GPU_ID, &val);
   pipe->gpu_id = val;

   pipe->control_mem = fd_bo_new(dev, sizeof(*pipe->control),
                                 0, "pipe-control");
   pipe->control = fd_bo_map(pipe->control_mem);

   /* We don't want the control_mem bo to hold a reference back to the pipe
    * itself, so disable userspace fencing.  This also means that we won't
    * be able to determine if the buffer is idle, which is needed by the
    * bo-cache.  But pipe creation/destruction is not a high frequency
    * event, so just disable the bo-cache as well:
    */
   pipe->control_mem->nosync = true;
   pipe->control_mem->bo_reuse = NO_CACHE;

   return pipe;
}
@@ -72,16 +85,26 @@ fd_pipe_new(struct fd_device *dev, enum fd_pipe_id id)
struct fd_pipe *
fd_pipe_ref(struct fd_pipe *pipe)
{
   p_atomic_inc(&pipe->refcnt);
   simple_mtx_lock(&table_lock);
   fd_pipe_ref_locked(pipe);
   simple_mtx_unlock(&table_lock);
   return pipe;
}

struct fd_pipe *
fd_pipe_ref_locked(struct fd_pipe *pipe)
{
   simple_mtx_assert_locked(&table_lock);
   pipe->refcnt++;
   return pipe;
}

void
fd_pipe_del(struct fd_pipe *pipe)
{
   if (!p_atomic_dec_zero(&pipe->refcnt))
      return;
   pipe->funcs->destroy(pipe);
   simple_mtx_lock(&table_lock);
   fd_pipe_del_locked(pipe);
   simple_mtx_unlock(&table_lock);
}

void
@@ -90,6 +113,7 @@ fd_pipe_del_locked(struct fd_pipe *pipe)
   simple_mtx_assert_locked(&table_lock);
   if (!p_atomic_dec_zero(&pipe->refcnt))
      return;
   fd_bo_del_locked(pipe->control_mem);
   pipe->funcs->destroy(pipe);
}
@@ -108,5 +132,26 @@ fd_pipe_wait(struct fd_pipe *pipe, uint32_t timestamp)
int
fd_pipe_wait_timeout(struct fd_pipe *pipe, uint32_t timestamp, uint64_t timeout)
{
   return pipe->funcs->wait(pipe, timestamp, timeout);
}

uint32_t
fd_pipe_emit_fence(struct fd_pipe *pipe, struct fd_ringbuffer *ring)
{
   uint32_t fence = ++pipe->last_fence;

   if (pipe->gpu_id >= 500) {
      OUT_PKT7(ring, CP_EVENT_WRITE, 4);
      OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
      OUT_RELOC(ring, control_ptr(pipe, fence)); /* ADDR_LO/HI */
      OUT_RING(ring, fence);
   } else {
      OUT_PKT3(ring, CP_EVENT_WRITE, 3);
      OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
      OUT_RELOC(ring, control_ptr(pipe, fence)); /* ADDR */
      OUT_RING(ring, fence);
   }

   return fence;
}
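
For context (illustration only, not added by this commit): the seqno written by CP_EVENT_WRITE above lands in the pipe's mapped control page, which is what the CPU-side checks in cleanup_fences()/fd_bo_state() read back. A hypothetical helper making that read explicit, using only fields introduced in this series:

/* Hypothetical, for illustration: has the GPU already written 'fence'
 * (or a later seqno) to the control page?  'control' is volatile, so the
 * GPU-written value is re-read on every call.  This matches the expired
 * check in cleanup_fences(); actual blocking waits still go through
 * fd_pipe_wait() and the kernel.
 */
static bool
fence_has_signaled(struct fd_pipe *pipe, uint32_t fence)
{
   return !fd_fence_before(pipe->control->fence, fence);
}

In other words, a fence recorded on a bo via fd_bo_add_fence() is considered expired once the pipe's control->fence has caught up to it.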

@@ -151,14 +151,39 @@ struct fd_pipe_funcs {
   void (*destroy)(struct fd_pipe *pipe);
};

struct fd_pipe_control {
   uint32_t fence;
};

#define control_ptr(pipe, member) \
   (pipe)->control_mem, offsetof(struct fd_pipe_control, member), 0, 0

struct fd_pipe {
   struct fd_device *dev;
   enum fd_pipe_id id;
   uint32_t gpu_id;

   /**
    * Note: refcnt is *not* atomic, but protected by table_lock, since
    * table_lock is already held in fd_bo_add_fence(), which is the hotpath.
    */
   int32_t refcnt;

   /**
    * Previous fence seqno allocated for this pipe.  The fd_pipe represents
    * a single timeline; fences allocated by this pipe can be compared to
    * each other, but fences from different pipes are not comparable (as
    * there could be preemption of multiple priority-level submitqueues at
    * play).
    */
   uint32_t last_fence;

   struct fd_bo *control_mem;
   volatile struct fd_pipe_control *control;

   const struct fd_pipe_funcs *funcs;
};

uint32_t fd_pipe_emit_fence(struct fd_pipe *pipe, struct fd_ringbuffer *ring);

struct fd_submit_funcs {
   struct fd_ringbuffer *(*new_ringbuffer)(struct fd_submit *submit,
                                           uint32_t size,
@@ -173,6 +198,7 @@ struct fd_submit {
   const struct fd_submit_funcs *funcs;

   struct fd_ringbuffer *primary;
   uint32_t fence;
};

struct fd_bo_funcs {
@@ -185,6 +211,15 @@ struct fd_bo_funcs {
   void (*destroy)(struct fd_bo *bo);
};

struct fd_bo_fence {
   /* For non-shared buffers, track the last pipe the buffer was active
    * on, and the per-pipe fence value that indicates when the buffer is
    * idle:
    */
   uint32_t fence;
   struct fd_pipe *pipe;
};

struct fd_bo {
   struct fd_device *dev;
   uint32_t size;
@@ -200,12 +235,35 @@ struct fd_bo {
      NO_CACHE = 0,
      BO_CACHE = 1,
      RING_CACHE = 2,
   } bo_reuse;
   } bo_reuse : 2;

   /* Buffers that are shared (imported or exported) may be used in
    * other processes, so we need to fall back to the kernel to determine
    * busyness.
    */
   bool shared : 1;

   /* We need to be able to disable userspace fence synchronization for
    * special internal buffers, namely the pipe->control buffer, to avoid
    * a circular reference loop.
    */
   bool nosync : 1;

   struct list_head list;   /* bucket-list entry */
   time_t free_time;        /* time when added to bucket-list */

   DECLARE_ARRAY(struct fd_bo_fence, fences);
};

void fd_bo_add_fence(struct fd_bo *bo, struct fd_pipe *pipe, uint32_t fence);

enum fd_bo_state {
   FD_BO_STATE_IDLE,
   FD_BO_STATE_BUSY,
   FD_BO_STATE_UNKNOWN,
};

enum fd_bo_state fd_bo_state(struct fd_bo *bo);

struct fd_bo *fd_bo_new_ring(struct fd_device *dev, uint32_t size);

#define enable_debug 0 /* TODO make dynamic */

@@ -48,7 +48,7 @@ int
fd_submit_flush(struct fd_submit *submit, int in_fence_fd, int *out_fence_fd,
                uint32_t *out_fence)
{
   debug_assert(submit->primary);

   submit->fence = fd_pipe_emit_fence(submit->pipe, submit->primary);

   return submit->funcs->flush(submit, in_fence_fd, out_fence_fd, out_fence);
}

@@ -343,6 +343,12 @@ msm_submit_flush(struct fd_submit *submit, int in_fence_fd, int *out_fence_fd,
      }
   }

   simple_mtx_lock(&table_lock);
   for (unsigned j = 0; j < msm_submit->nr_bos; j++) {
      fd_bo_add_fence(msm_submit->bos[j], submit->pipe, submit->fence);
   }
   simple_mtx_unlock(&table_lock);

   if (in_fence_fd != -1) {
      req.flags |= MSM_SUBMIT_FENCE_FD_IN | MSM_SUBMIT_NO_IMPLICIT;
      req.fence_fd = in_fence_fd;

@@ -256,11 +256,16 @@ msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
   } else {
      submit_bos = malloc(msm_submit->nr_bos * sizeof(submit_bos[0]));
   }

   simple_mtx_lock(&table_lock);
   for (unsigned i = 0; i < msm_submit->nr_bos; i++) {
      submit_bos[i].flags = msm_submit->bos[i]->flags;
      submit_bos[i].handle = msm_submit->bos[i]->handle;
      submit_bos[i].presumed = 0;

      fd_bo_add_fence(msm_submit->bos[i], submit->pipe, submit->fence);
   }
   simple_mtx_unlock(&table_lock);

   req.bos = VOID2U64(submit_bos), req.nr_bos = msm_submit->nr_bos;
   req.cmds = VOID2U64(cmds), req.nr_cmds = primary->u.nr_cmds;