iris: switch to explicit busy tracking

We're moving away from relying on the kernel's implicit busy tracking
and towards doing our own tracking, except for shared buffers.

Not only should this not hurt now (it doesn't, according to my
measurements), but when we switch to vm_bind we will be able to cut
significant overhead by simply omitting all the async buffers from the
execbuf ioctl.

v2:
- Change iris_bo_busy() to bool (Ken).
- Fix coding style issues (Ken).
- Rebase on not having the refcount _inc and _dec helpers anymore
  (Ken).

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/4748
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12363>
Paulo Zanoni 2021-08-05 12:23:04 -07:00 committed by Marge Bot
parent d1c27d214b
commit 89a34cb845
4 changed files with 267 additions and 23 deletions
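
For context while reading the hunks below: "is this BO busy?" now means "are any of the syncobjs recorded in bo->deps still unsignaled?", answered with DRM_IOCTL_SYNCOBJ_WAIT and an immediate timeout. A minimal standalone sketch of that kernel interface, using plain libdrm rather than the iris wrappers (the name syncobjs_busy and the drm_fd parameter are illustrative, not from the patch):

/* Minimal sketch (not part of the patch): poll a set of DRM syncobj
 * handles.  An absolute deadline equal to "now" turns the wait into a
 * pure busy check; iris_bo_wait_syncobj() below does the same thing
 * with the syncobjs recorded in bo->deps.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <xf86drm.h>   /* drmIoctl() */
#include <drm.h>       /* struct drm_syncobj_wait, DRM_IOCTL_SYNCOBJ_WAIT */

static bool
syncobjs_busy(int drm_fd, const uint32_t *handles, uint32_t count,
              int64_t now_abs_ns)
{
   struct drm_syncobj_wait args = {
      .handles = (uintptr_t) handles,
      .timeout_nsec = now_abs_ns,   /* absolute CLOCK_MONOTONIC deadline */
      .count_handles = count,
      .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
   };

   if (count == 0)
      return false;

   /* ETIME from the kernel means at least one handle is still unsignaled. */
   return drmIoctl(drm_fd, DRM_IOCTL_SYNCOBJ_WAIT, &args) != 0 &&
          errno == ETIME;
}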


@ -266,8 +266,15 @@ ensure_exec_obj_space(struct iris_batch *batch, uint32_t count)
static void
add_bo_to_batch(struct iris_batch *batch, struct iris_bo *bo, bool writable)
{
uint64_t extra_flags = 0;
assert(batch->exec_array_size > batch->exec_count);
if (writable)
extra_flags |= EXEC_OBJECT_WRITE;
if (!iris_bo_is_external(bo))
extra_flags |= EXEC_OBJECT_ASYNC;
iris_bo_reference(bo);
batch->exec_bos[batch->exec_count] = bo;
@ -276,7 +283,7 @@ add_bo_to_batch(struct iris_batch *batch, struct iris_bo *bo, bool writable)
(struct drm_i915_gem_exec_object2) {
.handle = bo->gem_handle,
.offset = bo->address,
.flags = bo->kflags | (writable ? EXEC_OBJECT_WRITE : 0),
.flags = bo->kflags | extra_flags,
};
bo->index = batch->exec_count;
@ -346,12 +353,8 @@ iris_use_pinned_bo(struct iris_batch *batch,
* we want to avoid synchronizing in this case.
*/
if (other_entry &&
((other_entry->flags & EXEC_OBJECT_WRITE) || writable)) {
((other_entry->flags & EXEC_OBJECT_WRITE) || writable))
iris_batch_flush(batch->other_batches[b]);
iris_batch_add_syncobj(batch,
batch->other_batches[b]->last_fence->syncobj,
I915_EXEC_FENCE_WAIT);
}
}
}
@ -627,6 +630,123 @@ iris_batch_check_for_reset(struct iris_batch *batch)
return status;
}
static void
move_syncobj_to_batch(struct iris_batch *batch,
struct iris_syncobj **p_syncobj,
unsigned flags)
{
struct iris_bufmgr *bufmgr = batch->screen->bufmgr;
if (!*p_syncobj)
return;
bool found = false;
util_dynarray_foreach(&batch->syncobjs, struct iris_syncobj *, s) {
if (*p_syncobj == *s) {
found = true;
break;
}
}
if (!found)
iris_batch_add_syncobj(batch, *p_syncobj, flags);
iris_syncobj_reference(bufmgr, p_syncobj, NULL);
}
static void
update_bo_syncobjs(struct iris_batch *batch, struct iris_bo *bo, bool write)
{
struct iris_screen *screen = batch->screen;
struct iris_bufmgr *bufmgr = screen->bufmgr;
/* Make sure bo->deps is big enough */
if (screen->id >= bo->deps_size) {
int new_size = screen->id + 1;
bo->deps = realloc(bo->deps, new_size * sizeof(bo->deps[0]));
memset(&bo->deps[bo->deps_size], 0,
sizeof(bo->deps[0]) * (new_size - bo->deps_size));
bo->deps_size = new_size;
}
/* When it comes to execbuf submission of non-shared buffers, we only need
* to care about the reads and writes done by the other batches of our own
* screen, and we also don't care about the reads and writes done by our
* own batch, although we need to track them. Just note that other places of
* our code may need to care about all the operations done by every batch
* on every screen.
*/
struct iris_bo_screen_deps *deps = &bo->deps[screen->id];
int batch_idx = batch->name;
#if IRIS_BATCH_COUNT == 2
/* Due to the above, we exploit the fact that IRIS_BATCH_COUNT is actually
* 2, which means there's only one other batch we need to care about.
*/
int other_batch_idx = 1 - batch_idx;
#else
/* For IRIS_BATCH_COUNT == 3 we can do:
* int other_batch_idxs[IRIS_BATCH_COUNT - 1] = {
* (batch_idx ^ 1) & 1,
* (batch_idx ^ 2) & 2,
* };
* For IRIS_BATCH_COUNT == 4 we can do:
* int other_batch_idxs[IRIS_BATCH_COUNT - 1] = {
* (batch_idx + 1) & 3,
* (batch_idx + 2) & 3,
* (batch_idx + 3) & 3,
* };
*/
#error "Implement me."
#endif
/* If it is being written to by others, wait on it. */
if (deps->write_syncobjs[other_batch_idx])
move_syncobj_to_batch(batch, &deps->write_syncobjs[other_batch_idx],
I915_EXEC_FENCE_WAIT);
struct iris_syncobj *batch_syncobj = iris_batch_get_signal_syncobj(batch);
if (write) {
/* If we're writing to it, set our batch's syncobj as write_syncobj so
* others can wait on us. Also wait every reader we care about before
* writing.
*/
iris_syncobj_reference(bufmgr, &deps->write_syncobjs[batch_idx],
batch_syncobj);
move_syncobj_to_batch(batch, &deps->read_syncobjs[other_batch_idx],
I915_EXEC_FENCE_WAIT);
} else {
/* If we're reading, replace the other read from our batch index. */
iris_syncobj_reference(bufmgr, &deps->read_syncobjs[batch_idx],
batch_syncobj);
}
}
static void
update_batch_syncobjs(struct iris_batch *batch)
{
struct iris_bufmgr *bufmgr = batch->screen->bufmgr;
simple_mtx_t *bo_deps_lock = iris_bufmgr_get_bo_deps_lock(bufmgr);
simple_mtx_lock(bo_deps_lock);
for (int i = 0; i < batch->exec_count; i++) {
struct iris_bo *bo = batch->exec_bos[i];
struct drm_i915_gem_exec_object2 *exec_obj = &batch->validation_list[i];
bool write = exec_obj->flags & EXEC_OBJECT_WRITE;
if (bo == batch->screen->workaround_bo)
continue;
update_bo_syncobjs(batch, bo, write);
}
simple_mtx_unlock(bo_deps_lock);
}
/**
* Submit the batch to the GPU via execbuffer2.
*/
@ -711,6 +831,8 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line)
iris_finish_batch(batch);
update_batch_syncobjs(batch);
if (INTEL_DEBUG & (DEBUG_BATCH | DEBUG_SUBMIT | DEBUG_PIPE_CONTROL)) {
const char *basefile = strstr(file, "iris/");
if (basefile)


@ -56,8 +56,6 @@ enum iris_batch_name {
IRIS_BATCH_COMPUTE,
};
#define IRIS_BATCH_COUNT 2
struct iris_batch {
struct iris_context *ice;
struct iris_screen *screen;


@ -181,6 +181,7 @@ struct iris_bufmgr {
int fd;
simple_mtx_t lock;
simple_mtx_t bo_deps_lock;
/** Array of lists of cached gem objects of power-of-two sizes */
struct bo_cache_bucket cache_bucket[14 * 4];
@ -381,20 +382,100 @@ vma_free(struct iris_bufmgr *bufmgr,
util_vma_heap_free(&bufmgr->vma_allocator[memzone], address, size);
}
int
iris_bo_busy(struct iris_bo *bo)
static bool
iris_bo_busy_gem(struct iris_bo *bo)
{
struct iris_bufmgr *bufmgr = bo->bufmgr;
struct drm_i915_gem_busy busy = { .handle = bo->gem_handle };
int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_BUSY, &busy);
if (ret == 0) {
bo->idle = !busy.busy;
return busy.busy;
}
return false;
}
/* A timeout of 0 just checks for busyness. */
static int
iris_bo_wait_syncobj(struct iris_bo *bo, int64_t timeout_ns)
{
int ret = 0;
struct iris_bufmgr *bufmgr = bo->bufmgr;
/* If we know it's idle, don't bother with the kernel round trip */
if (bo->idle)
return 0;
simple_mtx_lock(&bufmgr->bo_deps_lock);
uint32_t handles[bo->deps_size * IRIS_BATCH_COUNT * 2];
int handle_count = 0;
for (int d = 0; d < bo->deps_size; d++) {
for (int b = 0; b < IRIS_BATCH_COUNT; b++) {
struct iris_syncobj *r = bo->deps[d].read_syncobjs[b];
struct iris_syncobj *w = bo->deps[d].write_syncobjs[b];
if (r)
handles[handle_count++] = r->handle;
if (w)
handles[handle_count++] = w->handle;
}
}
if (handle_count == 0)
goto out;
/* Unlike the gem wait, negative values are not infinite here. */
int64_t timeout_abs = os_time_get_absolute_timeout(timeout_ns);
if (timeout_abs < 0)
timeout_abs = INT64_MAX;
struct drm_syncobj_wait args = {
.handles = (uintptr_t) handles,
.timeout_nsec = timeout_abs,
.count_handles = handle_count,
.flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
};
ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_SYNCOBJ_WAIT, &args);
if (ret != 0) {
ret = -errno;
goto out;
}
/* We just waited on everything, so clear all the deps. */
for (int d = 0; d < bo->deps_size; d++) {
for (int b = 0; b < IRIS_BATCH_COUNT; b++) {
iris_syncobj_reference(bufmgr, &bo->deps[d].write_syncobjs[b], NULL);
iris_syncobj_reference(bufmgr, &bo->deps[d].read_syncobjs[b], NULL);
}
}
out:
simple_mtx_unlock(&bufmgr->bo_deps_lock);
return ret;
}
static bool
iris_bo_busy_syncobj(struct iris_bo *bo)
{
return iris_bo_wait_syncobj(bo, 0) == -ETIME;
}
bool
iris_bo_busy(struct iris_bo *bo)
{
bool busy;
if (iris_bo_is_external(bo))
busy = iris_bo_busy_gem(bo);
else
busy = iris_bo_busy_syncobj(bo);
bo->idle = !busy;
return busy;
}
int
iris_bo_madvise(struct iris_bo *bo, int state)
{
@ -865,6 +946,14 @@ bo_close(struct iris_bo *bo)
/* Return the VMA for reuse */
vma_free(bo->bufmgr, bo->address, bo->size);
for (int d = 0; d < bo->deps_size; d++) {
for (int b = 0; b < IRIS_BATCH_COUNT; b++) {
iris_syncobj_reference(bufmgr, &bo->deps[d].write_syncobjs[b], NULL);
iris_syncobj_reference(bufmgr, &bo->deps[d].read_syncobjs[b], NULL);
}
}
free(bo->deps);
free(bo);
}
@ -1149,6 +1238,22 @@ iris_bo_wait_rendering(struct iris_bo *bo)
iris_bo_wait(bo, -1);
}
static int
iris_bo_wait_gem(struct iris_bo *bo, int64_t timeout_ns)
{
struct iris_bufmgr *bufmgr = bo->bufmgr;
struct drm_i915_gem_wait wait = {
.bo_handle = bo->gem_handle,
.timeout_ns = timeout_ns,
};
int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
if (ret != 0)
return -errno;
return 0;
}
/**
* Waits on a BO for the given amount of time.
*
@ -1179,17 +1284,13 @@ iris_bo_wait_rendering(struct iris_bo *bo)
int
iris_bo_wait(struct iris_bo *bo, int64_t timeout_ns)
{
struct iris_bufmgr *bufmgr = bo->bufmgr;
int ret;
/* If we know it's idle, don't bother with the kernel round trip */
if (bo->idle && !iris_bo_is_external(bo))
return 0;
if (iris_bo_is_external(bo))
ret = iris_bo_wait_gem(bo, timeout_ns);
else
ret = iris_bo_wait_syncobj(bo, timeout_ns);
struct drm_i915_gem_wait wait = {
.bo_handle = bo->gem_handle,
.timeout_ns = timeout_ns,
};
int ret = intel_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_WAIT, &wait);
if (ret != 0)
return -errno;
@ -1208,6 +1309,7 @@ iris_bufmgr_destroy(struct iris_bufmgr *bufmgr)
bufmgr->aux_map_ctx = NULL;
simple_mtx_destroy(&bufmgr->lock);
simple_mtx_destroy(&bufmgr->bo_deps_lock);
/* Free any cached buffer objects we were going to reuse */
for (int i = 0; i < bufmgr->num_buckets; i++) {
@ -1786,6 +1888,7 @@ iris_bufmgr_create(struct intel_device_info *devinfo, int fd, bool bo_reuse)
p_atomic_set(&bufmgr->refcount, 1);
simple_mtx_init(&bufmgr->lock, mtx_plain);
simple_mtx_init(&bufmgr->bo_deps_lock, mtx_plain);
list_inithead(&bufmgr->zombie_list);
@ -1924,3 +2027,9 @@ iris_bufmgr_get_aux_map_context(struct iris_bufmgr *bufmgr)
{
return bufmgr->aux_map_ctx;
}
simple_mtx_t *
iris_bufmgr_get_bo_deps_lock(struct iris_bufmgr *bufmgr)
{
return &bufmgr->bo_deps_lock;
}
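
The exported lock above guards every access to bo->deps. A hypothetical helper (not part of the patch) showing the expected pattern — take bo_deps_lock, drop all recorded syncobj references, unlock — mirroring what update_batch_syncobjs() and iris_bo_wait_syncobj() do above:

static void
clear_bo_deps(struct iris_bufmgr *bufmgr, struct iris_bo *bo)
{
   simple_mtx_t *bo_deps_lock = iris_bufmgr_get_bo_deps_lock(bufmgr);

   simple_mtx_lock(bo_deps_lock);
   /* Drop every read/write syncobj reference this BO is holding. */
   for (int d = 0; d < bo->deps_size; d++) {
      for (int b = 0; b < IRIS_BATCH_COUNT; b++) {
         iris_syncobj_reference(bufmgr, &bo->deps[d].write_syncobjs[b], NULL);
         iris_syncobj_reference(bufmgr, &bo->deps[d].read_syncobjs[b], NULL);
      }
   }
   simple_mtx_unlock(bo_deps_lock);
}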


@ -31,13 +31,15 @@
#include "c11/threads.h"
#include "util/macros.h"
#include "util/u_atomic.h"
#include "util/u_dynarray.h"
#include "util/list.h"
#include "util/simple_mtx.h"
#include "pipe/p_defines.h"
struct iris_batch;
struct intel_device_info;
struct pipe_debug_callback;
struct isl_surf;
struct iris_syncobj;
/**
* Memory zones. When allocating a buffer, you can request that it is
@ -129,6 +131,13 @@ enum iris_mmap_mode {
IRIS_MMAP_WB, /**< Write-back mapping with CPU caches enabled */
};
#define IRIS_BATCH_COUNT 2
struct iris_bo_screen_deps {
struct iris_syncobj *write_syncobjs[IRIS_BATCH_COUNT];
struct iris_syncobj *read_syncobjs[IRIS_BATCH_COUNT];
};
struct iris_bo {
/**
* Size in bytes of the buffer object.
@ -213,6 +222,10 @@ struct iris_bo {
*/
uint64_t last_seqnos[NUM_IRIS_DOMAINS] __attribute__ ((aligned (8)));
/** Up to one per screen, may need realloc. */
struct iris_bo_screen_deps *deps;
int deps_size;
/**
* Boolean of whether the GPU is definitely not accessing the buffer.
*
@ -346,10 +359,10 @@ iris_bo_is_external(const struct iris_bo *bo)
void iris_bo_mark_exported(struct iris_bo *bo);
/**
* Returns 1 if mapping the buffer for write could cause the process
* Returns true if mapping the buffer for write could cause the process
* to block, due to the object being active in the GPU.
*/
int iris_bo_busy(struct iris_bo *bo);
bool iris_bo_busy(struct iris_bo *bo);
/**
* Specify the volatility of the buffer.
@ -451,4 +464,6 @@ enum iris_memory_zone iris_memzone_for_address(uint64_t address);
int iris_bufmgr_create_screen_id(struct iris_bufmgr *bufmgr);
simple_mtx_t *iris_bufmgr_get_bo_deps_lock(struct iris_bufmgr *bufmgr);
#endif /* IRIS_BUFMGR_H */
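
To tie the new fields together, here is a hypothetical helper (not in the patch) showing how the per-screen deps array declared above is grown on demand and indexed by screen id, following the realloc in update_bo_syncobjs() from iris_batch.c:

static struct iris_bo_screen_deps *
bo_deps_for_screen(struct iris_bo *bo, int screen_id)
{
   /* Grow bo->deps so this screen has a slot, zeroing the new tail
    * (allocation-failure handling omitted, as in update_bo_syncobjs()).
    */
   if (screen_id >= bo->deps_size) {
      int new_size = screen_id + 1;
      bo->deps = realloc(bo->deps, new_size * sizeof(bo->deps[0]));
      memset(&bo->deps[bo->deps_size], 0,
             sizeof(bo->deps[0]) * (new_size - bo->deps_size));
      bo->deps_size = new_size;
   }
   return &bo->deps[screen_id];
}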