freedreno/drm/virtio: Use userspace IOVA allocation

If supported by the host virglrenderer and host kernel, use
userspace-allocated GPU virtual addresses.  This lets us avoid stalling
to wait for a response from the host kernel until we actually need to
know the host handle (which is usually not until submit time).

Handling the async response from the host to get the host_handle is
done through the submit_queue, so that in the (hot) submit path we do
not need any additional synchronization to know that the host_handle
is valid.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16086>
Author: Rob Clark, 2022-03-22 16:14:19 -07:00 (committed by Marge Bot)
Parent: ae01c27ac0
Commit: e6b2785811
10 changed files with 341 additions and 35 deletions
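Before the per-file hunks, here is a self-contained sketch (plain pthreads, not driver code) of the pattern the commit message describes: the iova is chosen locally and is usable immediately, while the host_handle is resolved asynchronously and only waited on by the rare path that needs it early.

```c
#include <inttypes.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct sketch_bo {
   uint64_t iova;          /* chosen locally; usable immediately */
   uint32_t host_handle;   /* filled in asynchronously           */
   int resolved;
   pthread_mutex_t lock;
   pthread_cond_t ready;
};

/* Stands in for the submit_queue job that parses the host's GEM_NEW response. */
static void *
resolve_host_handle(void *arg)
{
   struct sketch_bo *bo = arg;

   pthread_mutex_lock(&bo->lock);
   bo->host_handle = 42;            /* pretend this came from the host */
   bo->resolved = 1;
   pthread_cond_signal(&bo->ready);
   pthread_mutex_unlock(&bo->lock);
   return NULL;
}

/* Analogous to virtio_bo_host_handle(): block only when the host handle is
 * actually needed, which is normally not until submit time.
 */
static uint32_t
get_host_handle(struct sketch_bo *bo)
{
   pthread_mutex_lock(&bo->lock);
   while (!bo->resolved)
      pthread_cond_wait(&bo->ready, &bo->lock);
   pthread_mutex_unlock(&bo->lock);
   return bo->host_handle;
}

int
main(void)
{
   struct sketch_bo bo = { .iova = 0x100000 };
   pthread_t thread;

   pthread_mutex_init(&bo.lock, NULL);
   pthread_cond_init(&bo.ready, NULL);
   pthread_create(&thread, NULL, resolve_host_handle, &bo);

   printf("iova 0x%" PRIx64 " is usable now, host_handle %u arrives later\n",
          bo.iova, get_host_handle(&bo));

   pthread_join(&thread, NULL);
   return 0;
}
```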


@@ -82,6 +82,10 @@ struct drm_msm_timespec {
#define MSM_PARAM_FAULTS 0x09 /* RO */
#define MSM_PARAM_SUSPENDS 0x0a /* RO */
#define MSM_PARAM_SYSPROF 0x0b /* WO: 1 preserves perfcntrs, 2 also disables suspend */
#define MSM_PARAM_COMM 0x0c /* WO: override for task->comm */
#define MSM_PARAM_CMDLINE 0x0d /* WO: override for task cmdline */
#define MSM_PARAM_VA_START 0x0e /* RO: start of valid GPU iova range */
#define MSM_PARAM_VA_SIZE 0x0f /* RO: size of valid GPU iova range (bytes) */
/* For backwards compat. The original support for preemption was based on
* a single ring per priority level so # of priority levels equals the #
@@ -95,6 +99,8 @@ struct drm_msm_param {
__u32 pipe; /* in, MSM_PIPE_x */
__u32 param; /* in, MSM_PARAM_x */
__u64 value; /* out (get_param) or in (set_param) */
__u32 len; /* zero for non-pointer params */
__u32 pad; /* must be zero */
};
/*
@@ -131,6 +137,7 @@ struct drm_msm_gem_new {
#define MSM_INFO_GET_IOVA 0x01 /* get iova, returned by value */
#define MSM_INFO_SET_NAME 0x02 /* set the debug name (by pointer) */
#define MSM_INFO_GET_NAME 0x03 /* get debug name, returned by pointer */
#define MSM_INFO_SET_IOVA 0x04 /* set the iova, passed by value */
struct drm_msm_gem_info {
__u32 handle; /* in */
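Together, the new params and the new GEM_INFO op let userspace discover the valid GPU VA range and then pin a GEM object at an address of its own choosing. A hedged usage sketch (assumes a msm_drm.h new enough to carry the defines above; error handling trimmed):

```c
#include <stdint.h>
#include <xf86drm.h>
#include <drm/msm_drm.h>   /* must carry the MSM_PARAM_VA_* / MSM_INFO_SET_IOVA additions */

/* Query the iova range the kernel reserves for userspace allocation. */
static int
get_va_range(int fd, uint64_t *start, uint64_t *size)
{
   struct drm_msm_param req = { .pipe = MSM_PIPE_3D0, .param = MSM_PARAM_VA_START };

   if (drmIoctl(fd, DRM_IOCTL_MSM_GET_PARAM, &req))
      return -1;            /* older kernel: fall back to kernel-allocated iova */
   *start = req.value;

   req.param = MSM_PARAM_VA_SIZE;
   if (drmIoctl(fd, DRM_IOCTL_MSM_GET_PARAM, &req))
      return -1;
   *size = req.value;
   return 0;
}

/* Bind a buffer to an iova chosen from that range (e.g. from a util_vma_heap). */
static int
set_bo_iova(int fd, uint32_t handle, uint64_t iova)
{
   struct drm_msm_gem_info req = {
      .handle = handle,
      .info = MSM_INFO_SET_IOVA,
      .value = iova,
   };

   return drmIoctl(fd, DRM_IOCTL_MSM_GEM_INFO, &req);
}
```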


@@ -155,13 +155,17 @@ fd_device_del_impl(struct fd_device *dev)
assert(list_is_empty(&dev->deferred_submits));
dev->funcs->destroy(dev);
if (dev->suballoc_bo)
fd_bo_del_locked(dev->suballoc_bo);
fd_bo_cache_cleanup(&dev->bo_cache, 0);
fd_bo_cache_cleanup(&dev->ring_cache, 0);
/* Needs to be after bo cache cleanup in case backend has a
* util_vma_heap that it destroys:
*/
dev->funcs->destroy(dev);
_mesa_hash_table_destroy(dev->handle_table, NULL);
_mesa_hash_table_destroy(dev->name_table, NULL);
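The destroy() call moves below the cache cleanup because, with this series, a virtio bo's destroy returns its iova to a util_vma_heap owned by the backend, and fd_bo_cache_cleanup() deletes any cached bos; the heap therefore has to outlive the cache drain. A minimal illustration of the heap lifetime involved (mesa's util/vma.h; the values are made up):

```c
#include <stdint.h>
#include "util/vma.h"

static void
vma_heap_lifetime_demo(void)
{
   struct util_vma_heap heap;

   /* Corresponds to the va_start/va_size advertised by the host/kernel: */
   util_vma_heap_init(&heap, 0x100000000ULL, 0x100000000ULL);

   /* A bo allocation takes a 4 KiB-aligned range out of the heap: */
   uint64_t iova = util_vma_heap_alloc(&heap, 0x2000, 0x1000);

   /* Tear-down mirrors fd_device_del_impl() above: every outstanding
    * allocation (including bos still sitting in the caches) must be
    * returned first...
    */
   util_vma_heap_free(&heap, iova, 0x2000);

   /* ...and only then may the backend's destroy() finish the heap. */
   util_vma_heap_finish(&heap);
}
```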


@@ -76,6 +76,10 @@ struct fd_submit_sp {
/* Used in case out_fence==NULL: */
struct util_queue_fence fence;
/* Used by retire_queue, if used by backend: */
int out_fence_fd;
struct util_queue_fence retire_fence;
flush_submit_list_fn flush_submit_list;
};
FD_DEFINE_CAST(fd_submit, fd_submit_sp);


@@ -65,6 +65,7 @@ enum msm_ccmd {
MSM_CCMD_SUBMITQUEUE_QUERY,
MSM_CCMD_WAIT_FENCE,
MSM_CCMD_SET_DEBUGINFO,
MSM_CCMD_GEM_CLOSE,
MSM_CCMD_LAST,
};
@@ -138,6 +139,8 @@ struct msm_ccmd_gem_new_req {
uint64_t size;
uint32_t flags;
uint32_t blob_id;
uint64_t iova; /* non-zero for guest userspace iova allocation */
};
DEFINE_CAST(msm_ccmd_req, msm_ccmd_gem_new_req)
@@ -161,6 +164,8 @@ struct msm_ccmd_gem_info_req {
uint32_t res_id;
uint32_t blob_mem; // TODO do we need this?
uint32_t blob_id; // TODO do we need this?
uint64_t iova; /* non-zero for guest userspace iova allocation */
};
DEFINE_CAST(msm_ccmd_req, msm_ccmd_gem_info_req)
@@ -371,4 +376,19 @@ struct msm_ccmd_set_debuginfo_req {
};
DEFINE_CAST(msm_ccmd_req, msm_ccmd_set_debuginfo_req)
/*
* MSM_CCMD_GEM_CLOSE
*
* If guest userspace allocated iova's are used, this request can be used
* to clear the vma when the guest bo is deleted.
*
* No response.
*/
struct msm_ccmd_gem_close_req {
struct msm_ccmd_req hdr;
uint32_t host_handle;
};
DEFINE_CAST(msm_ccmd_req, msm_ccmd_gem_close_req)
#endif /* MSM_PROTO_H_ */
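For context, a hedged sketch of what handling this request on the host side might look like; the helper below is a hypothetical stand-in, not virglrenderer API:

```c
/* Hypothetical host-side stand-in: drop the host GEM handle, e.g. via
 * DRM_IOCTL_GEM_CLOSE on the host drm fd.
 */
static void
sketch_host_gem_close(uint32_t host_handle)
{
   (void)host_handle;
}

static void
sketch_handle_gem_close(const struct msm_ccmd_gem_close_req *req)
{
   /* Closing the host handle releases the kernel-side vma bound to the
    * guest-chosen iova, so the guest can safely recycle that range
    * (see virtio_bo_destroy() later in this diff for the guest side).
    */
   sketch_host_gem_close(req->host_handle);

   /* "No response": nothing is written back to the guest's response buffer. */
}
```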


@@ -21,6 +21,8 @@
* SOFTWARE.
*/
#include "util/libsync.h"
#include "virtio_priv.h"
static int
@@ -102,7 +104,7 @@ virtio_bo_cpu_prep(struct fd_bo *bo, struct fd_pipe *pipe, uint32_t op)
struct msm_ccmd_gem_cpu_prep_req req = {
.hdr = MSM_CCMD(GEM_CPU_PREP, sizeof(req)),
.host_handle = to_virtio_bo(bo)->host_handle,
.host_handle = virtio_bo_host_handle(bo),
.op = op,
.timeout = 5000000000,
};
@@ -170,7 +172,7 @@ virtio_bo_set_name(struct fd_bo *bo, const char *fmt, va_list ap)
struct msm_ccmd_gem_set_name_req *req = (void *)buf;
req->hdr = MSM_CCMD(GEM_SET_NAME, req_len);
req->host_handle = to_virtio_bo(bo)->host_handle;
req->host_handle = virtio_bo_host_handle(bo);
req->len = sz;
memcpy(req->payload, name, sz);
@@ -187,7 +189,7 @@ virtio_bo_upload(struct fd_bo *bo, void *src, unsigned len)
struct msm_ccmd_gem_upload_req *req = (void *)buf;
req->hdr = MSM_CCMD(GEM_UPLOAD, req_len);
req->host_handle = to_virtio_bo(bo)->host_handle;
req->host_handle = virtio_bo_host_handle(bo);
req->pad = 0;
req->off = 0;
req->len = len;
@@ -201,6 +203,19 @@ static void
virtio_bo_destroy(struct fd_bo *bo)
{
struct virtio_bo *virtio_bo = to_virtio_bo(bo);
struct virtio_device *virtio_dev = to_virtio_device(bo->dev);
if (virtio_dev->userspace_allocates_iova && bo->iova) {
struct msm_ccmd_gem_close_req req = {
.hdr = MSM_CCMD(GEM_CLOSE, sizeof(req)),
.host_handle = virtio_bo_host_handle(bo),
};
virtio_execbuf(bo->dev, &req.hdr, false);
virtio_dev_free_iova(bo->dev, bo->iova, bo->size);
}
free(virtio_bo);
}
@@ -215,6 +230,50 @@ static const struct fd_bo_funcs funcs = {
.destroy = virtio_bo_destroy,
};
struct allocation_wait {
struct fd_bo *bo;
int fence_fd;
struct msm_ccmd_gem_new_rsp *new_rsp;
struct msm_ccmd_gem_info_rsp *info_rsp;
};
static void
allocation_wait_execute(void *job, void *gdata, int thread_index)
{
struct allocation_wait *wait = job;
struct virtio_bo *virtio_bo = to_virtio_bo(wait->bo);
sync_wait(wait->fence_fd, -1);
close(wait->fence_fd);
if (wait->new_rsp) {
virtio_bo->host_handle = wait->new_rsp->host_handle;
} else {
virtio_bo->host_handle = wait->info_rsp->host_handle;
wait->bo->size = wait->info_rsp->size;
}
fd_bo_del(wait->bo);
free(wait);
}
static void
enqueue_allocation_wait(struct fd_bo *bo, int fence_fd,
struct msm_ccmd_gem_new_rsp *new_rsp,
struct msm_ccmd_gem_info_rsp *info_rsp)
{
struct allocation_wait *wait = malloc(sizeof(*wait));
wait->bo = fd_bo_ref(bo);
wait->fence_fd = fence_fd;
wait->new_rsp = new_rsp;
wait->info_rsp = info_rsp;
util_queue_add_job(&bo->dev->submit_queue,
wait, &to_virtio_bo(bo)->fence,
allocation_wait_execute,
NULL, 0);
}
static struct fd_bo *
bo_from_handle(struct fd_device *dev, uint32_t size, uint32_t handle)
{
@@ -225,7 +284,16 @@ bo_from_handle(struct fd_device *dev, uint32_t size, uint32_t handle)
if (!virtio_bo)
return NULL;
util_queue_fence_init(&virtio_bo->fence);
bo = &virtio_bo->base;
/* Note we need to set these because allocation_wait_execute() could
* run before bo_init_common():
*/
bo->dev = dev;
p_atomic_set(&bo->refcnt, 1);
bo->size = size;
bo->funcs = &funcs;
bo->handle = handle;
@@ -239,6 +307,7 @@ bo_from_handle(struct fd_device *dev, uint32_t size, uint32_t handle)
struct fd_bo *
virtio_bo_from_handle(struct fd_device *dev, uint32_t size, uint32_t handle)
{
struct virtio_device *virtio_dev = to_virtio_device(dev);
struct fd_bo *bo = bo_from_handle(dev, size, handle);
struct drm_virtgpu_resource_info args = {
.bo_handle = handle,
@@ -255,36 +324,59 @@ virtio_bo_from_handle(struct fd_device *dev, uint32_t size, uint32_t handle)
.hdr = MSM_CCMD(GEM_INFO, sizeof(req)),
.res_id = args.res_handle,
.blob_mem = args.blob_mem,
.blob_id = p_atomic_inc_return(&to_virtio_device(dev)->next_blob_id),
.blob_id = p_atomic_inc_return(&virtio_dev->next_blob_id),
};
if (virtio_dev->userspace_allocates_iova) {
req.iova = virtio_dev_alloc_iova(dev, size);
if (!req.iova) {
ret = -ENOMEM;
goto fail;
}
}
struct msm_ccmd_gem_info_rsp *rsp =
virtio_alloc_rsp(dev, &req.hdr, sizeof(*rsp));
ret = virtio_execbuf(dev, &req.hdr, true);
if (ret) {
INFO_MSG("failed to get gem info: %s", strerror(errno));
goto fail;
}
if (rsp->ret) {
INFO_MSG("failed (on host) to get gem info: %s", strerror(rsp->ret));
goto fail;
}
struct virtio_bo *virtio_bo = to_virtio_bo(bo);
virtio_bo->blob_id = req.blob_id;
virtio_bo->host_handle = rsp->host_handle;
bo->iova = rsp->iova;
/* If the imported buffer is allocated via virgl context (for example
* minigbm/arc-cros-gralloc) then the guest gem object size is fake,
* potentially not accounting for UBWC meta data, required pitch
* alignment, etc. But in the import path the gallium driver checks
* that the size matches the minimum size based on layout. So replace
* the guest potentially-fake size with the real size from the host:
*/
bo->size = rsp->size;
if (virtio_dev->userspace_allocates_iova) {
int fence_fd;
ret = virtio_execbuf_fenced(dev, &req.hdr, -1, &fence_fd, 0);
if (ret) {
INFO_MSG("failed to get gem info: %s", strerror(errno));
goto fail;
}
bo->iova = req.iova;
enqueue_allocation_wait(bo, fence_fd, NULL, rsp);
} else {
ret = virtio_execbuf(dev, &req.hdr, true);
if (ret) {
INFO_MSG("failed to get gem info: %s", strerror(errno));
goto fail;
}
if (rsp->ret) {
INFO_MSG("failed (on host) to get gem info: %s", strerror(rsp->ret));
goto fail;
}
virtio_bo->host_handle = rsp->host_handle;
bo->iova = rsp->iova;
/* If the imported buffer is allocated via virgl context (for example
* minigbm/arc-cros-gralloc) then the guest gem object size is fake,
* potentially not accounting for UBWC meta data, required pitch
* alignment, etc. But in the import path the gallium driver checks
* that the size matches the minimum size based on layout. So replace
* the guest potentially-fake size with the real size from the host:
*/
bo->size = rsp->size;
}
return bo;
@@ -342,6 +434,14 @@ virtio_bo_new(struct fd_device *dev, uint32_t size, uint32_t flags)
req.blob_id = args.blob_id;
rsp = virtio_alloc_rsp(dev, &req.hdr, sizeof(*rsp));
if (virtio_dev->userspace_allocates_iova) {
req.iova = virtio_dev_alloc_iova(dev, size);
if (!req.iova) {
ret = -ENOMEM;
goto fail;
}
}
}
simple_mtx_lock(&virtio_dev->eb_lock);
@@ -358,19 +458,52 @@ virtio_bo_new(struct fd_device *dev, uint32_t size, uint32_t flags)
virtio_bo->blob_id = args.blob_id;
if (rsp) {
/* RESOURCE_CREATE_BLOB is async, so we need to wait for host..
* which is a bit unfortunate, but better to sync here than
* add extra code to check if we need to wait each time we
* emit a reloc.
*/
virtio_host_sync(dev, &req.hdr);
if (virtio_dev->userspace_allocates_iova) {
int fence_fd;
virtio_bo->host_handle = rsp->host_handle;
bo->iova = rsp->iova;
/* We can't get a fence fd from RESOURCE_CREATE_BLOB, so send
* a NOP packet just for that purpose:
*/
struct msm_ccmd_nop_req nop = {
.hdr = MSM_CCMD(NOP, sizeof(nop)),
};
ret = virtio_execbuf_fenced(dev, &nop.hdr, -1, &fence_fd, 0);
if (ret) {
INFO_MSG("failed to get gem info: %s", strerror(errno));
goto fail;
}
bo->iova = req.iova;
enqueue_allocation_wait(bo, fence_fd, rsp, NULL);
} else {
/* RESOURCE_CREATE_BLOB is async, so we need to wait for host..
* which is a bit unfortunate, but better to sync here than
* add extra code to check if we need to wait each time we
* emit a reloc.
*/
virtio_host_sync(dev, &req.hdr);
virtio_bo->host_handle = rsp->host_handle;
bo->iova = rsp->iova;
}
}
return bo;
fail:
if (req.iova) {
assert(virtio_dev->userspace_allocates_iova);
virtio_dev_free_iova(dev, req.iova, size);
}
return NULL;
}
uint32_t
virtio_bo_host_handle(struct fd_bo *bo)
{
struct virtio_bo *virtio_bo = to_virtio_bo(bo);
util_queue_fence_wait(&virtio_bo->fence);
return virtio_bo->host_handle;
}


@@ -34,7 +34,12 @@ static void
virtio_device_destroy(struct fd_device *dev)
{
struct virtio_device *virtio_dev = to_virtio_device(dev);
fd_bo_del_locked(virtio_dev->shmem_bo);
if (virtio_dev->userspace_allocates_iova) {
util_vma_heap_finish(&virtio_dev->address_space);
}
}
static const struct fd_device_funcs funcs = {
@@ -149,6 +154,8 @@ virtio_device_new(int fd, drmVersionPtr version)
INFO_MSG("version_minor: %u", caps.version_minor);
INFO_MSG("version_patchlevel: %u", caps.version_patchlevel);
INFO_MSG("has_cached_coherent: %u", caps.u.msm.has_cached_coherent);
INFO_MSG("va_start: 0x%0"PRIx64, caps.u.msm.va_start);
INFO_MSG("va_size: 0x%0"PRIx64, caps.u.msm.va_size);
if (caps.wire_format_version != 1) {
ERROR_MSG("Unsupported protocol version: %u", caps.wire_format_version);
@@ -188,6 +195,15 @@ virtio_device_new(int fd, drmVersionPtr version)
set_debuginfo(dev);
if (caps.u.msm.va_start && caps.u.msm.va_size) {
virtio_dev->userspace_allocates_iova = true;
util_vma_heap_init(&virtio_dev->address_space,
caps.u.msm.va_start,
caps.u.msm.va_size);
simple_mtx_init(&virtio_dev->address_space_lock, mtx_plain);
}
return dev;
}


@@ -170,6 +170,9 @@ virtio_pipe_destroy(struct fd_pipe *pipe)
{
struct virtio_pipe *virtio_pipe = to_virtio_pipe(pipe);
if (util_queue_is_initialized(&virtio_pipe->retire_queue))
util_queue_destroy(&virtio_pipe->retire_queue);
close_submitqueue(pipe, virtio_pipe->queue_id);
fd_pipe_sp_ringpool_fini(pipe);
free(virtio_pipe);
@@ -254,6 +257,11 @@ virtio_pipe_new(struct fd_device *dev, enum fd_pipe_id id, uint32_t prio)
if (!(virtio_pipe->gpu_id || virtio_pipe->chip_id))
goto fail;
if (to_virtio_device(dev)->userspace_allocates_iova) {
util_queue_init(&virtio_pipe->retire_queue, "rq", 8, 1,
UTIL_QUEUE_INIT_RESIZE_IF_FULL, NULL);
}
INFO_MSG("Pipe Info:");
INFO_MSG(" GPU-id: %d", virtio_pipe->gpu_id);
INFO_MSG(" Chip-id: 0x%016"PRIx64, virtio_pipe->chip_id);


@@ -31,6 +31,7 @@
#include "util/u_atomic.h"
#include "util/slab.h"
#include "util/timespec.h"
#include "util/vma.h"
#include "pipe/p_defines.h"
@@ -53,11 +54,57 @@ struct virtio_device {
uint32_t next_blob_id;
uint32_t next_seqno;
bool userspace_allocates_iova;
/*
* Notes on address space allocation:
*
* In both the import (GEM_INFO) and new (GEM_NEW) path we allocate
* the iova. Since the iova (vma on kernel side) is local to the
* address space, and that is 1:1 with drm fd (which is 1:1 with
* virtio_device and therefore address_space) which is not shared
* with anything outside of the driver, and because of the handle
* de-duplication, we can safely assume that an iova has not yet
* been set on imported buffers.
*
* The other complication with userspace allocated iova is that
* the kernel holds on to a reference to the bo (and the GPU is
still using its iova) until the submit retires. So a per-pipe
* retire_queue is used to hold an extra reference to the submit
* (and indirectly all the bo's referenced) until the out-fence is
* signaled.
*/
struct util_vma_heap address_space;
simple_mtx_t address_space_lock;
};
FD_DEFINE_CAST(fd_device, virtio_device);
struct fd_device *virtio_device_new(int fd, drmVersionPtr version);
static inline void
virtio_dev_free_iova(struct fd_device *dev, uint64_t iova, uint32_t size)
{
struct virtio_device *virtio_dev = to_virtio_device(dev);
simple_mtx_lock(&virtio_dev->address_space_lock);
util_vma_heap_free(&virtio_dev->address_space, iova, size);
simple_mtx_unlock(&virtio_dev->address_space_lock);
}
static inline uint64_t
virtio_dev_alloc_iova(struct fd_device *dev, uint32_t size)
{
struct virtio_device *virtio_dev = to_virtio_device(dev);
uint64_t iova;
simple_mtx_lock(&virtio_dev->address_space_lock);
iova = util_vma_heap_alloc(&virtio_dev->address_space, size, 0x1000);
simple_mtx_unlock(&virtio_dev->address_space_lock);
return iova;
}
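A brief usage sketch of these helpers, mirroring how the bo paths above pair them; note that util_vma_heap_alloc() returns 0 when the range is exhausted, which is why callers treat a zero iova as failure:

```c
#include <errno.h>

/* Hedged sketch: bind an iova at bo creation, release it at destroy time. */
static int
sketch_bind_iova(struct fd_device *dev, struct fd_bo *bo, uint32_t size)
{
   bo->iova = virtio_dev_alloc_iova(dev, size);   /* 4 KiB-aligned, 0 on failure */
   if (!bo->iova)
      return -ENOMEM;
   return 0;
}

static void
sketch_release_iova(struct fd_device *dev, struct fd_bo *bo)
{
   if (bo->iova) {
      virtio_dev_free_iova(dev, bo->iova, bo->size);
      bo->iova = 0;
   }
}
```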
struct virtio_pipe {
struct fd_pipe base;
uint32_t pipe;
@@ -88,6 +135,17 @@ struct virtio_pipe {
* ca3ffcbeb0c8 ("drm/msm/gpu: Don't allow zero fence_id")
*/
int32_t next_submit_fence;
/**
* When userspace_allocates_iova, we need to defer deleting bo's (and
* therefore releasing their address) until submits referencing them
* have completed. This is accomplished by enqueueing a job, holding
* a reference to the submit, that waits on the submit's out-fence
* before dropping the reference to the submit. The submit holds a
* reference to the associated ring buffers, which in turn hold a ref
* to the associated bo's.
*/
struct util_queue retire_queue;
};
FD_DEFINE_CAST(fd_pipe, virtio_pipe);
@@ -100,6 +158,15 @@ struct virtio_bo {
struct fd_bo base;
uint64_t offset;
struct util_queue_fence fence;
/*
* Note: all access to host_handle must wait on fence, *other* than
* access from the submit_queue thread (because async bo allocations
* are retired on the submit_queue, guaranteeing that the fence is
* signaled before host_handle is accessed). All other access must
* use virtio_bo_host_handle().
*/
uint32_t host_handle;
uint32_t blob_id;
};
@@ -109,6 +176,8 @@ struct fd_bo *virtio_bo_new(struct fd_device *dev, uint32_t size, uint32_t flags
struct fd_bo *virtio_bo_from_handle(struct fd_device *dev, uint32_t size,
uint32_t handle);
uint32_t virtio_bo_host_handle(struct fd_bo *bo);
/*
* Internal helpers:
*/


@@ -25,17 +25,35 @@
#include <inttypes.h>
#include <pthread.h>
#include "util/libsync.h"
#include "util/os_file.h"
#include "drm/freedreno_ringbuffer_sp.h"
#include "virtio_priv.h"
static void
retire_execute(void *job, void *gdata, int thread_index)
{
struct fd_submit_sp *fd_submit = job;
sync_wait(fd_submit->out_fence_fd, -1);
close(fd_submit->out_fence_fd);
}
static void
retire_cleanup(void *job, void *gdata, int thread_index)
{
struct fd_submit_sp *fd_submit = job;
fd_submit_del(&fd_submit->base);
}
static int
flush_submit_list(struct list_head *submit_list)
{
struct fd_submit_sp *fd_submit = to_fd_submit_sp(last_submit(submit_list));
struct virtio_pipe *virtio_pipe = to_virtio_pipe(fd_submit->base.pipe);
struct fd_device *dev = virtio_pipe->base.dev;
struct virtio_device *virtio_dev = to_virtio_device(dev);
unsigned nr_cmds = 0;
@@ -116,8 +134,12 @@ flush_submit_list(struct list_head *submit_list)
}
for (unsigned i = 0; i < fd_submit->nr_bos; i++) {
struct virtio_bo *virtio_bo = to_virtio_bo(fd_submit->bos[i]);
assert(util_queue_fence_is_signalled(&virtio_bo->fence));
submit_bos[i].flags = fd_submit->bos[i]->reloc_flags;
submit_bos[i].handle = to_virtio_bo(fd_submit->bos[i])->host_handle;
submit_bos[i].handle = virtio_bo->host_handle;
submit_bos[i].presumed = 0;
}
@@ -156,6 +178,12 @@ flush_submit_list(struct list_head *submit_list)
*/
out_fence->use_fence_fd = true;
out_fence_fd = &out_fence->fence_fd;
} else if (virtio_dev->userspace_allocates_iova) {
/* we are using retire_queue, so we need an out-fence for each
* submit.. we can just re-use fd_submit->out_fence_fd for temporary
* storage.
*/
out_fence_fd = &fd_submit->out_fence_fd;
}
if (fd_submit->in_fence_fd != -1) {
@@ -177,6 +205,20 @@ flush_submit_list(struct list_head *submit_list)
if (fd_submit->in_fence_fd != -1)
close(fd_submit->in_fence_fd);
if (virtio_dev->userspace_allocates_iova) {
if (out_fence_fd != &fd_submit->out_fence_fd)
fd_submit->out_fence_fd = os_dupfd_cloexec(*out_fence_fd);
fd_submit_ref(&fd_submit->base);
util_queue_fence_init(&fd_submit->retire_fence);
util_queue_add_job(&virtio_pipe->retire_queue,
fd_submit, &fd_submit->retire_fence,
retire_execute,
retire_cleanup,
0);
}
return 0;
}


@@ -18,6 +18,9 @@ struct virgl_renderer_capset_drm {
union {
struct {
uint32_t has_cached_coherent;
uint32_t priorities;
uint64_t va_start;
uint64_t va_size;
} msm; /* context_type == VIRTGPU_DRM_CONTEXT_MSM */
} u;
};
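A hedged sketch of how a guest driver might probe this capset and decide whether to enable userspace iova allocation; the capset id constant here is an assumption rather than something defined in this hunk:

```c
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include <drm/virtgpu_drm.h>

static bool
probe_userspace_iova(int virtgpu_fd, struct virgl_renderer_capset_drm *caps)
{
   struct drm_virtgpu_get_caps args = {
      .cap_set_id = 6,                 /* assumption: id of the DRM native-context capset */
      .addr = (uintptr_t)caps,
      .size = sizeof(*caps),
   };

   memset(caps, 0, sizeof(*caps));
   if (drmIoctl(virtgpu_fd, DRM_IOCTL_VIRTGPU_GET_CAPS, &args))
      return false;

   /* Hosts that predate this change leave va_start/va_size at zero, which is
    * exactly the condition virtio_device_new() checks before setting
    * userspace_allocates_iova.
    */
   return caps->u.msm.va_start && caps->u.msm.va_size;
}
```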