/*
* Copyright © 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "vk_queue.h"
#include "util/debug.h"
#include <inttypes.h>
#include "vk_alloc.h"
#include "vk_command_buffer.h"
#include "vk_command_pool.h"
#include "vk_common_entrypoints.h"
#include "vk_device.h"
#include "vk_fence.h"
#include "vk_log.h"
#include "vk_physical_device.h"
#include "vk_semaphore.h"
#include "vk_sync.h"
#include "vk_sync_binary.h"
#include "vk_sync_dummy.h"
#include "vk_sync_timeline.h"
#include "vk_util.h"
#include "vulkan/wsi/wsi_common.h"
static VkResult
vk_queue_start_submit_thread(struct vk_queue *queue);
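/* Initializes the driver-agnostic part of a VkQueue.
*
* The queue inherits its submit mode from the device, except that
* THREADED_ON_DEMAND collapses to IMMEDIATE here; such queues only switch to
* THREADED later via vk_queue_enable_submit_thread() when a wait-before-signal
* dependency is detected.
*
* A minimal usage sketch, assuming a driver struct that embeds vk_queue as a
* member named "vk" and a driver-provided drv_queue_submit callback (both
* names are illustrative, not part of this file):
*
*    result = vk_queue_init(&queue->vk, &device->vk, pCreateInfo,
*                           index_in_family);
*    if (result != VK_SUCCESS)
*       return result;
*    queue->vk.driver_submit = drv_queue_submit;
*/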
VkResult
vk_queue_init(struct vk_queue *queue, struct vk_device *device,
const VkDeviceQueueCreateInfo *pCreateInfo,
uint32_t index_in_family)
{
VkResult result = VK_SUCCESS;
int ret;
memset(queue, 0, sizeof(*queue));
vk_object_base_init(device, &queue->base, VK_OBJECT_TYPE_QUEUE);
list_addtail(&queue->link, &device->queues);
queue->flags = pCreateInfo->flags;
queue->queue_family_index = pCreateInfo->queueFamilyIndex;
assert(index_in_family < pCreateInfo->queueCount);
queue->index_in_family = index_in_family;
queue->submit.mode = device->submit_mode;
if (queue->submit.mode == VK_QUEUE_SUBMIT_MODE_THREADED_ON_DEMAND)
queue->submit.mode = VK_QUEUE_SUBMIT_MODE_IMMEDIATE;
list_inithead(&queue->submit.submits);
ret = mtx_init(&queue->submit.mutex, mtx_plain);
if (ret == thrd_error) {
result = vk_errorf(queue, VK_ERROR_UNKNOWN, "mtx_init failed");
goto fail_mutex;
}
ret = cnd_init(&queue->submit.push);
if (ret == thrd_error) {
result = vk_errorf(queue, VK_ERROR_UNKNOWN, "cnd_init failed");
goto fail_push;
}
ret = cnd_init(&queue->submit.pop);
if (ret == thrd_error) {
result = vk_errorf(queue, VK_ERROR_UNKNOWN, "cnd_init failed");
goto fail_pop;
}
if (queue->submit.mode == VK_QUEUE_SUBMIT_MODE_THREADED) {
result = vk_queue_start_submit_thread(queue);
if (result != VK_SUCCESS)
goto fail_thread;
}
util_dynarray_init(&queue->labels, NULL);
queue->region_begin = true;
return VK_SUCCESS;
fail_thread:
cnd_destroy(&queue->submit.pop);
fail_pop:
cnd_destroy(&queue->submit.push);
fail_push:
mtx_destroy(&queue->submit.mutex);
fail_mutex:
return result;
}
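/* Marks the queue (and, transitively, the device) as lost, recording the
* file, line, and message of the first failure. Always returns
* VK_ERROR_DEVICE_LOST so callers can "return vk_queue_set_lost(...)".
* If MESA_VK_ABORT_ON_DEVICE_LOSS is set, it aborts instead of returning.
*/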
VkResult
_vk_queue_set_lost(struct vk_queue *queue,
const char *file, int line,
const char *msg, ...)
{
if (queue->_lost.lost)
return VK_ERROR_DEVICE_LOST;
queue->_lost.lost = true;
queue->_lost.error_file = file;
queue->_lost.error_line = line;
va_list ap;
va_start(ap, msg);
vsnprintf(queue->_lost.error_msg, sizeof(queue->_lost.error_msg), msg, ap);
va_end(ap);
p_atomic_inc(&queue->base.device->_lost.lost);
if (env_var_as_boolean("MESA_VK_ABORT_ON_DEVICE_LOSS", false)) {
_vk_device_report_lost(queue->base.device);
abort();
}
return VK_ERROR_DEVICE_LOST;
}
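/* Allocates a vk_queue_submit together with all of its trailing arrays in a
* single vk_multialloc allocation. The timeline point arrays are only
* allocated when the device uses emulated timelines. The sparse bind entry
* arrays are returned through bind_entries/image_bind_entries so the caller
* can fill them in while copying the client's bind infos.
*/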
static struct vk_queue_submit *
vk_queue_submit_alloc(struct vk_queue *queue,
uint32_t wait_count,
uint32_t command_buffer_count,
uint32_t buffer_bind_count,
uint32_t image_opaque_bind_count,
uint32_t image_bind_count,
uint32_t bind_entry_count,
uint32_t image_bind_entry_count,
uint32_t signal_count,
VkSparseMemoryBind **bind_entries,
VkSparseImageMemoryBind **image_bind_entries)
{
VK_MULTIALLOC(ma);
VK_MULTIALLOC_DECL(&ma, struct vk_queue_submit, submit, 1);
VK_MULTIALLOC_DECL(&ma, struct vk_sync_wait, waits, wait_count);
VK_MULTIALLOC_DECL(&ma, struct vk_command_buffer *, command_buffers,
command_buffer_count);
VK_MULTIALLOC_DECL(&ma, VkSparseBufferMemoryBindInfo, buffer_binds,
buffer_bind_count);
VK_MULTIALLOC_DECL(&ma, VkSparseImageOpaqueMemoryBindInfo,
image_opaque_binds, image_opaque_bind_count);
VK_MULTIALLOC_DECL(&ma, VkSparseImageMemoryBindInfo, image_binds,
image_bind_count);
VK_MULTIALLOC_DECL(&ma, VkSparseMemoryBind,
bind_entries_local, bind_entry_count);
VK_MULTIALLOC_DECL(&ma, VkSparseImageMemoryBind, image_bind_entries_local,
image_bind_entry_count);
VK_MULTIALLOC_DECL(&ma, struct vk_sync_signal, signals, signal_count);
VK_MULTIALLOC_DECL(&ma, struct vk_sync *, wait_temps, wait_count);
struct vk_sync_timeline_point **wait_points = NULL, **signal_points = NULL;
if (queue->base.device->timeline_mode == VK_DEVICE_TIMELINE_MODE_EMULATED) {
vk_multialloc_add(&ma, &wait_points,
struct vk_sync_timeline_point *, wait_count);
vk_multialloc_add(&ma, &signal_points,
struct vk_sync_timeline_point *, signal_count);
}
if (!vk_multialloc_zalloc(&ma, &queue->base.device->alloc,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
return NULL;
submit->wait_count = wait_count;
submit->command_buffer_count = command_buffer_count;
submit->signal_count = signal_count;
submit->buffer_bind_count = buffer_bind_count;
submit->image_opaque_bind_count = image_opaque_bind_count;
submit->image_bind_count = image_bind_count;
submit->waits = waits;
submit->command_buffers = command_buffers;
submit->signals = signals;
submit->buffer_binds = buffer_binds;
submit->image_opaque_binds = image_opaque_binds;
submit->image_binds = image_binds;
submit->_wait_temps = wait_temps;
submit->_wait_points = wait_points;
submit->_signal_points = signal_points;
if (bind_entries)
*bind_entries = bind_entries_local;
if (image_bind_entries)
*image_bind_entries = image_bind_entries_local;
return submit;
}
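/* Releases everything a submit owns (temporary wait syncs, the memory-signal
* temporary, and any emulated timeline points) without freeing the submit
* itself; vk_queue_submit_free() releases the allocation and
* vk_queue_submit_destroy() does both.
*/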
static void
vk_queue_submit_cleanup(struct vk_queue *queue,
struct vk_queue_submit *submit)
{
for (uint32_t i = 0; i < submit->wait_count; i++) {
if (submit->_wait_temps[i] != NULL)
vk_sync_destroy(queue->base.device, submit->_wait_temps[i]);
}
if (submit->_mem_signal_temp != NULL)
vk_sync_destroy(queue->base.device, submit->_mem_signal_temp);
if (submit->_wait_points != NULL) {
for (uint32_t i = 0; i < submit->wait_count; i++) {
if (unlikely(submit->_wait_points[i] != NULL)) {
vk_sync_timeline_point_release(queue->base.device,
submit->_wait_points[i]);
}
}
}
if (submit->_signal_points != NULL) {
for (uint32_t i = 0; i < submit->signal_count; i++) {
if (unlikely(submit->_signal_points[i] != NULL)) {
vk_sync_timeline_point_free(queue->base.device,
submit->_signal_points[i]);
}
}
}
}
static void
vk_queue_submit_free(struct vk_queue *queue,
struct vk_queue_submit *submit)
{
vk_free(&queue->base.device->alloc, submit);
}
static void
vk_queue_submit_destroy(struct vk_queue *queue,
struct vk_queue_submit *submit)
{
vk_queue_submit_cleanup(queue, submit);
vk_queue_submit_free(queue, submit);
}
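/* Appends a submit to the queue's pending list under the submit mutex and
* signals the push condition variable to wake the submit thread, if any.
*/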
static void
vk_queue_push_submit(struct vk_queue *queue,
struct vk_queue_submit *submit)
{
mtx_lock(&queue->submit.mutex);
list_addtail(&submit->link, &queue->submit.submits);
cnd_signal(&queue->submit.push);
mtx_unlock(&queue->submit.mutex);
}
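/* Blocks until the queue's pending submit list is empty, waiting on the pop
* condition variable. Bails out with VK_ERROR_DEVICE_LOST if the device is
* already lost so we don't wait forever on a dead submit thread.
*/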
static VkResult
vk_queue_drain(struct vk_queue *queue)
{
VkResult result = VK_SUCCESS;
mtx_lock(&queue->submit.mutex);
while (!list_is_empty(&queue->submit.submits)) {
if (vk_device_is_lost(queue->base.device)) {
result = VK_ERROR_DEVICE_LOST;
break;
}
int ret = cnd_wait(&queue->submit.pop, &queue->submit.mutex);
if (ret == thrd_error) {
result = vk_queue_set_lost(queue, "cnd_wait failed");
break;
}
}
mtx_unlock(&queue->submit.mutex);
return result;
}
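/* Performs the actual driver submission for one vk_queue_submit.
*
* Before calling queue->driver_submit(), this resolves everything the driver
* should never see: trivial waits (timeline waits on 0, dummy syncs) are
* compacted away, emulated timeline waits are replaced by the binary sync of
* the corresponding time point, and vk_sync_binary waits/signals are
* translated to their underlying timeline and point value. On success, any
* pre-allocated emulated timeline signal points are installed in their
* timelines.
*/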
static VkResult
vk_queue_submit_final(struct vk_queue *queue,
struct vk_queue_submit *submit)
{
VkResult result;
/* Now that we know all our time points exist, fetch the time point syncs
* from any vk_sync_timelines. While we're here, also compact down the
* list of waits to get rid of any trivial timeline waits.
*/
uint32_t wait_count = 0;
for (uint32_t i = 0; i < submit->wait_count; i++) {
/* A timeline wait on 0 is always a no-op */
if ((submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE) &&
submit->waits[i].wait_value == 0)
continue;
/* Waits on dummy vk_syncs are no-ops */
if (vk_sync_type_is_dummy(submit->waits[i].sync->type)) {
/* We are about to lose track of this wait; if it has a temporary,
* we need to destroy it now, as vk_queue_submit_cleanup will not
* know about it. */
if (submit->_wait_temps[i] != NULL) {
vk_sync_destroy(queue->base.device, submit->_wait_temps[i]);
submit->waits[i].sync = NULL;
}
continue;
}
/* For emulated timelines, we have a binary vk_sync associated with
* each time point and pass the binary vk_sync to the driver.
*/
struct vk_sync_timeline *timeline =
vk_sync_as_timeline(submit->waits[i].sync);
if (timeline) {
assert(queue->base.device->timeline_mode ==
VK_DEVICE_TIMELINE_MODE_EMULATED);
result = vk_sync_timeline_get_point(queue->base.device, timeline,
submit->waits[i].wait_value,
&submit->_wait_points[i]);
if (unlikely(result != VK_SUCCESS)) {
result = vk_queue_set_lost(queue,
"Time point >= %"PRIu64" not found",
submit->waits[i].wait_value);
}
/* This can happen if the point is long past */
if (submit->_wait_points[i] == NULL)
continue;
submit->waits[i].sync = &submit->_wait_points[i]->sync;
submit->waits[i].wait_value = 0;
}
struct vk_sync_binary *binary =
vk_sync_as_binary(submit->waits[i].sync);
if (binary) {
submit->waits[i].sync = &binary->timeline;
submit->waits[i].wait_value = binary->next_point;
}
assert((submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE) ||
submit->waits[i].wait_value == 0);
assert(wait_count <= i);
if (wait_count < i) {
submit->waits[wait_count] = submit->waits[i];
submit->_wait_temps[wait_count] = submit->_wait_temps[i];
if (submit->_wait_points)
submit->_wait_points[wait_count] = submit->_wait_points[i];
}
wait_count++;
}
assert(wait_count <= submit->wait_count);
submit->wait_count = wait_count;
for (uint32_t i = 0; i < submit->signal_count; i++) {
assert((submit->signals[i].sync->flags & VK_SYNC_IS_TIMELINE) ||
submit->signals[i].signal_value == 0);
struct vk_sync_binary *binary =
vk_sync_as_binary(submit->signals[i].sync);
if (binary) {
submit->signals[i].sync = &binary->timeline;
submit->signals[i].signal_value = ++binary->next_point;
}
}
result = queue->driver_submit(queue, submit);
if (unlikely(result != VK_SUCCESS))
return result;
if (submit->_signal_points) {
for (uint32_t i = 0; i < submit->signal_count; i++) {
if (submit->_signal_points[i] == NULL)
continue;
vk_sync_timeline_point_install(queue->base.device,
submit->_signal_points[i]);
submit->_signal_points[i] = NULL;
}
}
return VK_SUCCESS;
}
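/* Submits as many queued submits as are currently ready. Only used in
* VK_QUEUE_SUBMIT_MODE_DEFERRED, i.e. with emulated timelines; a submit is
* ready once all of its emulated timeline waits have at least a pending
* signal operation. Stops at the first submit that isn't ready and reports
* the number of submits flushed through submit_count_out.
*/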
VkResult
vk_queue_flush(struct vk_queue *queue, uint32_t *submit_count_out)
{
VkResult result = VK_SUCCESS;
assert(queue->submit.mode == VK_QUEUE_SUBMIT_MODE_DEFERRED);
mtx_lock(&queue->submit.mutex);
uint32_t submit_count = 0;
while (!list_is_empty(&queue->submit.submits)) {
struct vk_queue_submit *submit =
list_first_entry(&queue->submit.submits,
struct vk_queue_submit, link);
for (uint32_t i = 0; i < submit->wait_count; i++) {
/* In emulated timeline mode, only emulated timelines are allowed */
if (!vk_sync_type_is_vk_sync_timeline(submit->waits[i].sync->type)) {
assert(!(submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE));
continue;
}
result = vk_sync_wait(queue->base.device,
submit->waits[i].sync,
submit->waits[i].wait_value,
VK_SYNC_WAIT_PENDING, 0);
if (result == VK_TIMEOUT) {
/* This one's not ready yet */
result = VK_SUCCESS;
goto done;
} else if (result != VK_SUCCESS) {
result = vk_queue_set_lost(queue, "Wait for time points failed");
goto done;
}
}
result = vk_queue_submit_final(queue, submit);
if (unlikely(result != VK_SUCCESS)) {
result = vk_queue_set_lost(queue, "queue::driver_submit failed");
goto done;
}
submit_count++;
list_del(&submit->link);
vk_queue_submit_destroy(queue, submit);
}
done:
if (submit_count)
cnd_broadcast(&queue->submit.pop);
mtx_unlock(&queue->submit.mutex);
if (submit_count_out)
*submit_count_out = submit_count;
return result;
}
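/* Main loop of the per-queue submit thread. For each queued submit, it drops
* the mutex, waits (VK_SYNC_WAIT_PENDING) for all of the submit's
* dependencies to at least have a pending signal, calls
* vk_queue_submit_final(), and only then removes the submit from the list
* under the mutex so that vk_queue_drain() cannot return before the kernel
* submission has actually happened.
*/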
static int
vk_queue_submit_thread_func(void *_data)
{
struct vk_queue *queue = _data;
VkResult result;
mtx_lock(&queue->submit.mutex);
while (queue->submit.thread_run) {
if (list_is_empty(&queue->submit.submits)) {
int ret = cnd_wait(&queue->submit.push, &queue->submit.mutex);
if (ret == thrd_error) {
mtx_unlock(&queue->submit.mutex);
vk_queue_set_lost(queue, "cnd_wait failed");
return 1;
}
continue;
}
struct vk_queue_submit *submit =
list_first_entry(&queue->submit.submits,
struct vk_queue_submit, link);
/* Drop the lock while we wait */
mtx_unlock(&queue->submit.mutex);
result = vk_sync_wait_many(queue->base.device,
submit->wait_count, submit->waits,
VK_SYNC_WAIT_PENDING, UINT64_MAX);
if (unlikely(result != VK_SUCCESS)) {
vk_queue_set_lost(queue, "Wait for time points failed");
return 1;
}
result = vk_queue_submit_final(queue, submit);
if (unlikely(result != VK_SUCCESS)) {
vk_queue_set_lost(queue, "queue::driver_submit failed");
return 1;
}
/* Do all our cleanup of individual fences etc. outside the lock.
* We can't actually remove it from the list yet. We have to do
* that under the lock.
*/
vk_queue_submit_cleanup(queue, submit);
mtx_lock(&queue->submit.mutex);
/* Only remove the submit from the list and free it after
* queue->driver_submit() has completed. This ensures that, when
* vk_queue_drain() completes, there are no more pending jobs.
*/
list_del(&submit->link);
vk_queue_submit_free(queue, submit);
cnd_broadcast(&queue->submit.pop);
}
mtx_unlock(&queue->submit.mutex);
return 0;
}
static VkResult
vk_queue_start_submit_thread(struct vk_queue *queue)
{
int ret;
mtx_lock(&queue->submit.mutex);
queue->submit.thread_run = true;
mtx_unlock(&queue->submit.mutex);
ret = thrd_create(&queue->submit.thread,
vk_queue_submit_thread_func,
queue);
if (ret == thrd_error)
return vk_errorf(queue, VK_ERROR_UNKNOWN, "thrd_create failed");
return VK_SUCCESS;
}
static void
vk_queue_stop_submit_thread(struct vk_queue *queue)
{
vk_queue_drain(queue);
/* Kick the thread to disable it */
mtx_lock(&queue->submit.mutex);
queue->submit.thread_run = false;
cnd_signal(&queue->submit.push);
mtx_unlock(&queue->submit.mutex);
thrd_join(queue->submit.thread, NULL);
assert(list_is_empty(&queue->submit.submits));
queue->submit.mode = VK_QUEUE_SUBMIT_MODE_IMMEDIATE;
}
VkResult
vk_queue_enable_submit_thread(struct vk_queue *queue)
{
assert(vk_device_supports_threaded_submit(queue->base.device));
if (queue->submit.mode == VK_QUEUE_SUBMIT_MODE_THREADED)
return VK_SUCCESS;
VkResult result = vk_queue_start_submit_thread(queue);
if (result != VK_SUCCESS)
return result;
queue->submit.mode = VK_QUEUE_SUBMIT_MODE_THREADED;
return VK_SUCCESS;
}
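/* Driver-agnostic view of a single batch, filled out from either a
* VkSubmitInfo2 (vkQueueSubmit2) or a VkBindSparseInfo (vkQueueBindSparse)
* before being turned into a vk_queue_submit.
*/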
struct vulkan_submit_info {
const void *pNext;
uint32_t command_buffer_count;
const VkCommandBufferSubmitInfo *command_buffers;
uint32_t wait_count;
const VkSemaphoreSubmitInfo *waits;
uint32_t signal_count;
const VkSemaphoreSubmitInfo *signals;
uint32_t buffer_bind_count;
const VkSparseBufferMemoryBindInfo *buffer_binds;
uint32_t image_opaque_bind_count;
const VkSparseImageOpaqueMemoryBindInfo *image_opaque_binds;
uint32_t image_bind_count;
const VkSparseImageMemoryBindInfo *image_binds;
struct vk_fence *fence;
};
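/* Builds a vk_queue_submit from a vulkan_submit_info, stealing temporary
* semaphore payloads as required by the spec, and then dispatches it
* according to queue->submit.mode: immediately, onto the deferred list
* followed by a device flush, or onto the submit thread's queue.
*/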
static VkResult
vk_queue_submit(struct vk_queue *queue,
const struct vulkan_submit_info *info)
{
struct vk_device *device = queue->base.device;
VkResult result;
uint32_t sparse_memory_bind_entry_count = 0;
uint32_t sparse_memory_image_bind_entry_count = 0;
VkSparseMemoryBind *sparse_memory_bind_entries = NULL;
VkSparseImageMemoryBind *sparse_memory_image_bind_entries = NULL;
for (uint32_t i = 0; i < info->buffer_bind_count; ++i)
sparse_memory_bind_entry_count += info->buffer_binds[i].bindCount;
for (uint32_t i = 0; i < info->image_opaque_bind_count; ++i)
sparse_memory_bind_entry_count += info->image_opaque_binds[i].bindCount;
for (uint32_t i = 0; i < info->image_bind_count; ++i)
sparse_memory_image_bind_entry_count += info->image_binds[i].bindCount;
const struct wsi_memory_signal_submit_info *mem_signal =
vk_find_struct_const(info->pNext, WSI_MEMORY_SIGNAL_SUBMIT_INFO_MESA);
bool signal_mem_sync = mem_signal != NULL &&
mem_signal->memory != VK_NULL_HANDLE &&
queue->base.device->create_sync_for_memory != NULL;
struct vk_queue_submit *submit =
vk_queue_submit_alloc(queue, info->wait_count,
info->command_buffer_count,
info->buffer_bind_count,
info->image_opaque_bind_count,
info->image_bind_count,
sparse_memory_bind_entry_count,
sparse_memory_image_bind_entry_count,
info->signal_count +
signal_mem_sync + (info->fence != NULL),
&sparse_memory_bind_entries,
&sparse_memory_image_bind_entries);
if (unlikely(submit == NULL))
return vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
/* From the Vulkan 1.2.194 spec:
*
* "If the VkSubmitInfo::pNext chain does not include this structure,
* the batch defaults to use counter pass index 0."
*/
const VkPerformanceQuerySubmitInfoKHR *perf_info =
vk_find_struct_const(info->pNext, PERFORMANCE_QUERY_SUBMIT_INFO_KHR);
submit->perf_pass_index = perf_info ? perf_info->counterPassIndex : 0;
bool has_binary_permanent_semaphore_wait = false;
for (uint32_t i = 0; i < info->wait_count; i++) {
VK_FROM_HANDLE(vk_semaphore, semaphore,
info->waits[i].semaphore);
/* From the Vulkan 1.2.194 spec:
*
* "Applications can import a semaphore payload into an existing
* semaphore using an external semaphore handle. The effects of the
* import operation will be either temporary or permanent, as
* specified by the application. If the import is temporary, the
* implementation must restore the semaphore to its prior permanent
* state after submitting the next semaphore wait operation."
*
* and
*
* VUID-VkImportSemaphoreFdInfoKHR-flags-03323
*
* "If flags contains VK_SEMAPHORE_IMPORT_TEMPORARY_BIT, the
* VkSemaphoreTypeCreateInfo::semaphoreType field of the semaphore
* from which handle or name was exported must not be
* VK_SEMAPHORE_TYPE_TIMELINE"
*/
struct vk_sync *sync;
if (semaphore->temporary) {
assert(semaphore->type == VK_SEMAPHORE_TYPE_BINARY);
sync = submit->_wait_temps[i] = semaphore->temporary;
semaphore->temporary = NULL;
} else {
if (semaphore->type == VK_SEMAPHORE_TYPE_BINARY) {
if (vk_device_supports_threaded_submit(device))
assert(semaphore->permanent.type->move);
has_binary_permanent_semaphore_wait = true;
}
sync = &semaphore->permanent;
}
uint64_t wait_value = semaphore->type == VK_SEMAPHORE_TYPE_TIMELINE ?
info->waits[i].value : 0;
submit->waits[i] = (struct vk_sync_wait) {
.sync = sync,
.stage_mask = info->waits[i].stageMask,
.wait_value = wait_value,
};
}
for (uint32_t i = 0; i < info->command_buffer_count; i++) {
VK_FROM_HANDLE(vk_command_buffer, cmd_buffer,
info->command_buffers[i].commandBuffer);
assert(info->command_buffers[i].deviceMask == 0 ||
info->command_buffers[i].deviceMask == 1);
assert(cmd_buffer->pool->queue_family_index == queue->queue_family_index);
submit->command_buffers[i] = cmd_buffer;
}
sparse_memory_bind_entry_count = 0;
sparse_memory_image_bind_entry_count = 0;
if (info->buffer_binds)
typed_memcpy(submit->buffer_binds, info->buffer_binds, info->buffer_bind_count);
for (uint32_t i = 0; i < info->buffer_bind_count; ++i) {
VkSparseMemoryBind *binds = sparse_memory_bind_entries +
sparse_memory_bind_entry_count;
submit->buffer_binds[i].pBinds = binds;
typed_memcpy(binds, info->buffer_binds[i].pBinds,
info->buffer_binds[i].bindCount);
sparse_memory_bind_entry_count += info->buffer_binds[i].bindCount;
}
if (info->image_opaque_binds)
typed_memcpy(submit->image_opaque_binds, info->image_opaque_binds,
info->image_opaque_bind_count);
for (uint32_t i = 0; i < info->image_opaque_bind_count; ++i) {
VkSparseMemoryBind *binds = sparse_memory_bind_entries +
sparse_memory_bind_entry_count;
submit->image_opaque_binds[i].pBinds = binds;
typed_memcpy(binds, info->image_opaque_binds[i].pBinds,
info->image_opaque_binds[i].bindCount);
sparse_memory_bind_entry_count += info->image_opaque_binds[i].bindCount;
}
if (info->image_binds)
typed_memcpy(submit->image_binds, info->image_binds, info->image_bind_count);
for (uint32_t i = 0; i < info->image_bind_count; ++i) {
VkSparseImageMemoryBind *binds = sparse_memory_image_bind_entries +
sparse_memory_image_bind_entry_count;
submit->image_binds[i].pBinds = binds;
typed_memcpy(binds, info->image_binds[i].pBinds,
info->image_binds[i].bindCount);
sparse_memory_image_bind_entry_count += info->image_binds[i].bindCount;
}
for (uint32_t i = 0; i < info->signal_count; i++) {
VK_FROM_HANDLE(vk_semaphore, semaphore,
info->signals[i].semaphore);
struct vk_sync *sync = vk_semaphore_get_active_sync(semaphore);
uint32_t signal_value = info->signals[i].value;
if (semaphore->type == VK_SEMAPHORE_TYPE_TIMELINE) {
if (signal_value == 0) {
result = vk_queue_set_lost(queue,
"Tried to signal a timeline with value 0");
goto fail;
}
} else {
signal_value = 0;
}
/* For emulated timelines, we need to associate a binary vk_sync with
* each time point and pass the binary vk_sync to the driver. We could
* do this in vk_queue_submit_final but it might require doing memory
* allocation and we don't want to add extra failure paths there.
* Instead, allocate and replace the driver-visible vk_sync now and
* we'll insert it into the timeline in vk_queue_submit_final. The
* insert step is guaranteed to not fail.
*/
struct vk_sync_timeline *timeline = vk_sync_as_timeline(sync);
if (timeline) {
assert(queue->base.device->timeline_mode ==
VK_DEVICE_TIMELINE_MODE_EMULATED);
result = vk_sync_timeline_alloc_point(queue->base.device, timeline,
signal_value,
&submit->_signal_points[i]);
if (unlikely(result != VK_SUCCESS))
goto fail;
sync = &submit->_signal_points[i]->sync;
signal_value = 0;
}
submit->signals[i] = (struct vk_sync_signal) {
.sync = sync,
.stage_mask = info->signals[i].stageMask,
.signal_value = signal_value,
};
}
uint32_t signal_count = info->signal_count;
if (signal_mem_sync) {
struct vk_sync *mem_sync;
result = queue->base.device->create_sync_for_memory(queue->base.device,
mem_signal->memory,
true, &mem_sync);
if (unlikely(result != VK_SUCCESS))
goto fail;
submit->_mem_signal_temp = mem_sync;
assert(submit->signals[signal_count].sync == NULL);
submit->signals[signal_count++] = (struct vk_sync_signal) {
.sync = mem_sync,
.stage_mask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
};
}
if (info->fence != NULL) {
assert(submit->signals[signal_count].sync == NULL);
submit->signals[signal_count++] = (struct vk_sync_signal) {
.sync = vk_fence_get_active_sync(info->fence),
.stage_mask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
};
}
assert(signal_count == submit->signal_count);
/* If this device supports threaded submit, we can't rely on the client
* ordering requirements to ensure submits happen in the right order. Even
* if this queue doesn't have a submit thread, another queue (possibly in a
* different process) may, and that means our dependencies may not have
* been submitted to the kernel yet. Do a quick zero-timeout WAIT_PENDING
* on all the wait semaphores to see if we need to start up our own thread.
*/
if (device->submit_mode == VK_QUEUE_SUBMIT_MODE_THREADED_ON_DEMAND &&
queue->submit.mode != VK_QUEUE_SUBMIT_MODE_THREADED) {
assert(queue->submit.mode == VK_QUEUE_SUBMIT_MODE_IMMEDIATE);
result = vk_sync_wait_many(queue->base.device,
submit->wait_count, submit->waits,
VK_SYNC_WAIT_PENDING, 0);
if (result == VK_TIMEOUT)
result = vk_queue_enable_submit_thread(queue);
if (unlikely(result != VK_SUCCESS))
goto fail;
}
switch (queue->submit.mode) {
case VK_QUEUE_SUBMIT_MODE_IMMEDIATE:
result = vk_queue_submit_final(queue, submit);
if (unlikely(result != VK_SUCCESS))
goto fail;
/* If threaded submit is possible on this device, we need to ensure that
* binary semaphore payloads get reset so that any other threads can
* properly wait on them for dependency checking. Because we don't
* currently have a submit thread, we can directly reset those binary
* semaphore payloads.
*
* If the vk_sync is in our signal set, we can consider it to have
* been both reset and signaled by vk_queue_submit_final(). A reset in
* this case would be wrong because it would throw away our signal
* operation. If we don't signal the vk_sync, then we need to reset it.
*/
if (vk_device_supports_threaded_submit(device) &&
has_binary_permanent_semaphore_wait) {
for (uint32_t i = 0; i < submit->wait_count; i++) {
if ((submit->waits[i].sync->flags & VK_SYNC_IS_TIMELINE) ||
submit->_wait_temps[i] != NULL)
continue;
bool was_signaled = false;
for (uint32_t j = 0; j < submit->signal_count; j++) {
if (submit->signals[j].sync == submit->waits[i].sync) {
was_signaled = true;
break;
}
}
if (!was_signaled) {
result = vk_sync_reset(queue->base.device,
submit->waits[i].sync);
if (unlikely(result != VK_SUCCESS))
goto fail;
}
}
}
vk_queue_submit_destroy(queue, submit);
return result;
case VK_QUEUE_SUBMIT_MODE_DEFERRED:
vk_queue_push_submit(queue, submit);
return vk_device_flush(queue->base.device);
case VK_QUEUE_SUBMIT_MODE_THREADED:
if (has_binary_permanent_semaphore_wait) {
for (uint32_t i = 0; i < info->wait_count; i++) {
VK_FROM_HANDLE(vk_semaphore, semaphore,
info->waits[i].semaphore);
if (semaphore->type != VK_SEMAPHORE_TYPE_BINARY)
continue;
/* From the Vulkan 1.2.194 spec:
*
* "When a batch is submitted to a queue via a queue
* submission, and it includes semaphores to be waited on,
* it defines a memory dependency between prior semaphore
* signal operations and the batch, and defines semaphore
* wait operations.
*
* Such semaphore wait operations set the semaphores
* created with a VkSemaphoreType of
* VK_SEMAPHORE_TYPE_BINARY to the unsignaled state."
*
* For threaded submit, we depend on tracking the unsignaled
* state of binary semaphores to determine when we can safely
* submit. The VK_SYNC_WAIT_PENDING check above as well as the
* one in the submit thread depend on all binary semaphores
* being reset when they're not in active use from the point
* of view of the client's CPU timeline. This means we need to
* reset them inside vkQueueSubmit and cannot wait until the
* actual submit which happens later in the thread.
*
* We've already stolen temporary semaphore payloads above as
* part of basic semaphore processing. We steal permanent
* semaphore payloads here by way of vk_sync_move. For shared
* semaphores, this can be a bit expensive (sync file import
* and export) but, for non-shared semaphores, it can be made
* fairly cheap. Also, we only do this semaphore swapping in
* the case where you have real timelines AND the client is
* using timeline semaphores with wait-before-signal (that's
* the only way to get a submit thread) AND mixing those with
* waits on binary semaphores AND said binary semaphore is
* using its permanent payload. In other words, this code
* should basically only ever get executed in CTS tests.
*/
if (submit->_wait_temps[i] != NULL)
continue;
assert(submit->waits[i].sync == &semaphore->permanent);
/* From the Vulkan 1.2.194 spec:
*
* VUID-vkQueueSubmit-pWaitSemaphores-03238
*
* "All elements of the pWaitSemaphores member of all
* elements of pSubmits created with a VkSemaphoreType of
* VK_SEMAPHORE_TYPE_BINARY must reference a semaphore
* signal operation that has been submitted for execution
* and any semaphore signal operations on which it depends
* (if any) must have also been submitted for execution."
*
* Therefore, we can safely do a blocking wait here and it
* won't actually block for long. This ensures that the
* vk_sync_move below will succeed.
*/
result = vk_sync_wait(queue->base.device,
submit->waits[i].sync, 0,
VK_SYNC_WAIT_PENDING, UINT64_MAX);
if (unlikely(result != VK_SUCCESS))
goto fail;
result = vk_sync_create(queue->base.device,
semaphore->permanent.type,
0 /* flags */,
0 /* initial value */,
&submit->_wait_temps[i]);
if (unlikely(result != VK_SUCCESS))
goto fail;
result = vk_sync_move(queue->base.device,
submit->_wait_temps[i],
&semaphore->permanent);
if (unlikely(result != VK_SUCCESS))
goto fail;
submit->waits[i].sync = submit->_wait_temps[i];
}
}
vk_queue_push_submit(queue, submit);
if (signal_mem_sync) {
/* If we're signaling a memory object, we have to ensure that
* vkQueueSubmit does not return until the kernel submission has
* happened. Otherwise, we may get a race between this process
* and whatever is going to wait on the object where the other
* process may wait before we've submitted our work. Drain the
* queue now to avoid this. It's the responsibility of the caller
* to ensure that any vkQueueSubmit which signals a memory object
* has fully resolved dependencies.
*/
result = vk_queue_drain(queue);
if (unlikely(result != VK_SUCCESS))
return result;
}
return VK_SUCCESS;
case VK_QUEUE_SUBMIT_MODE_THREADED_ON_DEMAND:
unreachable("Invalid vk_queue::submit.mode");
}
unreachable("Invalid submit mode");
fail:
vk_queue_submit_destroy(queue, submit);
return result;
}
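/* Called on the present path to make sure every binary wait semaphore has
* actually reached the kernel before the driver presents. This is only
* needed when the device supports threaded submit; see the spec quotes below
* for why the wait is guaranteed not to block for long.
*/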
VkResult
vk_queue_wait_before_present(struct vk_queue *queue,
const VkPresentInfoKHR *pPresentInfo)
{
if (vk_device_is_lost(queue->base.device))
return VK_ERROR_DEVICE_LOST;
/* From the Vulkan 1.2.194 spec:
*
* VUID-vkQueuePresentKHR-pWaitSemaphores-03268
*
* "All elements of the pWaitSemaphores member of pPresentInfo must
* reference a semaphore signal operation that has been submitted for
* execution and any semaphore signal operations on which it depends (if
* any) must have also been submitted for execution."
*
* As with vkQueueSubmit above, we need to ensure that any binary
* semaphores we use in this present actually exist. If we don't have
* timeline semaphores, this is a non-issue. If they're emulated, then
* this is ensured for us by the vk_device_flush() at the end of every
* vkQueueSubmit() and every vkSignalSemaphore(). For real timeline
* semaphores, however, we need to do a wait. Thanks to the above bit of
* spec text, that wait should never block for long.
*/
if (!vk_device_supports_threaded_submit(queue->base.device))
return VK_SUCCESS;
const uint32_t wait_count = pPresentInfo->waitSemaphoreCount;
STACK_ARRAY(struct vk_sync_wait, waits, wait_count);
for (uint32_t i = 0; i < wait_count; i++) {
VK_FROM_HANDLE(vk_semaphore, semaphore,
pPresentInfo->pWaitSemaphores[i]);
/* From the Vulkan 1.2.194 spec:
*
* VUID-vkQueuePresentKHR-pWaitSemaphores-03267
*
* "All elements of the pWaitSemaphores member of pPresentInfo must
* be created with a VkSemaphoreType of VK_SEMAPHORE_TYPE_BINARY."
*/
assert(semaphore->type == VK_SEMAPHORE_TYPE_BINARY);
waits[i] = (struct vk_sync_wait) {
.sync = vk_semaphore_get_active_sync(semaphore),
.stage_mask = ~(VkPipelineStageFlags2)0,
};
}
VkResult result = vk_sync_wait_many(queue->base.device, wait_count, waits,
VK_SYNC_WAIT_PENDING, UINT64_MAX);
STACK_ARRAY_FINISH(waits);
/* Check again, just in case */
if (vk_device_is_lost(queue->base.device))
return VK_ERROR_DEVICE_LOST;
return result;
}
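/* Submits an empty batch whose only effect is to signal the given vk_sync.
* Used to implement fence-only vkQueueSubmit*() calls with zero submits and
* vkQueueWaitIdle().
*/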
static VkResult
vk_queue_signal_sync(struct vk_queue *queue,
struct vk_sync *sync,
uint32_t signal_value)
{
struct vk_queue_submit *submit = vk_queue_submit_alloc(queue, 0, 0, 0, 0, 0,
0, 0, 1, NULL, NULL);
if (unlikely(submit == NULL))
return vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
submit->signals[0] = (struct vk_sync_signal) {
.sync = sync,
.stage_mask = VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
.signal_value = signal_value,
};
VkResult result;
switch (queue->submit.mode) {
case VK_QUEUE_SUBMIT_MODE_IMMEDIATE:
result = vk_queue_submit_final(queue, submit);
vk_queue_submit_destroy(queue, submit);
return result;
case VK_QUEUE_SUBMIT_MODE_DEFERRED:
vk_queue_push_submit(queue, submit);
return vk_device_flush(queue->base.device);
case VK_QUEUE_SUBMIT_MODE_THREADED:
vk_queue_push_submit(queue, submit);
return VK_SUCCESS;
case VK_QUEUE_SUBMIT_MODE_THREADED_ON_DEMAND:
unreachable("Invalid vk_queue::submit.mode");
}
unreachable("Invalid timeline mode");
}
void
vk_queue_finish(struct vk_queue *queue)
{
if (queue->submit.mode == VK_QUEUE_SUBMIT_MODE_THREADED)
vk_queue_stop_submit_thread(queue);
while (!list_is_empty(&queue->submit.submits)) {
assert(vk_device_is_lost_no_report(queue->base.device));
struct vk_queue_submit *submit =
list_first_entry(&queue->submit.submits,
struct vk_queue_submit, link);
list_del(&submit->link);
vk_queue_submit_destroy(queue, submit);
}
cnd_destroy(&queue->submit.pop);
cnd_destroy(&queue->submit.push);
mtx_destroy(&queue->submit.mutex);
util_dynarray_fini(&queue->labels);
list_del(&queue->link);
vk_object_base_finish(&queue->base);
}
VKAPI_ATTR VkResult VKAPI_CALL
vk_common_QueueSubmit2KHR(VkQueue _queue,
uint32_t submitCount,
const VkSubmitInfo2 *pSubmits,
VkFence _fence)
{
VK_FROM_HANDLE(vk_queue, queue, _queue);
VK_FROM_HANDLE(vk_fence, fence, _fence);
if (vk_device_is_lost(queue->base.device))
return VK_ERROR_DEVICE_LOST;
if (submitCount == 0) {
if (fence == NULL) {
return VK_SUCCESS;
} else {
return vk_queue_signal_sync(queue, vk_fence_get_active_sync(fence), 0);
}
}
for (uint32_t i = 0; i < submitCount; i++) {
struct vulkan_submit_info info = {
.pNext = pSubmits[i].pNext,
.command_buffer_count = pSubmits[i].commandBufferInfoCount,
.command_buffers = pSubmits[i].pCommandBufferInfos,
.wait_count = pSubmits[i].waitSemaphoreInfoCount,
.waits = pSubmits[i].pWaitSemaphoreInfos,
.signal_count = pSubmits[i].signalSemaphoreInfoCount,
.signals = pSubmits[i].pSignalSemaphoreInfos,
.fence = i == submitCount - 1 ? fence : NULL
};
VkResult result = vk_queue_submit(queue, &info);
if (unlikely(result != VK_SUCCESS))
return result;
}
return VK_SUCCESS;
}
VKAPI_ATTR VkResult VKAPI_CALL
vk_common_QueueBindSparse(VkQueue _queue,
uint32_t bindInfoCount,
const VkBindSparseInfo *pBindInfo,
VkFence _fence)
{
VK_FROM_HANDLE(vk_queue, queue, _queue);
VK_FROM_HANDLE(vk_fence, fence, _fence);
if (vk_device_is_lost(queue->base.device))
return VK_ERROR_DEVICE_LOST;
if (bindInfoCount == 0) {
if (fence == NULL) {
return VK_SUCCESS;
} else {
return vk_queue_signal_sync(queue, vk_fence_get_active_sync(fence), 0);
}
}
for (uint32_t i = 0; i < bindInfoCount; i++) {
const VkTimelineSemaphoreSubmitInfo *timeline_info =
vk_find_struct_const(pBindInfo[i].pNext, TIMELINE_SEMAPHORE_SUBMIT_INFO);
const uint64_t *wait_values = NULL;
const uint64_t *signal_values = NULL;
if (timeline_info && timeline_info->waitSemaphoreValueCount) {
/* From the Vulkan 1.3.204 spec:
*
* VUID-VkBindSparseInfo-pNext-03247
*
* "If the pNext chain of this structure includes a VkTimelineSemaphoreSubmitInfo structure
* and any element of pWaitSemaphores was created with a VkSemaphoreType of
* VK_SEMAPHORE_TYPE_TIMELINE, then its waitSemaphoreValueCount member must equal
* waitSemaphoreCount"
*/
assert(timeline_info->waitSemaphoreValueCount == pBindInfo[i].waitSemaphoreCount);
wait_values = timeline_info->pWaitSemaphoreValues;
}
if (timeline_info && timeline_info->signalSemaphoreValueCount) {
/* From the Vulkan 1.3.204 spec:
*
* VUID-VkBindSparseInfo-pNext-03248
*
* "If the pNext chain of this structure includes a VkTimelineSemaphoreSubmitInfo structure
* and any element of pSignalSemaphores was created with a VkSemaphoreType of
* VK_SEMAPHORE_TYPE_TIMELINE, then its signalSemaphoreValueCount member must equal
* signalSemaphoreCount"
*/
assert(timeline_info->signalSemaphoreValueCount == pBindInfo[i].signalSemaphoreCount);
signal_values = timeline_info->pSignalSemaphoreValues;
}
STACK_ARRAY(VkSemaphoreSubmitInfo, wait_semaphore_infos,
pBindInfo[i].waitSemaphoreCount);
STACK_ARRAY(VkSemaphoreSubmitInfo, signal_semaphore_infos,
pBindInfo[i].signalSemaphoreCount);
if (!wait_semaphore_infos || !signal_semaphore_infos) {
STACK_ARRAY_FINISH(wait_semaphore_infos);
STACK_ARRAY_FINISH(signal_semaphore_infos);
return vk_error(queue, VK_ERROR_OUT_OF_HOST_MEMORY);
}
for (uint32_t j = 0; j < pBindInfo[i].waitSemaphoreCount; j++) {
wait_semaphore_infos[j] = (VkSemaphoreSubmitInfo) {
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
.semaphore = pBindInfo[i].pWaitSemaphores[j],
.value = wait_values ? wait_values[j] : 0,
};
}
for (uint32_t j = 0; j < pBindInfo[i].signalSemaphoreCount; j++) {
signal_semaphore_infos[j] = (VkSemaphoreSubmitInfo) {
.sType = VK_STRUCTURE_TYPE_SEMAPHORE_SUBMIT_INFO,
.semaphore = pBindInfo[i].pSignalSemaphores[j],
.value = signal_values ? signal_values[j] : 0,
};
}
struct vulkan_submit_info info = {
.pNext = pBindInfo[i].pNext,
.wait_count = pBindInfo[i].waitSemaphoreCount,
.waits = wait_semaphore_infos,
.signal_count = pBindInfo[i].signalSemaphoreCount,
.signals = signal_semaphore_infos,
.buffer_bind_count = pBindInfo[i].bufferBindCount,
.buffer_binds = pBindInfo[i].pBufferBinds,
.image_opaque_bind_count = pBindInfo[i].imageOpaqueBindCount,
.image_opaque_binds = pBindInfo[i].pImageOpaqueBinds,
.image_bind_count = pBindInfo[i].imageBindCount,
.image_binds = pBindInfo[i].pImageBinds,
.fence = i == bindInfoCount - 1 ? fence : NULL
};
VkResult result = vk_queue_submit(queue, &info);
STACK_ARRAY_FINISH(wait_semaphore_infos);
STACK_ARRAY_FINISH(signal_semaphore_infos);
if (unlikely(result != VK_SUCCESS))
return result;
}
return VK_SUCCESS;
}
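/* Picks a sync type we can use for vkQueueWaitIdle: the first supported type
* that is both binary and CPU-waitable.
*/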
static const struct vk_sync_type *
get_cpu_wait_type(struct vk_physical_device *pdevice)
{
for (const struct vk_sync_type *const *t =
pdevice->supported_sync_types; *t; t++) {
if (((*t)->features & VK_SYNC_FEATURE_BINARY) &&
((*t)->features & VK_SYNC_FEATURE_CPU_WAIT))
return *t;
}
unreachable("You must have a non-timeline CPU wait sync type");
}
VKAPI_ATTR VkResult VKAPI_CALL
vk_common_QueueWaitIdle(VkQueue _queue)
{
VK_FROM_HANDLE(vk_queue, queue, _queue);
VkResult result;
if (vk_device_is_lost(queue->base.device))
return VK_ERROR_DEVICE_LOST;
const struct vk_sync_type *sync_type =
get_cpu_wait_type(queue->base.device->physical);
struct vk_sync *sync;
result = vk_sync_create(queue->base.device, sync_type, 0, 0, &sync);
if (unlikely(result != VK_SUCCESS))
return result;
result = vk_queue_signal_sync(queue, sync, 0);
if (unlikely(result != VK_SUCCESS)) {
vk_sync_destroy(queue->base.device, sync);
return result;
}
result = vk_sync_wait(queue->base.device, sync, 0,
VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
vk_sync_destroy(queue->base.device, sync);
VkResult device_status = vk_device_check_status(queue->base.device);
if (device_status != VK_SUCCESS)
return device_status;
return result;
}