anv: allow multiple command buffers in anv_queue_submit

v2: Fixup crash spotted by Mark about missing alloc vfuncs

v3: Fixup double iteration over device->memory_objects (that ought to
    be expensive...) (Ken)

v4: Add more asserts for non-softpin cases (Ken)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/2371>
Lionel Landwerlin 2020-12-09 13:22:45 +02:00
parent 882fc72442
commit 83fee30e85
3 changed files with 265 additions and 123 deletions

src/intel/vulkan/anv_batch_chain.c

@@ -614,6 +614,37 @@ cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer,
anv_batch_bo_finish(current_bbo, batch);
}
static void
anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,
struct anv_cmd_buffer *cmd_buffer_to)
{
assert(cmd_buffer_from->device->physical->use_softpin);
uint32_t *bb_start = cmd_buffer_from->batch_end;
struct anv_batch_bo *last_bbo =
list_last_entry(&cmd_buffer_from->batch_bos, struct anv_batch_bo, link);
struct anv_batch_bo *first_bbo =
list_first_entry(&cmd_buffer_to->batch_bos, struct anv_batch_bo, link);
struct GEN8_MI_BATCH_BUFFER_START gen_bb_start = {
__anv_cmd_header(GEN8_MI_BATCH_BUFFER_START),
.SecondLevelBatchBuffer = Firstlevelbatch,
.AddressSpaceIndicator = ASI_PPGTT,
.BatchBufferStartAddress = (struct anv_address) { first_bbo->bo, 0 },
};
struct anv_batch local_batch = {
.start = last_bbo->bo->map,
.end = last_bbo->bo->map + last_bbo->bo->size,
.relocs = &last_bbo->relocs,
.alloc = &cmd_buffer_from->pool->alloc,
};
__anv_cmd_pack(GEN8_MI_BATCH_BUFFER_START)(&local_batch, bb_start, &gen_bb_start);
last_bbo->chained = true;
}
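/* The body of anv_cmd_buffer_record_end_submit below is cut off by the next
* hunk. A minimal sketch of what such a helper would do, assuming it mirrors
* the chaining helper above but seals the batch with MI_BATCH_BUFFER_END
* instead of jumping to the next buffer:
*
*    struct anv_batch_bo *last_bbo =
*       list_last_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
*    struct GEN8_MI_BATCH_BUFFER_END gen_bb_end = {
*       __anv_cmd_header(GEN8_MI_BATCH_BUFFER_END),
*    };
*    struct anv_batch local_batch = {
*       .start = last_bbo->bo->map,
*       .end = last_bbo->bo->map + last_bbo->bo->size,
*       .relocs = &last_bbo->relocs,
*       .alloc = &cmd_buffer->pool->alloc,
*    };
*    __anv_cmd_pack(GEN8_MI_BATCH_BUFFER_END)(&local_batch,
*                                             cmd_buffer->batch_end,
*                                             &gen_bb_end);
*    last_bbo->chained = false;
*/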
static void
anv_cmd_buffer_record_end_submit(struct anv_cmd_buffer *cmd_buffer)
{
@@ -1135,6 +1166,11 @@ struct anv_execbuf {
/* Allocated length of the 'objects' and 'bos' arrays */
uint32_t array_length;
/* List of relocations for surface states, only used with platforms not
* using softpin.
*/
void * surface_states_relocs;
/* Indicates whether any of the command buffers have relocations. This
* doesn't necessarily mean we'll need the kernel to process them. It
* might be that a previous execbuf has already placed things in the VMA
@@ -1157,6 +1193,7 @@ anv_execbuf_init(struct anv_execbuf *exec)
static void
anv_execbuf_finish(struct anv_execbuf *exec)
{
vk_free(exec->alloc, exec->surface_states_relocs);
vk_free(exec->alloc, exec->objects);
vk_free(exec->alloc, exec->bos);
}
@@ -1434,8 +1471,7 @@ anv_reloc_list_apply(struct anv_device *device,
* have to make a full copy of all the relocation lists.
*/
static bool
execbuf_can_skip_relocations(struct anv_execbuf *exec)
{
if (!exec->has_relocs)
return true;
@@ -1459,6 +1495,13 @@ relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
return false;
}
return true;
}
static void
relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
struct anv_execbuf *exec)
{
/* Since surface states are shared between command buffers and we don't
* know what order they will be submitted to the kernel, we don't know
* what address is actually written in the surface state object at any
@@ -1482,16 +1525,27 @@ relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
for (uint32_t i = 0; i < exec->bo_count; i++)
exec->objects[i].offset = exec->bos[i]->offset;
}
}
static void
reset_cmd_buffer_surface_offsets(struct anv_cmd_buffer *cmd_buffer)
{
/* In the case where we fall back to doing kernel relocations, we need to
* ensure that the relocation list is valid. All relocations on the batch
* buffers are already valid and kept up-to-date. Since surface states are
* shared between command buffers and we don't know what order they will be
* submitted to the kernel, we don't know what address is actually written
* in the surface state object at any given time. The only option is to set
* a bogus presumed offset and let the kernel relocate them.
*/
for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
}
static VkResult
setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
struct anv_cmd_buffer *cmd_buffer)
{
struct anv_state_pool *ss_pool =
&cmd_buffer->device->surface_state_pool;
@@ -1499,58 +1553,10 @@ setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
cmd_buffer->last_ss_pool_center);
VkResult result;
if (cmd_buffer->device->physical->use_softpin) {
/* Add surface dependencies (BOs) to the execbuf */
anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf,
cmd_buffer->surface_relocs.dep_words,
cmd_buffer->surface_relocs.deps, 0);
} else {
/* Since we aren't in the softpin case, all of our STATE_BASE_ADDRESS BOs
* will get added automatically by processing relocations on the batch
@@ -1584,8 +1590,146 @@ setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
*/
cmd_buffer->last_ss_pool_center = ss_pool->block_pool.center_bo_offset;
return VK_SUCCESS;
}
static void
chain_command_buffers(struct anv_cmd_buffer **cmd_buffers,
uint32_t num_cmd_buffers)
{
if (!anv_cmd_buffer_is_chainable(cmd_buffers[0])) {
assert(num_cmd_buffers == 1);
return;
}
/* Chain the first N-1 batch buffers */
for (uint32_t i = 0; i < (num_cmd_buffers - 1); i++)
anv_cmd_buffer_record_chain_submit(cmd_buffers[i], cmd_buffers[i + 1]);
/* Put an end to the last one */
anv_cmd_buffer_record_end_submit(cmd_buffers[num_cmd_buffers - 1]);
}
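/* Illustration (not from the patch): after chaining, the kernel sees one
* logical batch:
*
*    cmd_buffers[0]:   ... MI_BATCH_BUFFER_START --> cmd_buffers[1] first bbo
*    cmd_buffers[1]:   ... MI_BATCH_BUFFER_START --> cmd_buffers[2] first bbo
*    ...
*    cmd_buffers[N-1]: ... MI_BATCH_BUFFER_END
*
* Only the first command buffer's batch is named in the execbuf; execution
* flows through the rewritten tail of each buffer in submission order.
*/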
static VkResult
setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
struct anv_queue *queue,
struct anv_cmd_buffer **cmd_buffers,
uint32_t num_cmd_buffers)
{
struct anv_device *device = queue->device;
struct anv_state_pool *ss_pool = &device->surface_state_pool;
VkResult result;
/* Edit the tail of the command buffers to chain them all together if they
* can be.
*/
chain_command_buffers(cmd_buffers, num_cmd_buffers);
for (uint32_t i = 0; i < num_cmd_buffers; i++) {
result = setup_execbuf_for_cmd_buffer(execbuf, cmd_buffers[i]);
if (result != VK_SUCCESS)
return result;
}
/* Add all the global BOs to the object list for the softpin case. */
if (device->physical->use_softpin) {
anv_block_pool_foreach_bo(bo, &ss_pool->block_pool) {
result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
if (result != VK_SUCCESS)
return result;
}
struct anv_block_pool *pool;
pool = &device->dynamic_state_pool.block_pool;
anv_block_pool_foreach_bo(bo, pool) {
result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
if (result != VK_SUCCESS)
return result;
}
pool = &device->instruction_state_pool.block_pool;
anv_block_pool_foreach_bo(bo, pool) {
result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
if (result != VK_SUCCESS)
return result;
}
pool = &device->binding_table_pool.block_pool;
anv_block_pool_foreach_bo(bo, pool) {
result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
if (result != VK_SUCCESS)
return result;
}
/* Add the BOs for all user allocated memory objects, because we can't
* track them after binding updates with VK_EXT_descriptor_indexing.
*/
list_for_each_entry(struct anv_device_memory, mem,
&device->memory_objects, link) {
result = anv_execbuf_add_bo(device, execbuf, mem->bo, NULL, 0);
if (result != VK_SUCCESS)
return result;
}
} else {
/* We do not support chaining primary command buffers without
* softpin.
*/
assert(num_cmd_buffers == 1);
}
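/* Note on the assert above: chaining patches an MI_BATCH_BUFFER_START with
* an absolute graphics address directly into the batch (see
* anv_cmd_buffer_record_chain_submit()), without adding any relocation
* entry. That address is only known at this point when BOs have fixed
* softpin addresses, so without softpin we must submit one execbuf per
* command buffer.
*/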
bool no_reloc = true;
if (execbuf->has_relocs) {
no_reloc = execbuf_can_skip_relocations(execbuf);
if (no_reloc) {
/* If we were able to successfully relocate everything, tell the
* kernel that it can skip doing relocations. The requirement for
* using NO_RELOC is:
*
* 1) The addresses written in the objects must match the
* corresponding reloc.presumed_offset which in turn must match
* the corresponding execobject.offset.
*
* 2) To avoid stalling, execobject.offset should match the current
* address of that object within the active context.
*
* In order to satisfy all of the invariants that make userspace
* relocations safe (see relocate_cmd_buffer()), we need to
* further ensure that the addresses we use match those used by the
* kernel for the most recent execbuf2.
*
* The kernel may still choose to do relocations anyway if something
* has moved in the GTT. In this case, the relocation list still
* needs to be valid. All relocations on the batch buffers are
* already valid and kept up-to-date. For surface state relocations,
* by applying the relocations in relocate_cmd_buffer, we ensured
* that the address in the RENDER_SURFACE_STATE matches
* presumed_offset, so it should be safe for the kernel to relocate
* them as needed.
*/
for (uint32_t i = 0; i < num_cmd_buffers; i++) {
relocate_cmd_buffer(cmd_buffers[i], execbuf);
anv_reloc_list_apply(device, &cmd_buffers[i]->surface_relocs,
device->surface_state_pool.block_pool.bo,
true /* always relocate surface states */);
}
} else {
/* In the case where we fall back to doing kernel relocations, we
* need to ensure that the relocation list is valid. All relocations
* on the batch buffers are already valid and kept up-to-date. Since
* surface states are shared between command buffers and we don't
* know what order they will be submitted to the kernel, we don't
* know what address is actually written in the surface state object
* at any given time. The only option is to set a bogus presumed
* offset and let the kernel relocate them.
*/
for (uint32_t i = 0; i < num_cmd_buffers; i++)
reset_cmd_buffer_surface_offsets(cmd_buffers[i]);
}
}
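/* Not part of this patch; a minimal sketch of what "applying" a relocation
* in userspace means: writing the target BO's presumed GPU address at the
* recorded offset inside the BO holding the pointer. The entry fields are
* the i915 uapi ones (struct drm_i915_gem_relocation_entry); the
* anv_reloc_list layout is an assumption based on its use above:
*
*    static void
*    apply_relocs_sketch(struct anv_reloc_list *list, void *map)
*    {
*       for (size_t i = 0; i < list->num_relocs; i++) {
*          struct drm_i915_gem_relocation_entry *re = &list->relocs[i];
*          uint64_t addr = re->presumed_offset + re->delta;
*          memcpy(map + re->offset, &addr, sizeof(addr)); // 64-bit on gen8+
*       }
*    }
*/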
struct anv_batch_bo *first_batch_bo =
list_first_entry(&cmd_buffers[0]->batch_bos, struct anv_batch_bo, link);
/* The kernel requires that the last entry in the validation list be the
* batch buffer to execute. We can simply swap the element
@@ -1609,28 +1753,34 @@ setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
}
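/* The swap itself is unchanged context elided between the two hunks; the
* usual pattern, assuming anv_bo tracks its position in the validation list
* in an `index` field, looks like:
*
*    uint32_t idx = first_batch_bo->bo->index;
*    uint32_t last_idx = execbuf->bo_count - 1;
*    struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
*
*    execbuf->objects[idx] = execbuf->objects[last_idx];
*    execbuf->bos[idx] = execbuf->bos[last_idx];
*    execbuf->bos[idx]->index = idx;
*
*    execbuf->objects[last_idx] = tmp_obj;
*    execbuf->bos[last_idx] = first_batch_bo->bo;
*    first_batch_bo->bo->index = last_idx;
*/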
/* If we are pinning our BOs, we shouldn't have to relocate anything */
if (device->physical->use_softpin)
assert(!execbuf->has_relocs);
/* Now we go through and fixup all of the relocation lists to point to the
* correct indices in the object array (I915_EXEC_HANDLE_LUT). We have to
* do this after we reorder the list above as some of the indices may have
* changed.
*/
struct anv_batch_bo **bbo;
if (execbuf->has_relocs) {
assert(num_cmd_buffers == 1);
u_vector_foreach(bbo, &cmd_buffers[0]->seen_bbos)
anv_cmd_buffer_process_relocs(cmd_buffers[0], &(*bbo)->relocs);
anv_cmd_buffer_process_relocs(cmd_buffers[0], &cmd_buffers[0]->surface_relocs);
}
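/* Not from the patch; a sketch of the HANDLE_LUT fixup referred to above.
* With I915_EXEC_HANDLE_LUT, reloc.target_handle is an index into the
* execbuf object array rather than a GEM handle, so every entry must be
* rewritten once the final ordering is known. target_handle is the i915
* uapi field; reloc_bos and bo->index are assumptions about anv's
* bookkeeping:
*
*    for (size_t r = 0; r < list->num_relocs; r++)
*       list->relocs[r].target_handle = list->reloc_bos[r]->index;
*/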
if (!device->info.has_llc) {
__builtin_ia32_mfence();
for (uint32_t i = 0; i < num_cmd_buffers; i++) {
u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) {
for (uint32_t l = 0; l < (*bbo)->length; l += CACHELINE_SIZE)
__builtin_ia32_clflush((*bbo)->bo->map + l);
}
}
}
struct anv_batch *batch = &cmd_buffers[0]->batch;
execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
.buffers_ptr = (uintptr_t) execbuf->objects,
.buffer_count = execbuf->bo_count,
@@ -1640,51 +1790,11 @@ setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
.num_cliprects = 0,
.DR1 = 0,
.DR4 = 0,
.flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | (no_reloc ? I915_EXEC_NO_RELOC : 0),
.rsvd1 = device->context_id,
.rsvd2 = 0,
};
return VK_SUCCESS;
}
@@ -1764,10 +1874,10 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
goto error;
}
if (submit->cmd_buffer_count) {
result = setup_execbuf_for_cmd_buffers(&execbuf, queue,
submit->cmd_buffers,
submit->cmd_buffer_count);
} else if (submit->simple_bo) {
result = anv_execbuf_add_bo(device, &execbuf, submit->simple_bo, NULL, 0);
if (result != VK_SUCCESS)
@@ -1791,14 +1901,14 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
const bool has_perf_query =
submit->perf_query_pass >= 0 &&
submit->cmd_buffer_count &&
submit->perf_query_pool;
if (INTEL_DEBUG & DEBUG_BATCH) {
fprintf(stderr, "Batch on queue %d\n", (int)(queue - device->queues));
if (submit->cmd_buffer_count) {
if (has_perf_query) {
struct anv_query_pool *query_pool = submit->perf_query_pool;
struct anv_bo *pass_batch_bo = query_pool->bo;
uint64_t pass_batch_offset =
khr_perf_query_preamble_offset(query_pool,
@@ -1809,11 +1919,14 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
pass_batch_bo->offset + pass_batch_offset, false);
}
for (uint32_t i = 0; i < submit->cmd_buffer_count; i++) {
struct anv_batch_bo **bo =
u_vector_tail(&submit->cmd_buffers[i]->seen_bbos);
device->cmd_buffer_being_decoded = submit->cmd_buffers[i];
gen_print_batch(&device->decoder_ctx, (*bo)->bo->map,
(*bo)->bo->size, (*bo)->bo->offset, false);
device->cmd_buffer_being_decoded = NULL;
}
} else if (submit->simple_bo) {
gen_print_batch(&device->decoder_ctx, submit->simple_bo->map,
submit->simple_bo->size, submit->simple_bo->offset, false);
@@ -1853,7 +1966,7 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
}
if (has_perf_query) {
struct anv_query_pool *query_pool = submit->perf_query_pool;
assert(submit->perf_query_pass < query_pool->n_passes);
struct gen_perf_query_info *query_info =
query_pool->pass_query[submit->perf_query_pass];

src/intel/vulkan/anv_private.h

@@ -1090,7 +1090,9 @@ VkResult anv_init_wsi(struct anv_physical_device *physical_device);
void anv_finish_wsi(struct anv_physical_device *physical_device);
struct anv_queue_submit {
struct anv_cmd_buffer ** cmd_buffers;
uint32_t cmd_buffer_count;
uint32_t cmd_buffer_array_length;
uint32_t fence_count;
uint32_t fence_array_length;
@@ -1132,6 +1134,7 @@ struct anv_queue_submit {
uintptr_t * fence_bos;
int perf_query_pass;
struct anv_query_pool * perf_query_pool;
const VkAllocationCallbacks * alloc;
VkSystemAllocationScope alloc_scope;

src/intel/vulkan/anv_queue.c

@@ -112,6 +112,7 @@ anv_queue_submit_free(struct anv_device *device,
vk_free(alloc, submit->signal_timelines);
vk_free(alloc, submit->signal_timeline_values);
vk_free(alloc, submit->fence_bos);
vk_free(alloc, submit->cmd_buffers);
vk_free(alloc, submit);
}
@@ -1207,6 +1208,29 @@ anv_post_queue_fence_update(struct anv_device *device, VkFence _fence)
}
}
static VkResult
anv_queue_submit_add_cmd_buffer(struct anv_queue_submit *submit,
struct anv_cmd_buffer *cmd_buffer)
{
if (submit->cmd_buffer_count >= submit->cmd_buffer_array_length) {
uint32_t new_len = MAX2(submit->cmd_buffer_array_length * 2, 4);
struct anv_cmd_buffer **new_cmd_buffers =
vk_realloc(submit->alloc,
submit->cmd_buffers, new_len * sizeof(*submit->cmd_buffers),
8, submit->alloc_scope);
if (new_cmd_buffers == NULL)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
submit->cmd_buffers = new_cmd_buffers;
submit->cmd_buffer_array_length = new_len;
}
submit->cmd_buffers[submit->cmd_buffer_count++] = cmd_buffer;
submit->perf_query_pool = cmd_buffer->perf_query_pool;
return VK_SUCCESS;
}
static VkResult
anv_queue_submit_empty(struct anv_queue *queue,
const VkSemaphore *in_semaphores,
@@ -1362,7 +1386,9 @@ VkResult anv_QueueSubmit(
goto out;
}
result = anv_queue_submit_add_cmd_buffer(submit, cmd_buffer);
if (result != VK_SUCCESS)
goto out;
if (j == 0) {
/* Only the first batch gets the in semaphores */