anv: allow multiple command buffers in anv_queue_submit

v2: Fixup crash spotted by Mark about missing alloc vfuncs

v3: Fixup double iteration over device->memory_objects (that ought to
    be expensive...) (Ken)

v4: Add more asserts for non-softpin cases (Ken)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/2371>
Lionel Landwerlin 2020-12-09 13:22:45 +02:00
parent 882fc72442
commit 83fee30e85
3 changed files with 265 additions and 123 deletions

src/intel/vulkan/anv_batch_chain.c

@@ -614,6 +614,37 @@ cmd_buffer_chain_to_batch_bo(struct anv_cmd_buffer *cmd_buffer,
anv_batch_bo_finish(current_bbo, batch);
}
static void
anv_cmd_buffer_record_chain_submit(struct anv_cmd_buffer *cmd_buffer_from,
struct anv_cmd_buffer *cmd_buffer_to)
{
assert(cmd_buffer_from->device->physical->use_softpin);
uint32_t *bb_start = cmd_buffer_from->batch_end;
struct anv_batch_bo *last_bbo =
list_last_entry(&cmd_buffer_from->batch_bos, struct anv_batch_bo, link);
struct anv_batch_bo *first_bbo =
list_first_entry(&cmd_buffer_to->batch_bos, struct anv_batch_bo, link);
struct GEN8_MI_BATCH_BUFFER_START gen_bb_start = {
__anv_cmd_header(GEN8_MI_BATCH_BUFFER_START),
.SecondLevelBatchBuffer = Firstlevelbatch,
.AddressSpaceIndicator = ASI_PPGTT,
.BatchBufferStartAddress = (struct anv_address) { first_bbo->bo, 0 },
};
struct anv_batch local_batch = {
.start = last_bbo->bo->map,
.end = last_bbo->bo->map + last_bbo->bo->size,
.relocs = &last_bbo->relocs,
.alloc = &cmd_buffer_from->pool->alloc,
};
__anv_cmd_pack(GEN8_MI_BATCH_BUFFER_START)(&local_batch, bb_start, &gen_bb_start);
last_bbo->chained = true;
}
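/* The body of anv_cmd_buffer_record_end_submit below is cut off by the next
* hunk. A minimal sketch of what such a helper would do, assuming it mirrors
* the chaining helper above but seals the batch with MI_BATCH_BUFFER_END
* instead of jumping to the next buffer:
*
*    struct anv_batch_bo *last_bbo =
*       list_last_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link);
*    struct GEN8_MI_BATCH_BUFFER_END gen_bb_end = {
*       __anv_cmd_header(GEN8_MI_BATCH_BUFFER_END),
*    };
*    struct anv_batch local_batch = {
*       .start = last_bbo->bo->map,
*       .end = last_bbo->bo->map + last_bbo->bo->size,
*       .relocs = &last_bbo->relocs,
*       .alloc = &cmd_buffer->pool->alloc,
*    };
*    __anv_cmd_pack(GEN8_MI_BATCH_BUFFER_END)(&local_batch,
*                                             cmd_buffer->batch_end,
*                                             &gen_bb_end);
*    last_bbo->chained = false;
*/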
static void
anv_cmd_buffer_record_end_submit(struct anv_cmd_buffer *cmd_buffer)
{
@@ -1135,6 +1166,11 @@ struct anv_execbuf {
/* Allocated length of the 'objects' and 'bos' arrays */
uint32_t array_length;
/* List of relocations for surface states, only used with platforms not
* using softpin.
*/
void * surface_states_relocs;
/* Indicates whether any of the command buffers have relocations. This
* doesn't necessarily mean we'll need the kernel to process them. It
* might be that a previous execbuf has already placed things in the VMA
@@ -1157,6 +1193,7 @@ anv_execbuf_init(struct anv_execbuf *exec)
static void
anv_execbuf_finish(struct anv_execbuf *exec)
{
vk_free(exec->alloc, exec->surface_states_relocs);
vk_free(exec->alloc, exec->objects);
vk_free(exec->alloc, exec->bos);
}
@@ -1434,8 +1471,7 @@ anv_reloc_list_apply(struct anv_device *device,
* have to make a full copy of all the relocation lists.
*/
static bool
execbuf_can_skip_relocations(struct anv_execbuf *exec)
{
if (!exec->has_relocs)
return true;
@@ -1459,6 +1495,13 @@ relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
return false;
}
return true;
}
static void
relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
struct anv_execbuf *exec)
{
/* Since surface states are shared between command buffers and we don't
* know what order they will be submitted to the kernel, we don't know
* what address is actually written in the surface state object at any
@@ -1482,16 +1525,27 @@ relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
for (uint32_t i = 0; i < exec->bo_count; i++)
exec->objects[i].offset = exec->bos[i]->offset;
}
}
static void
reset_cmd_buffer_surface_offsets(struct anv_cmd_buffer *cmd_buffer)
{
/* In the case where we fall back to doing kernel relocations, we need to
* ensure that the relocation list is valid. All relocations on the batch
* buffers are already valid and kept up-to-date. Since surface states are
* shared between command buffers and we don't know what order they will be
* submitted to the kernel, we don't know what address is actually written
* in the surface state object at any given time. The only option is to set
* a bogus presumed offset and let the kernel relocate them.
*/
for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++)
cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1;
}
static VkResult
setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
struct anv_cmd_buffer *cmd_buffer)
{
struct anv_state_pool *ss_pool =
&cmd_buffer->device->surface_state_pool;
@@ -1499,58 +1553,10 @@ setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
cmd_buffer->last_ss_pool_center);
VkResult result;
if (cmd_buffer->device->physical->use_softpin) {
/* Add surface dependencies (BOs) to the execbuf */
anv_execbuf_add_bo_bitset(cmd_buffer->device, execbuf,
cmd_buffer->surface_relocs.dep_words,
cmd_buffer->surface_relocs.deps, 0);
} else {
/* Since we aren't in the softpin case, all of our STATE_BASE_ADDRESS BOs
* will get added automatically by processing relocations on the batch
@@ -1584,8 +1590,146 @@ setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
*/
cmd_buffer->last_ss_pool_center = ss_pool->block_pool.center_bo_offset;
return VK_SUCCESS;
}
static void
chain_command_buffers(struct anv_cmd_buffer **cmd_buffers,
uint32_t num_cmd_buffers)
{
if (!anv_cmd_buffer_is_chainable(cmd_buffers[0])) {
assert(num_cmd_buffers == 1);
return;
}
/* Chain the first N-1 batch buffers */
for (uint32_t i = 0; i < (num_cmd_buffers - 1); i++)
anv_cmd_buffer_record_chain_submit(cmd_buffers[i], cmd_buffers[i + 1]);
/* Put an end to the last one */
anv_cmd_buffer_record_end_submit(cmd_buffers[num_cmd_buffers - 1]);
}
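/* Illustration (not from the patch): after chaining, the kernel sees one
* logical batch:
*
*    cmd_buffers[0]:   ... MI_BATCH_BUFFER_START --> cmd_buffers[1] first bbo
*    cmd_buffers[1]:   ... MI_BATCH_BUFFER_START --> cmd_buffers[2] first bbo
*    ...
*    cmd_buffers[N-1]: ... MI_BATCH_BUFFER_END
*
* Only the first command buffer's batch is named in the execbuf; execution
* flows through the rewritten tail of each buffer in submission order.
*/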
static VkResult
setup_execbuf_for_cmd_buffers(struct anv_execbuf *execbuf,
struct anv_queue *queue,
struct anv_cmd_buffer **cmd_buffers,
uint32_t num_cmd_buffers)
{
struct anv_device *device = queue->device;
struct anv_state_pool *ss_pool = &device->surface_state_pool;
VkResult result;
/* Edit the tail of the command buffers to chain them all together if they
* can be.
*/
chain_command_buffers(cmd_buffers, num_cmd_buffers);
for (uint32_t i = 0; i < num_cmd_buffers; i++) {
result = setup_execbuf_for_cmd_buffer(execbuf, cmd_buffers[i]);
if (result != VK_SUCCESS)
return result;
}
/* Add all the global BOs to the object list for the softpin case. */
if (device->physical->use_softpin) {
anv_block_pool_foreach_bo(bo, &ss_pool->block_pool) {
result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
if (result != VK_SUCCESS)
return result;
}
struct anv_block_pool *pool;
pool = &device->dynamic_state_pool.block_pool;
anv_block_pool_foreach_bo(bo, pool) {
result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
if (result != VK_SUCCESS)
return result;
}
pool = &device->instruction_state_pool.block_pool;
anv_block_pool_foreach_bo(bo, pool) {
result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
if (result != VK_SUCCESS)
return result;
}
pool = &device->binding_table_pool.block_pool;
anv_block_pool_foreach_bo(bo, pool) {
result = anv_execbuf_add_bo(device, execbuf, bo, NULL, 0);
if (result != VK_SUCCESS)
return result;
}
/* Add the BOs for all user allocated memory objects, because we can't
* track them after binding updates with VK_EXT_descriptor_indexing.
*/
list_for_each_entry(struct anv_device_memory, mem,
&device->memory_objects, link) {
result = anv_execbuf_add_bo(device, execbuf, mem->bo, NULL, 0);
if (result != VK_SUCCESS)
return result;
}
} else {
/* We do not support chaining primary command buffers without
* softpin.
*/
assert(num_cmd_buffers == 1);
}
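/* Note on the assert above: chaining patches an MI_BATCH_BUFFER_START with
* an absolute graphics address directly into the batch (see
* anv_cmd_buffer_record_chain_submit()), without adding any relocation
* entry. That address is only known at this point when BOs have fixed
* softpin addresses, so without softpin we must submit one execbuf per
* command buffer.
*/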
bool no_reloc = true;
if (execbuf->has_relocs) {
no_reloc = execbuf_can_skip_relocations(execbuf);
if (no_reloc) {
/* If we were able to successfully relocate everything, tell the
* kernel that it can skip doing relocations. The requirement for
* using NO_RELOC is:
*
* 1) The addresses written in the objects must match the
* corresponding reloc.presumed_offset which in turn must match
* the corresponding execobject.offset.
*
* 2) To avoid stalling, execobject.offset should match the current
* address of that object within the active context.
*
* In order to satisfy all of the invariants that make userspace
* relocations safe (see relocate_cmd_buffer()), we need to
* further ensure that the addresses we use match those used by the
* kernel for the most recent execbuf2.
*
* The kernel may still choose to do relocations anyway if something
* has moved in the GTT. In this case, the relocation list still
* needs to be valid. All relocations on the batch buffers are
* already valid and kept up-to-date. For surface state relocations,
* by applying the relocations in relocate_cmd_buffer, we ensured
* that the address in the RENDER_SURFACE_STATE matches
* presumed_offset, so it should be safe for the kernel to relocate
* them as needed.
*/
for (uint32_t i = 0; i < num_cmd_buffers; i++) {
relocate_cmd_buffer(cmd_buffers[i], execbuf);
anv_reloc_list_apply(device, &cmd_buffers[i]->surface_relocs,
device->surface_state_pool.block_pool.bo,
true /* always relocate surface states */);
}
} else {
/* In the case where we fall back to doing kernel relocations, we
* need to ensure that the relocation list is valid. All relocations
* on the batch buffers are already valid and kept up-to-date. Since
* surface states are shared between command buffers and we don't
* know what order they will be submitted to the kernel, we don't
* know what address is actually written in the surface state object
* at any given time. The only option is to set a bogus presumed
* offset and let the kernel relocate them.
*/
for (uint32_t i = 0; i < num_cmd_buffers; i++)
reset_cmd_buffer_surface_offsets(cmd_buffers[i]);
}
}
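/* Not part of this patch; a minimal sketch of what "applying" a relocation
* in userspace means: writing the target BO's presumed GPU address at the
* recorded offset inside the BO holding the pointer. The entry fields are
* the i915 uapi ones (struct drm_i915_gem_relocation_entry); the
* anv_reloc_list layout is an assumption based on its use above:
*
*    static void
*    apply_relocs_sketch(struct anv_reloc_list *list, void *map)
*    {
*       for (size_t i = 0; i < list->num_relocs; i++) {
*          struct drm_i915_gem_relocation_entry *re = &list->relocs[i];
*          uint64_t addr = re->presumed_offset + re->delta;
*          memcpy(map + re->offset, &addr, sizeof(addr)); // 64-bit on gen8+
*       }
*    }
*/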
struct anv_batch_bo *first_batch_bo =
list_first_entry(&cmd_buffers[0]->batch_bos, struct anv_batch_bo, link);
/* The kernel requires that the last entry in the validation list be the
* batch buffer to execute. We can simply swap the element
@@ -1609,28 +1753,34 @@ setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
}
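/* The swap itself is unchanged context elided between the two hunks; the
* usual pattern, assuming anv_bo tracks its position in the validation list
* in an `index` field, looks like:
*
*    uint32_t idx = first_batch_bo->bo->index;
*    uint32_t last_idx = execbuf->bo_count - 1;
*    struct drm_i915_gem_exec_object2 tmp_obj = execbuf->objects[idx];
*
*    execbuf->objects[idx] = execbuf->objects[last_idx];
*    execbuf->bos[idx] = execbuf->bos[last_idx];
*    execbuf->bos[idx]->index = idx;
*
*    execbuf->objects[last_idx] = tmp_obj;
*    execbuf->bos[last_idx] = first_batch_bo->bo;
*    first_batch_bo->bo->index = last_idx;
*/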
/* If we are pinning our BOs, we shouldn't have to relocate anything */
if (device->physical->use_softpin)
assert(!execbuf->has_relocs);
/* Now we go through and fixup all of the relocation lists to point to the
* correct indices in the object array (I915_EXEC_HANDLE_LUT). We have to
* do this after we reorder the list above as some of the indices may have
* changed.
*/
struct anv_batch_bo **bbo;
if (execbuf->has_relocs) {
assert(num_cmd_buffers == 1);
u_vector_foreach(bbo, &cmd_buffers[0]->seen_bbos)
anv_cmd_buffer_process_relocs(cmd_buffers[0], &(*bbo)->relocs);
anv_cmd_buffer_process_relocs(cmd_buffers[0], &cmd_buffers[0]->surface_relocs);
}
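/* Not from the patch; a sketch of the HANDLE_LUT fixup referred to above.
* With I915_EXEC_HANDLE_LUT, reloc.target_handle is an index into the
* execbuf object array rather than a GEM handle, so every entry must be
* rewritten once the final ordering is known. target_handle is the i915
* uapi field; reloc_bos and bo->index are assumptions about anv's
* bookkeeping:
*
*    for (size_t r = 0; r < list->num_relocs; r++)
*       list->relocs[r].target_handle = list->reloc_bos[r]->index;
*/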
if (!device->info.has_llc) {
__builtin_ia32_mfence();
for (uint32_t i = 0; i < num_cmd_buffers; i++) {
u_vector_foreach(bbo, &cmd_buffers[i]->seen_bbos) {
for (uint32_t l = 0; l < (*bbo)->length; l += CACHELINE_SIZE)
__builtin_ia32_clflush((*bbo)->bo->map + l);
}
}
}
struct anv_batch *batch = &cmd_buffers[0]->batch;
execbuf->execbuf = (struct drm_i915_gem_execbuffer2) {
.buffers_ptr = (uintptr_t) execbuf->objects,
.buffer_count = execbuf->bo_count,
@@ -1640,51 +1790,11 @@ setup_execbuf_for_cmd_buffer(struct anv_execbuf *execbuf,
.num_cliprects = 0,
.DR1 = 0,
.DR4 = 0,
.flags = I915_EXEC_HANDLE_LUT | queue->exec_flags | (no_reloc ? I915_EXEC_NO_RELOC : 0),
.rsvd1 = device->context_id,
.rsvd2 = 0,
};
return VK_SUCCESS;
}
@@ -1764,10 +1874,10 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
goto error;
}
if (submit->cmd_buffer_count) {
result = setup_execbuf_for_cmd_buffers(&execbuf, queue,
submit->cmd_buffers,
submit->cmd_buffer_count);
} else if (submit->simple_bo) {
result = anv_execbuf_add_bo(device, &execbuf, submit->simple_bo, NULL, 0);
if (result != VK_SUCCESS)
@@ -1791,14 +1901,14 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
const bool has_perf_query =
submit->perf_query_pass >= 0 &&
submit->cmd_buffer_count &&
submit->perf_query_pool;
if (INTEL_DEBUG & DEBUG_BATCH) {
fprintf(stderr, "Batch on queue %d\n", (int)(queue - device->queues));
if (submit->cmd_buffer_count) {
if (has_perf_query) {
struct anv_query_pool *query_pool = submit->perf_query_pool;
struct anv_bo *pass_batch_bo = query_pool->bo;
uint64_t pass_batch_offset =
khr_perf_query_preamble_offset(query_pool,
@@ -1809,11 +1919,14 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
pass_batch_bo->offset + pass_batch_offset, false);
}
for (uint32_t i = 0; i < submit->cmd_buffer_count; i++) {
struct anv_batch_bo **bo =
u_vector_tail(&submit->cmd_buffers[i]->seen_bbos);
device->cmd_buffer_being_decoded = submit->cmd_buffers[i];
gen_print_batch(&device->decoder_ctx, (*bo)->bo->map,
(*bo)->bo->size, (*bo)->bo->offset, false);
device->cmd_buffer_being_decoded = NULL;
}
} else if (submit->simple_bo) {
gen_print_batch(&device->decoder_ctx, submit->simple_bo->map,
submit->simple_bo->size, submit->simple_bo->offset, false);
@@ -1853,7 +1966,7 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
}
if (has_perf_query) {
struct anv_query_pool *query_pool = submit->perf_query_pool;
assert(submit->perf_query_pass < query_pool->n_passes);
struct gen_perf_query_info *query_info =
query_pool->pass_query[submit->perf_query_pass];

src/intel/vulkan/anv_private.h

@@ -1090,7 +1090,9 @@ VkResult anv_init_wsi(struct anv_physical_device *physical_device);
void anv_finish_wsi(struct anv_physical_device *physical_device);
struct anv_queue_submit {
struct anv_cmd_buffer ** cmd_buffers;
uint32_t cmd_buffer_count;
uint32_t cmd_buffer_array_length;
uint32_t fence_count;
uint32_t fence_array_length;
@@ -1132,6 +1134,7 @@ struct anv_queue_submit {
uintptr_t * fence_bos;
int perf_query_pass;
struct anv_query_pool * perf_query_pool;
const VkAllocationCallbacks * alloc;
VkSystemAllocationScope alloc_scope;

src/intel/vulkan/anv_queue.c

@@ -112,6 +112,7 @@ anv_queue_submit_free(struct anv_device *device,
vk_free(alloc, submit->signal_timelines);
vk_free(alloc, submit->signal_timeline_values);
vk_free(alloc, submit->fence_bos);
vk_free(alloc, submit->cmd_buffers);
vk_free(alloc, submit);
}
@@ -1207,6 +1208,29 @@ anv_post_queue_fence_update(struct anv_device *device, VkFence _fence)
}
}
static VkResult
anv_queue_submit_add_cmd_buffer(struct anv_queue_submit *submit,
struct anv_cmd_buffer *cmd_buffer)
{
if (submit->cmd_buffer_count >= submit->cmd_buffer_array_length) {
uint32_t new_len = MAX2(submit->cmd_buffer_array_length * 2, 4);
struct anv_cmd_buffer **new_cmd_buffers =
vk_realloc(submit->alloc,
submit->cmd_buffers, new_len * sizeof(*submit->cmd_buffers),
8, submit->alloc_scope);
if (new_cmd_buffers == NULL)
return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
submit->cmd_buffers = new_cmd_buffers;
submit->cmd_buffer_array_length = new_len;
}
submit->cmd_buffers[submit->cmd_buffer_count++] = cmd_buffer;
submit->perf_query_pool = cmd_buffer->perf_query_pool;
return VK_SUCCESS;
}
static VkResult
anv_queue_submit_empty(struct anv_queue *queue,
const VkSemaphore *in_semaphores,
@@ -1362,7 +1386,9 @@ VkResult anv_QueueSubmit(
goto out;
}
result = anv_queue_submit_add_cmd_buffer(submit, cmd_buffer);
if (result != VK_SUCCESS)
goto out;
if (j == 0) {
/* Only the first batch gets the in semaphores */