turnip/perfetto: reusable command buffers support

The limitation is that a reusable command buffer has to be recorded
while perfetto is already connected in order for the timestamps to be
written. Otherwise such a cmd buffer won't be traced.

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Reviewed-by: Rob Clark <robdclark@chromium.org>
Reviewed-by: Hyunjun Ko <zzoon@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10969>
Danylo Piliaiev 2021-06-07 13:16:25 +03:00
parent 0565c993f9
commit 5c6f0d46e7
3 changed files with 187 additions and 27 deletions
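The case the driver keys on below is the absence of VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT. As a minimal application-side sketch of the reusable case this change targets (hypothetical code, not part of this change; cmd_buf is assumed to be an already-allocated VkCommandBuffer):

/* Hypothetical application code: a command buffer recorded without
 * ONE_TIME_SUBMIT may be submitted many times. */
VkCommandBufferBeginInfo begin_info = {
   .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
   .flags = 0, /* no VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT */
};
vkBeginCommandBuffer(cmd_buf, &begin_info);
/* ... record work; turnip can only emit timestamp writes here if
 * perfetto is already connected at record time ... */
vkEndCommandBuffer(cmd_buf);
/* The same cmd_buf may now be submitted repeatedly; the changes below
 * give each submit its own copy of the trace points and timestamps. */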

@@ -1326,10 +1326,86 @@ tu_trace_delete_flush_data(struct u_trace_context *utctx, void *flush_data)
      container_of(utctx, struct tu_device, trace_context);
   struct tu_u_trace_flush_data *trace_flush_data = flush_data;

   tu_u_trace_cmd_data_finish(device, trace_flush_data->cmd_trace_data,
                              trace_flush_data->trace_count);
   vk_free(&device->vk.alloc, trace_flush_data->syncobj);
   vk_free(&device->vk.alloc, trace_flush_data);
}

void
tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
                         void *ts_from, uint32_t from_offset,
                         void *ts_to, uint32_t to_offset,
                         uint32_t count)
{
   struct tu_cs *cs = cmdstream;
   struct tu_bo *bo_from = ts_from;
   struct tu_bo *bo_to = ts_to;

   tu_cs_emit_pkt7(cs, CP_MEMCPY, 5);
   tu_cs_emit(cs, count * sizeof(uint64_t) / sizeof(uint32_t));
   tu_cs_emit_qw(cs, bo_from->iova + from_offset * sizeof(uint64_t));
   tu_cs_emit_qw(cs, bo_to->iova + to_offset * sizeof(uint64_t));
}

VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs **cs,
                            struct u_trace **trace_copy)
{
   *cs = vk_zalloc(&cmdbuf->device->vk.alloc, sizeof(struct tu_cs), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (*cs == NULL) {
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   tu_cs_init(*cs, cmdbuf->device, TU_CS_MODE_GROW,
              list_length(&cmdbuf->trace.trace_chunks) * 6 + 3);

   tu_cs_begin(*cs);

   tu_cs_emit_wfi(*cs);
   tu_cs_emit_pkt7(*cs, CP_WAIT_FOR_ME, 0);

   *trace_copy = vk_zalloc(&cmdbuf->device->vk.alloc, sizeof(struct u_trace), 8,
                           VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (*trace_copy == NULL) {
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   u_trace_init(*trace_copy, cmdbuf->trace.utctx);
   u_trace_clone_append(u_trace_begin_iterator(&cmdbuf->trace),
                        u_trace_end_iterator(&cmdbuf->trace),
                        *trace_copy, *cs,
                        tu_copy_timestamp_buffer);

   tu_cs_emit_wfi(*cs);

   tu_cs_end(*cs);

   return VK_SUCCESS;
}

void
tu_u_trace_cmd_data_finish(struct tu_device *device,
                           struct tu_u_trace_cmd_data *trace_data,
                           uint32_t entry_count)
{
   for (uint32_t i = 0; i < entry_count; ++i) {
      /* Only free the trace if we had to create a copy of it */
      if (trace_data[i].timestamp_copy_cs != NULL) {
         tu_cs_finish(trace_data[i].timestamp_copy_cs);
         vk_free(&device->vk.alloc, trace_data[i].timestamp_copy_cs);

         u_trace_fini(trace_data[i].trace);
         vk_free(&device->vk.alloc, trace_data[i].trace);
      }
   }

   vk_free(&device->vk.alloc, trace_data);
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateDevice(VkPhysicalDevice physicalDevice,
                const VkDeviceCreateInfo *pCreateInfo,

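The CS sizing above (six dwords per trace chunk plus three) presumably corresponds to one CP_MEMCPY packet per chunk (a pkt7 header plus the five payload dwords emitted by tu_copy_timestamp_buffer) plus the WFI, CP_WAIT_FOR_ME and trailing WFI. For orientation, the intended pairing of these helpers, condensed into a hypothetical function (setup_cmd_trace is not in the patch; it mirrors the queue-submit logic in the next file):

/* Sketch only: pick between cloning the trace (reusable command buffer)
 * and aliasing it (one-time submit), as the submit path below does. */
static VkResult
setup_cmd_trace(struct tu_cmd_buffer *cmdbuf, struct tu_u_trace_cmd_data *data)
{
   if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) &&
       u_trace_has_points(&cmdbuf->trace)) {
      /* Reusable: clone the trace points and record a CS that copies
       * this submit's timestamps into a freshly allocated buffer. */
      return tu_create_copy_timestamp_cs(cmdbuf, &data->timestamp_copy_cs,
                                         &data->trace);
   }
   /* One-time submit: the command buffer's own trace can be used as-is. */
   data->timestamp_copy_cs = NULL;
   data->trace = &cmdbuf->trace;
   return VK_SUCCESS;
}

tu_u_trace_cmd_data_finish() later frees only the cloned entries, which it recognizes by a non-NULL timestamp_copy_cs.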
@@ -38,6 +38,8 @@
#include "tu_private.h"
#include "tu_cs.h"
struct tu_binary_syncobj {
   uint32_t permanent, temporary;
};
@@ -85,6 +87,7 @@ struct tu_queue_submit
   struct list_head link;

   VkCommandBuffer *cmd_buffers;
   struct tu_u_trace_cmd_data *cmd_buffer_trace_data;
   uint32_t cmd_buffer_count;

   struct tu_syncobj **wait_semaphores;
@@ -938,6 +941,9 @@ tu_queue_submit_create_locked(struct tu_queue *queue,
      }
   }

   bool u_trace_enabled = u_trace_context_tracing(&queue->device->trace_context);
   bool has_trace_points = false;

   uint32_t entry_count = 0;
   for (uint32_t j = 0; j < new_submit->cmd_buffer_count; ++j) {
      TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, new_submit->cmd_buffers[j]);
@@ -946,6 +952,13 @@ tu_queue_submit_create_locked(struct tu_queue *queue,
         entry_count++;

      entry_count += cmdbuf->cs.entry_count;

      if (u_trace_enabled && u_trace_has_points(&cmdbuf->trace)) {
         if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT))
            entry_count++;

         has_trace_points = true;
      }
   }

   new_submit->cmds = vk_zalloc(&queue->device->vk.alloc,
@@ -957,6 +970,39 @@ tu_queue_submit_create_locked(struct tu_queue *queue,
      goto fail_cmds;
   }

   if (has_trace_points) {
      new_submit->cmd_buffer_trace_data = vk_zalloc(&queue->device->vk.alloc,
            new_submit->cmd_buffer_count * sizeof(struct tu_u_trace_cmd_data), 8,
            VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
      if (new_submit->cmd_buffer_trace_data == NULL) {
         result = vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
         goto fail_cmd_trace_data;
      }

      for (uint32_t i = 0; i < new_submit->cmd_buffer_count; ++i) {
         TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, new_submit->cmd_buffers[i]);
         if (!(cmdbuf->usage_flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) &&
             u_trace_has_points(&cmdbuf->trace)) {
            /* A single command buffer could be submitted several times, but we
             * already baked timestamp iova addresses and trace points are
             * single-use. Therefore we have to copy trace points and create
             * a new timestamp buffer on every submit of a reusable command
             * buffer.
             */
            if (tu_create_copy_timestamp_cs(cmdbuf,
                  &new_submit->cmd_buffer_trace_data[i].timestamp_copy_cs,
                  &new_submit->cmd_buffer_trace_data[i].trace) != VK_SUCCESS) {
               result = vk_error(queue->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
               goto fail_copy_timestamp_cs;
            }
            assert(new_submit->cmd_buffer_trace_data[i].timestamp_copy_cs->entry_count == 1);
         } else {
            new_submit->cmd_buffer_trace_data[i].trace = &cmdbuf->trace;
         }
      }
   }

   /* Allocate without wait timeline semaphores */
   new_submit->in_syncobjs = vk_zalloc(&queue->device->vk.alloc,
         (nr_in_syncobjs - new_submit->wait_timeline_count) *
@@ -992,6 +1038,12 @@ tu_queue_submit_create_locked(struct tu_queue *queue,
fail_out_syncobjs:
   vk_free(&queue->device->vk.alloc, new_submit->in_syncobjs);
fail_in_syncobjs:
   if (new_submit->cmd_buffer_trace_data)
      tu_u_trace_cmd_data_finish(queue->device, new_submit->cmd_buffer_trace_data,
                                 new_submit->cmd_buffer_count);
fail_copy_timestamp_cs:
   vk_free(&queue->device->vk.alloc, new_submit->cmd_buffer_trace_data);
fail_cmd_trace_data:
   vk_free(&queue->device->vk.alloc, new_submit->cmds);
fail_cmds:
fail_signal_timelines:
@@ -1059,6 +1111,23 @@ tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue,
         cmds[entry_idx].nr_relocs = 0;
         cmds[entry_idx].relocs = 0;
      }

      if (submit->cmd_buffer_trace_data) {
         struct tu_cs *ts_cs = submit->cmd_buffer_trace_data[j].timestamp_copy_cs;
         if (ts_cs) {
            cmds[entry_idx].type = MSM_SUBMIT_CMD_BUF;
            cmds[entry_idx].submit_idx =
               queue->device->bo_idx[ts_cs->entries[0].bo->gem_handle];

            assert(cmds[entry_idx].submit_idx < queue->device->bo_count);

            cmds[entry_idx].submit_offset = ts_cs->entries[0].offset;
            cmds[entry_idx].size = ts_cs->entries[0].size;
            cmds[entry_idx].pad = 0;
            cmds[entry_idx].nr_relocs = 0;
            cmds[entry_idx++].relocs = 0;
         }
      }
   }
}
@@ -1137,32 +1206,24 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
      sem->timeline.highest_submitted = signal_value;
   }

-   if (u_trace_context_tracing(&queue->device->trace_context)) {
-      bool has_chunks = false;
+   if (submit->cmd_buffer_trace_data) {
+      struct tu_u_trace_flush_data *flush_data =
+         vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_flush_data),
+                  8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+      flush_data->submission_id = queue->device->submit_count;
+      flush_data->syncobj =
+         vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj),
+                  8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+      flush_data->syncobj->fence = req.fence;
+      flush_data->syncobj->msm_queue_id = queue->msm_queue_id;
+
+      flush_data->cmd_trace_data = submit->cmd_buffer_trace_data;
+      flush_data->trace_count = submit->cmd_buffer_count;
+      submit->cmd_buffer_trace_data = NULL;

-      for (uint32_t i = 0; i < submit->cmd_buffer_count; i++) {
-         TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->cmd_buffers[i]);
-         if (!list_is_empty(&cmdbuf->trace.trace_chunks)) {
-            has_chunks = true;
-            break;
-         }
-      }
-
-      if (has_chunks) {
-         struct tu_u_trace_flush_data *flush_data =
-            vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_flush_data),
-                     8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
-         flush_data->submission_id = queue->device->submit_count;
-         flush_data->syncobj =
-            vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj),
-                     8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
-         flush_data->syncobj->fence = req.fence;
-         flush_data->syncobj->msm_queue_id = queue->msm_queue_id;
-
-         for (uint32_t i = 0; i < submit->cmd_buffer_count; i++) {
-            TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->cmd_buffers[i]);
-            bool free_data = i == (submit->cmd_buffer_count - 1);
-            u_trace_flush(&cmdbuf->trace, flush_data, free_data);
-         }
-      }
+      for (uint32_t i = 0; i < submit->cmd_buffer_count; i++) {
+         bool free_data = i == (submit->cmd_buffer_count - 1);
+         u_trace_flush(flush_data->cmd_trace_data[i].trace, flush_data, free_data);
+      }
   }
@@ -1320,8 +1381,6 @@ tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj
      return VK_TIMEOUT;
   }

-   close(syncobj->fence);
-
   return VK_SUCCESS;
}
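One subtlety in the hunks above: the flush data adopts the per-command-buffer trace array, so the submit object must drop its pointer, otherwise the submit teardown could free an array that the trace flush still references. Restated with comments (same names as the diff):

/* flush_data takes ownership of the trace array; clearing the submit's
 * pointer keeps the submit cleanup path from freeing it again. */
flush_data->cmd_trace_data = submit->cmd_buffer_trace_data;
flush_data->trace_count = submit->cmd_buffer_count;
submit->cmd_buffer_trace_data = NULL;

/* free_data is true only on the last iteration, so flush_data itself is
 * released exactly once (through the delete callback shown in the first
 * file) after all traces have been processed. */
for (uint32_t i = 0; i < submit->cmd_buffer_count; i++) {
   bool free_data = i == (submit->cmd_buffer_count - 1);
   u_trace_flush(flush_data->cmd_trace_data[i].trace, flush_data, free_data);
}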

@@ -1728,10 +1728,35 @@ tu_signal_fences(struct tu_device *device, struct tu_syncobj *fence1, struct tu_
int
tu_syncobj_to_fd(struct tu_device *device, struct tu_syncobj *sync);

void
tu_copy_timestamp_buffer(struct u_trace_context *utctx, void *cmdstream,
                         void *ts_from, uint32_t from_offset,
                         void *ts_to, uint32_t to_offset,
                         uint32_t count);

VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs **cs,
                            struct u_trace **trace_copy);

struct tu_u_trace_cmd_data
{
   struct tu_cs *timestamp_copy_cs;
   struct u_trace *trace;
};

void
tu_u_trace_cmd_data_finish(struct tu_device *device,
                           struct tu_u_trace_cmd_data *trace_data,
                           uint32_t entry_count);

struct tu_u_trace_flush_data
{
   uint32_t submission_id;
   struct tu_u_trace_syncobj *syncobj;
   uint32_t trace_count;
   struct tu_u_trace_cmd_data *cmd_trace_data;
};
#define TU_DEFINE_HANDLE_CASTS(__tu_type, __VkType) \
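A note on the convention these declarations encode: tu_u_trace_cmd_data owns its trace only when the clone path was taken. A hypothetical helper (not in the patch) making the rule that tu_u_trace_cmd_data_finish() relies on explicit:

/* Hypothetical helper, not in the patch: the trace is a heap-allocated
 * clone (owned, must be freed) exactly when a timestamp-copy CS exists;
 * otherwise it aliases the command buffer's own u_trace. */
static inline bool
tu_u_trace_cmd_data_owns_trace(const struct tu_u_trace_cmd_data *data)
{
   return data->timestamp_copy_cs != NULL;
}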