turnip: implement basic perfetto support

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Reviewed-by: Rob Clark <robdclark@chromium.org>
Reviewed-by: Hyunjun Ko <zzoon@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10969>
Danylo Piliaiev 2021-05-24 19:54:47 +03:00
parent cefaa73909
commit 3dd1bb6355
12 changed files with 851 additions and 1 deletion

src/freedreno/vulkan/meson.build

@ -50,6 +50,7 @@ libtu_files = files(
'tu_shader.c',
'tu_util.c',
'tu_util.h',
'tu_perfetto.h',
'vk_format.h',
)
@ -106,9 +107,28 @@ else
tu_deps += dep_libdrm
endif
tu_tracepoints = custom_target(
'tu_tracepoints.[ch]',
input: 'tu_tracepoints.py',
output: ['tu_tracepoints.c', 'tu_tracepoints.h', 'tu_tracepoints_perfetto.h'],
command: [
prog_python, '@INPUT@',
'-p', join_paths(meson.source_root(), 'src/util/perf/'),
'--utrace-src', '@OUTPUT0@',
'--utrace-hdr', '@OUTPUT1@',
'--perfetto-hdr', '@OUTPUT2@',
],
depend_files: u_trace_py,
)
if with_perfetto
libtu_files += ['tu_perfetto.cc', 'tu_perfetto_util.c']
tu_deps += dep_perfetto
endif
libvulkan_freedreno = shared_library(
'vulkan_freedreno',
-  [libtu_files, tu_entrypoints, freedreno_xml_header_files],
+  [libtu_files, tu_entrypoints, tu_tracepoints, freedreno_xml_header_files],
include_directories : [
inc_include,
inc_src,

src/freedreno/vulkan/tu_clear_blit.c

@ -19,6 +19,8 @@
#include "util/half_float.h"
#include "compiler/nir/nir_builder.h"
#include "tu_tracepoints.h"
static uint32_t
tu_pack_float32_for_unorm(float val, int bits)
{
@ -1370,6 +1372,8 @@ tu6_blit_image(struct tu_cmd_buffer *cmd,
unreachable("unexpected D32_S8 aspect mask in blit_image");
}
trace_start_blit(&cmd->trace);
ops->setup(cmd, cs, format, info->dstSubresource.aspectMask,
blit_param, false, dst_image->layout[0].ubwc,
dst_image->layout[0].nr_samples);
@ -1418,6 +1422,12 @@ tu6_blit_image(struct tu_cmd_buffer *cmd,
}
ops->teardown(cmd, cs);
trace_end_blit(&cmd->trace,
ops == &r3d_ops,
src_image->vk_format,
dst_image->vk_format,
layers);
}
VKAPI_ATTR void VKAPI_CALL
@ -2032,6 +2042,8 @@ resolve_sysmem(struct tu_cmd_buffer *cmd,
{
const struct blit_ops *ops = &r2d_ops;
trace_start_resolve(&cmd->trace);
ops->setup(cmd, cs, format, VK_IMAGE_ASPECT_COLOR_BIT,
0, false, dst->ubwc_enabled, VK_SAMPLE_COUNT_1_BIT);
ops->coords(cs, &rect->offset, &rect->offset, &rect->extent);
@ -2048,6 +2060,8 @@ resolve_sysmem(struct tu_cmd_buffer *cmd,
}
ops->teardown(cmd, cs);
trace_end_resolve(&cmd->trace);
}
void

src/freedreno/vulkan/tu_cmd_buffer.c

@ -35,6 +35,8 @@
#include "tu_cs.h"
#include "tu_tracepoints.h"
void
tu6_emit_event_write(struct tu_cmd_buffer *cmd,
struct tu_cs *cs,
@ -987,9 +989,13 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
tu_cs_emit_regs(cs,
A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));
trace_start_binning_ib(&cmd->trace);
/* emit IB to binning drawcmds: */
tu_cs_emit_call(cs, &cmd->draw_cs);
trace_end_binning_ib(&cmd->trace);
/* switching from binning pass to GMEM pass will cause a switch from
* PROGRAM_BINNING to PROGRAM, which invalidates const state (XS_CONST states)
* so make sure these states are re-emitted
@ -1336,13 +1342,18 @@ tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
for (uint32_t ty = ty1; ty < ty2; ty++) {
for (uint32_t tx = tx1; tx < tx2; tx++, slot++) {
tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot);
trace_start_draw_ib_gmem(&cmd->trace);
tu6_render_tile(cmd, &cmd->cs);
trace_end_draw_ib_gmem(&cmd->trace);
}
}
}
}
tu6_tile_render_end(cmd, &cmd->cs);
trace_end_render_pass(&cmd->trace, fb);
}
static void
@ -1350,9 +1361,15 @@ tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
{
tu6_sysmem_render_begin(cmd, &cmd->cs);
trace_start_draw_ib_sysmem(&cmd->trace);
tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);
trace_end_draw_ib_sysmem(&cmd->trace);
tu6_sysmem_render_end(cmd, &cmd->cs);
trace_end_render_pass(&cmd->trace, cmd->state.framebuffer);
}
static VkResult
@ -1384,6 +1401,8 @@ tu_create_cmd_buffer(struct tu_device *device,
cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
}
u_trace_init(&cmd_buffer->trace, &device->trace_context);
tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048);
@ -1406,6 +1425,8 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
tu_cs_finish(&cmd_buffer->sub_cs);
u_trace_fini(&cmd_buffer->trace);
vk_object_free(&cmd_buffer->device->vk, &cmd_buffer->pool->alloc, cmd_buffer);
}
@ -1425,6 +1446,9 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
memset(&cmd_buffer->descriptors[i].push_set, 0, sizeof(cmd_buffer->descriptors[i].push_set));
}
u_trace_fini(&cmd_buffer->trace);
u_trace_init(&cmd_buffer->trace, &cmd_buffer->device->trace_context);
cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;
return cmd_buffer->record_result;
@ -3069,6 +3093,8 @@ tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
cmd->state.framebuffer = fb;
cmd->state.render_area = pRenderPassBegin->renderArea;
trace_start_render_pass(&cmd->trace);
/* Note: because this is external, any flushes will happen before draw_cs
* gets called. However deferred flushes could have to happen later as part
* of the subpass.
@ -4468,6 +4494,8 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
A6XX_HLSQ_CS_KERNEL_GROUP_Y(1),
A6XX_HLSQ_CS_KERNEL_GROUP_Z(1));
trace_start_compute(&cmd->trace);
if (info->indirect) {
uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;
@ -4486,6 +4514,11 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
}
trace_end_compute(&cmd->trace,
info->indirect != NULL,
local_size[0], local_size[1], local_size[2],
info->blocks[0], info->blocks[1], info->blocks[2]);
tu_cs_emit_wfi(cs);
}

src/freedreno/vulkan/tu_device.c

@ -347,6 +347,10 @@ tu_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
*pInstance = tu_instance_to_handle(instance);
#ifdef HAVE_PERFETTO
tu_perfetto_init();
#endif
return VK_SUCCESS;
}
@ -1240,6 +1244,92 @@ tu_queue_finish(struct tu_queue *queue)
tu_drm_submitqueue_close(queue->device, queue->msm_queue_id);
}
uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts)
{
/* This is based on the 19.2MHz always-on rbbm timer.
*
* TODO we should probably query this value from kernel..
*/
return ts * (1000000000 / 19200000);
}
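(Editor's aside, not part of the diff: a 19.2 MHz tick lasts 1 / 19,200,000 s, about 52.08 ns, while the integer expression 1000000000 / 19200000 evaluates to 52, so the converted timestamps run roughly 0.16% slow. That is close enough for trace visualization, and the TODO above already notes the rate should eventually be queried from the kernel.)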
static void*
tu_trace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size)
{
struct tu_device *device =
container_of(utctx, struct tu_device, trace_context);
struct tu_bo *bo = ralloc(NULL, struct tu_bo);
tu_bo_init_new(device, bo, size, false);
return bo;
}
static void
tu_trace_destroy_ts_buffer(struct u_trace_context *utctx, void *timestamps)
{
struct tu_device *device =
container_of(utctx, struct tu_device, trace_context);
struct tu_bo *bo = timestamps;
tu_bo_finish(device, bo);
ralloc_free(bo);
}
static void
tu_trace_record_ts(struct u_trace *ut, void *timestamps,
unsigned idx)
{
struct tu_cmd_buffer *cmd = container_of(ut, struct tu_cmd_buffer, trace);
struct tu_bo *bo = timestamps;
struct tu_cs *cs = &cmd->cs;
unsigned ts_offset = idx * sizeof(uint64_t);
tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP);
tu_cs_emit_qw(cs, bo->iova + ts_offset);
tu_cs_emit(cs, 0x00000000);
}
static uint64_t
tu_trace_read_ts(struct u_trace_context *utctx,
void *timestamps, unsigned idx, void *flush_data)
{
struct tu_device *device =
container_of(utctx, struct tu_device, trace_context);
struct tu_bo *bo = timestamps;
struct tu_u_trace_flush_data *trace_flush_data = flush_data;
/* Only need to stall on results for the first entry: */
if (idx == 0) {
tu_device_wait_u_trace(device, trace_flush_data->syncobj);
}
if (tu_bo_map(device, bo) != VK_SUCCESS) {
return U_TRACE_NO_TIMESTAMP;
}
uint64_t *ts = bo->map;
/* Don't translate the no-timestamp marker: */
if (ts[idx] == U_TRACE_NO_TIMESTAMP)
return U_TRACE_NO_TIMESTAMP;
return tu_device_ticks_to_ns(device, ts[idx]);
}
static void
tu_trace_delete_flush_data(struct u_trace_context *utctx, void *flush_data)
{
struct tu_device *device =
container_of(utctx, struct tu_device, trace_context);
struct tu_u_trace_flush_data *trace_flush_data = flush_data;
vk_free(&device->vk.alloc, trace_flush_data->syncobj);
vk_free(&device->vk.alloc, trace_flush_data);
}
VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateDevice(VkPhysicalDevice physicalDevice,
const VkDeviceCreateInfo *pCreateInfo,
@ -1480,6 +1570,14 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
mtx_init(&device->mutex, mtx_plain);
device->submit_count = 0;
u_trace_context_init(&device->trace_context, device,
tu_trace_create_ts_buffer,
tu_trace_destroy_ts_buffer,
tu_trace_record_ts,
tu_trace_read_ts,
tu_trace_delete_flush_data);
*pDevice = tu_device_to_handle(device);
return VK_SUCCESS;
@ -1521,6 +1619,8 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
if (!device)
return;
u_trace_context_fini(&device->trace_context);
for (unsigned i = 0; i < TU_MAX_QUEUE_FAMILIES; i++) {
for (unsigned q = 0; q < device->queue_count[i]; q++)
tu_queue_finish(&device->queues[i][q]);

src/freedreno/vulkan/tu_drm.c

@ -34,6 +34,7 @@
#include "drm-uapi/msm_drm.h"
#include "util/timespec.h"
#include "util/os_time.h"
#include "util/perf/u_trace.h"
#include "tu_private.h"
@ -112,6 +113,12 @@ struct tu_queue_submit
uint32_t counter_pass_index;
};
struct tu_u_trace_syncobj
{
uint32_t msm_queue_id;
uint32_t fence;
};
static int
tu_drm_get_param(const struct tu_physical_device *dev,
uint32_t param,
@ -165,6 +172,12 @@ tu_drm_get_gmem_base(const struct tu_physical_device *dev, uint64_t *base)
return tu_drm_get_param(dev, MSM_PARAM_GMEM_BASE, base);
}
int
tu_drm_get_timestamp(struct tu_physical_device *device, uint64_t *ts)
{
return tu_drm_get_param(device, MSM_PARAM_TIMESTAMP, ts);
}
int
tu_drm_submitqueue_new(const struct tu_device *dev,
int priority,
@ -1052,6 +1065,12 @@ tu_queue_build_msm_gem_submit_cmds(struct tu_queue *queue,
static VkResult
tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
{
queue->device->submit_count++;
#if HAVE_PERFETTO
tu_perfetto_submit(queue->device, queue->device->submit_count);
#endif
uint32_t flags = MSM_PIPE_3D0;
if (submit->nr_in_syncobjs)
@ -1118,6 +1137,35 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
sem->timeline.highest_submitted = signal_value;
}
if (u_trace_context_tracing(&queue->device->trace_context)) {
bool has_chunks = false;
for (uint32_t i = 0; i < submit->cmd_buffer_count; i++) {
TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->cmd_buffers[i]);
if (!list_is_empty(&cmdbuf->trace.trace_chunks)) {
has_chunks = true;
break;
}
}
if (has_chunks) {
struct tu_u_trace_flush_data *flush_data =
vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_flush_data),
8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
flush_data->submission_id = queue->device->submit_count;
flush_data->syncobj =
vk_alloc(&queue->device->vk.alloc, sizeof(struct tu_u_trace_syncobj),
8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
flush_data->syncobj->fence = req.fence;
flush_data->syncobj->msm_queue_id = queue->msm_queue_id;
for (uint32_t i = 0; i < submit->cmd_buffer_count; i++) {
TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, submit->cmd_buffers[i]);
bool free_data = i == (submit->cmd_buffer_count - 1);
u_trace_flush(&cmdbuf->trace, flush_data, free_data);
}
}
}
pthread_cond_broadcast(&queue->device->timeline_cond);
return VK_SUCCESS;
@ -1246,6 +1294,37 @@ tu_device_submit_deferred_locked(struct tu_device *dev)
return result;
}
static inline void
get_abs_timeout(struct drm_msm_timespec *tv, uint64_t ns)
{
struct timespec t;
clock_gettime(CLOCK_MONOTONIC, &t);
tv->tv_sec = t.tv_sec + ns / 1000000000;
tv->tv_nsec = t.tv_nsec + ns % 1000000000;
}
VkResult
tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj)
{
struct drm_msm_wait_fence req = {
.fence = syncobj->fence,
.queueid = syncobj->msm_queue_id,
};
int ret;
get_abs_timeout(&req.timeout, 1000000000);
ret = drmCommandWrite(dev->fd, DRM_MSM_WAIT_FENCE, &req, sizeof(req));
if (ret && (ret != -ETIMEDOUT)) {
fprintf(stderr, "wait-fence failed! %d (%s)", ret, strerror(errno));
return VK_TIMEOUT;
}
close(syncobj->fence);
return VK_SUCCESS;
}
VKAPI_ATTR VkResult VKAPI_CALL
tu_QueueSubmit(VkQueue _queue,
uint32_t submitCount,

src/freedreno/vulkan/tu_perfetto.cc

@ -0,0 +1,291 @@
/*
* Copyright © 2021 Google, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <perfetto.h>
#include "tu_perfetto.h"
#include "util/u_perfetto.h"
#include "util/hash_table.h"
#include "tu_tracepoints.h"
#include "tu_tracepoints_perfetto.h"
static uint32_t gpu_clock_id;
static uint64_t next_clock_sync_ns; /* cpu time of next clk sync */
/**
* The timestamp at the point where we first emitted the clock_sync.
* This will be a *later* timestamp than the first GPU traces (since
* we capture the first clock_sync from the CPU *after* the first GPU
* tracepoints happen). To avoid confusing perfetto we need to drop
* the GPU traces with timestamps before this.
*/
static uint64_t sync_gpu_ts;
struct TuRenderpassIncrementalState {
bool was_cleared = true;
};
struct TuRenderpassTraits : public perfetto::DefaultDataSourceTraits {
using IncrementalStateType = TuRenderpassIncrementalState;
};
class TuRenderpassDataSource : public perfetto::DataSource<TuRenderpassDataSource, TuRenderpassTraits> {
public:
void OnSetup(const SetupArgs &) override
{
// Use this callback to apply any custom configuration to your data source
// based on the TraceConfig in SetupArgs.
}
void OnStart(const StartArgs &) override
{
// This notification can be used to initialize the GPU driver, enable
// counters, etc. StartArgs will contain the DataSourceDescriptor,
// which can be extended.
u_trace_perfetto_start();
PERFETTO_LOG("Tracing started");
/* Note: clock_ids below 128 are reserved. For custom clock sources,
* using the hash of a namespaced string is the recommended approach.
* See: https://perfetto.dev/docs/concepts/clock-sync
*/
gpu_clock_id =
_mesa_hash_string("org.freedesktop.mesa.freedreno") | 0x80000000;
}
void OnStop(const StopArgs &) override
{
PERFETTO_LOG("Tracing stopped");
// Undo any initialization done in OnStart.
u_trace_perfetto_stop();
// TODO we should perhaps block until queued traces are flushed?
Trace([](TuRenderpassDataSource::TraceContext ctx) {
auto packet = ctx.NewTracePacket();
packet->Finalize();
ctx.Flush();
});
}
};
PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource);
PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(TuRenderpassDataSource);
static void
send_descriptors(TuRenderpassDataSource::TraceContext &ctx, uint64_t ts_ns)
{
PERFETTO_LOG("Sending renderstage descriptors");
auto packet = ctx.NewTracePacket();
packet->set_timestamp(0);
auto event = packet->set_gpu_render_stage_event();
event->set_gpu_id(0);
auto spec = event->set_specifications();
for (unsigned i = 0; i < ARRAY_SIZE(queues); i++) {
auto desc = spec->add_hw_queue();
desc->set_name(queues[i].name);
desc->set_description(queues[i].desc);
}
for (unsigned i = 0; i < ARRAY_SIZE(stages); i++) {
auto desc = spec->add_stage();
desc->set_name(stages[i].name);
if (stages[i].desc)
desc->set_description(stages[i].desc);
}
}
static void
stage_start(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage)
{
struct tu_perfetto_state *p = tu_device_get_perfetto_state(dev);
p->start_ts[stage] = ts_ns;
}
typedef void (*trace_payload_as_extra_func)(perfetto::protos::pbzero::GpuRenderStageEvent *, const void*);
static void
stage_end(struct tu_device *dev, uint64_t ts_ns, enum tu_stage_id stage,
uint32_t submission_id, const void* payload = nullptr,
trace_payload_as_extra_func payload_as_extra = nullptr)
{
struct tu_perfetto_state *p = tu_device_get_perfetto_state(dev);
/* If we haven't managed to calibrate the alignment between GPU and CPU
* timestamps yet, then skip this trace, otherwise perfetto won't know
* what to do with it.
*/
if (!sync_gpu_ts)
return;
TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
if (auto state = tctx.GetIncrementalState(); state->was_cleared) {
send_descriptors(tctx, p->start_ts[stage]);
state->was_cleared = false;
}
auto packet = tctx.NewTracePacket();
packet->set_timestamp(p->start_ts[stage]);
packet->set_timestamp_clock_id(gpu_clock_id);
auto event = packet->set_gpu_render_stage_event();
event->set_event_id(0); // ???
event->set_hw_queue_id(DEFAULT_HW_QUEUE_ID);
event->set_duration(ts_ns - p->start_ts[stage]);
event->set_stage_id(stage);
event->set_context((uintptr_t)dev);
event->set_submission_id(submission_id);
if (payload && payload_as_extra) {
payload_as_extra(event, payload);
}
});
}
#ifdef __cplusplus
extern "C" {
#endif
void
tu_perfetto_init(void)
{
util_perfetto_init();
perfetto::DataSourceDescriptor dsd;
dsd.set_name("gpu.renderstages.msm");
TuRenderpassDataSource::Register(dsd);
}
static void
sync_timestamp(struct tu_device *dev)
{
uint64_t cpu_ts = perfetto::base::GetBootTimeNs().count();
uint64_t gpu_ts = 0;
if (cpu_ts < next_clock_sync_ns)
return;
if (tu_device_get_timestamp(dev, &gpu_ts)) {
PERFETTO_ELOG("Could not sync CPU and GPU clocks");
return;
}
/* convert GPU ts into ns: */
gpu_ts = tu_device_ticks_to_ns(dev, gpu_ts);
TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
auto packet = tctx.NewTracePacket();
packet->set_timestamp(cpu_ts);
auto event = packet->set_clock_snapshot();
{
auto clock = event->add_clocks();
clock->set_clock_id(perfetto::protos::pbzero::BUILTIN_CLOCK_BOOTTIME);
clock->set_timestamp(cpu_ts);
}
{
auto clock = event->add_clocks();
clock->set_clock_id(gpu_clock_id);
clock->set_timestamp(gpu_ts);
}
sync_gpu_ts = gpu_ts;
next_clock_sync_ns = cpu_ts + 30000000;
});
}
static void
emit_submit_id(uint32_t submission_id)
{
TuRenderpassDataSource::Trace([=](TuRenderpassDataSource::TraceContext tctx) {
auto packet = tctx.NewTracePacket();
packet->set_timestamp(perfetto::base::GetBootTimeNs().count());
auto event = packet->set_vulkan_api_event();
auto submit = event->set_vk_queue_submit();
submit->set_submission_id(submission_id);
});
}
void
tu_perfetto_submit(struct tu_device *dev, uint32_t submission_id)
{
sync_timestamp(dev);
emit_submit_id(submission_id);
}
/*
* Trace callbacks, called from u_trace once the timestamps from GPU have been
* collected.
*/
#define CREATE_EVENT_CALLBACK(event_name, stage) \
void \
tu_start_##event_name(struct tu_device *dev, uint64_t ts_ns, \
const void *flush_data, \
const struct trace_start_##event_name *payload) \
{ \
stage_start(dev, ts_ns, stage); \
} \
\
void \
tu_end_##event_name(struct tu_device *dev, uint64_t ts_ns, \
const void *flush_data, \
const struct trace_end_##event_name *payload) \
{ \
auto trace_flush_data = (const struct tu_u_trace_flush_data *) flush_data; \
uint32_t submission_id = \
tu_u_trace_flush_data_get_submit_id(trace_flush_data); \
stage_end(dev, ts_ns, stage, submission_id, payload, \
(trace_payload_as_extra_func) &trace_payload_as_extra_end_##event_name);\
}
CREATE_EVENT_CALLBACK(render_pass, SURFACE_STAGE_ID)
CREATE_EVENT_CALLBACK(binning_ib, BINNING_STAGE_ID)
CREATE_EVENT_CALLBACK(draw_ib_gmem, GMEM_STAGE_ID)
CREATE_EVENT_CALLBACK(draw_ib_sysmem, BYPASS_STAGE_ID)
CREATE_EVENT_CALLBACK(blit, BLIT_STAGE_ID)
CREATE_EVENT_CALLBACK(compute, COMPUTE_STAGE_ID)
CREATE_EVENT_CALLBACK(resolve, RESOLVE_STAGE_ID)
#ifdef __cplusplus
}
#endif
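For illustration only, not part of this change: once the data source above is registered as "gpu.renderstages.msm", a minimal Perfetto trace config that enables it could look roughly like the following (buffer size and duration are arbitrary placeholder values):

buffers {
  size_kb: 8192
}
data_sources {
  config {
    name: "gpu.renderstages.msm"
  }
}
duration_ms: 5000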

src/freedreno/vulkan/tu_perfetto.h

@ -0,0 +1,110 @@
/*
* Copyright © 2021 Google, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef TU_PERFETTO_H_
#define TU_PERFETTO_H_
#ifdef __cplusplus
extern "C" {
#endif
#ifdef HAVE_PERFETTO
/**
* Render-stage id's
*/
enum tu_stage_id {
SURFACE_STAGE_ID, /* Surface is a sort of meta-stage for render-target info */
BINNING_STAGE_ID,
GMEM_STAGE_ID,
BYPASS_STAGE_ID,
BLIT_STAGE_ID,
COMPUTE_STAGE_ID,
CLEAR_RESTORE_STAGE_ID,
RESOLVE_STAGE_ID,
// TODO add the rest
NUM_STAGES
};
static const struct {
const char *name;
const char *desc;
} stages[] = {
[SURFACE_STAGE_ID] = {"Surface"},
[BINNING_STAGE_ID] = {"Binning", "Perform Visibility pass and determine target bins"},
[GMEM_STAGE_ID] = {"Render", "Rendering to GMEM"},
[BYPASS_STAGE_ID] = {"Render", "Rendering to system memory"},
[BLIT_STAGE_ID] = {"Blit", "Performing a Blit operation"},
[COMPUTE_STAGE_ID] = {"Compute", "Compute job"},
[CLEAR_RESTORE_STAGE_ID] = {"Clear/Restore", "Clear (sysmem) or per-tile clear or restore (GMEM)"},
[RESOLVE_STAGE_ID] = {"Resolve", "Per tile resolve (GMEM to system memory)"},
// TODO add the rest
};
/**
* Queue-id's
*/
enum {
DEFAULT_HW_QUEUE_ID,
};
static const struct {
const char *name;
const char *desc;
} queues[] = {
[DEFAULT_HW_QUEUE_ID] = {"GPU Queue 0", "Default Adreno Hardware Queue"},
};
struct tu_perfetto_state {
uint64_t start_ts[NUM_STAGES];
};
void tu_perfetto_init(void);
struct tu_device;
void tu_perfetto_submit(struct tu_device *dev, uint32_t submission_id);
/* Helpers */
struct tu_perfetto_state *
tu_device_get_perfetto_state(struct tu_device *dev);
int
tu_device_get_timestamp(struct tu_device *dev,
uint64_t *ts);
uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
struct tu_u_trace_flush_data;
uint32_t
tu_u_trace_flush_data_get_submit_id(const struct tu_u_trace_flush_data *data);
#endif
#ifdef __cplusplus
}
#endif
#endif /* TU_PERFETTO_H_ */

src/freedreno/vulkan/tu_perfetto_util.c

@ -0,0 +1,48 @@
/*
* Copyright © 2021 Igalia S.L.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "tu_private.h"
#include "tu_perfetto.h"
/* Including tu_private.h in tu_perfetto.cc doesn't work, so
* we need some helper methods to access tu_device.
*/
struct tu_perfetto_state *
tu_device_get_perfetto_state(struct tu_device *dev)
{
return &dev->perfetto;
}
int
tu_device_get_timestamp(struct tu_device *dev,
uint64_t *ts)
{
return tu_drm_get_timestamp(dev->physical_device, ts);
}
uint32_t
tu_u_trace_flush_data_get_submit_id(const struct tu_u_trace_flush_data *data)
{
return data->submission_id;
}

src/freedreno/vulkan/tu_private.h

@ -53,6 +53,7 @@
#include "util/macros.h"
#include "util/u_atomic.h"
#include "util/u_dynarray.h"
#include "util/perf/u_trace.h"
#include "vk_alloc.h"
#include "vk_debug_report.h"
#include "vk_device.h"
@ -75,6 +76,7 @@
#include "tu_descriptor_set.h"
#include "tu_util.h"
#include "tu_perfetto.h"
/* Pre-declarations needed for WSI entrypoints */
struct wl_surface;
@ -291,6 +293,7 @@ struct tu_pipeline_key
#define TU_MAX_QUEUE_FAMILIES 1
struct tu_syncobj;
struct tu_u_trace_syncobj;
struct tu_queue
{
@ -425,6 +428,14 @@ struct tu_device
TU_GRALLOC_OTHER,
} gralloc_type;
#endif
uint32_t submit_count;
struct u_trace_context trace_context;
#ifdef HAVE_PERFETTO
struct tu_perfetto_state perfetto;
#endif
};
void tu_init_clear_blit_shaders(struct tu_device *dev);
@ -445,6 +456,12 @@ tu_device_is_lost(struct tu_device *device)
VkResult
tu_device_submit_deferred_locked(struct tu_device *dev);
VkResult
tu_device_wait_u_trace(struct tu_device *dev, struct tu_u_trace_syncobj *syncobj);
uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);
enum tu_bo_alloc_flags
{
TU_BO_ALLOC_NO_FLAGS = 0,
@ -1042,6 +1059,8 @@ struct tu_cmd_buffer
struct tu_cmd_pool *pool;
struct list_head pool_link;
struct u_trace trace;
VkCommandBufferUsageFlags usage_flags;
VkCommandBufferLevel level;
enum tu_cmd_buffer_status status;
@ -1691,6 +1710,10 @@ tu_physical_device_init(struct tu_physical_device *device,
VkResult
tu_enumerate_devices(struct tu_instance *instance);
int
tu_drm_get_timestamp(struct tu_physical_device *device,
uint64_t *ts);
int
tu_drm_submitqueue_new(const struct tu_device *dev,
int priority,
@ -1705,6 +1728,12 @@ tu_signal_fences(struct tu_device *device, struct tu_syncobj *fence1, struct tu_
int
tu_syncobj_to_fd(struct tu_device *device, struct tu_syncobj *sync);
struct tu_u_trace_flush_data
{
uint32_t submission_id;
struct tu_u_trace_syncobj *syncobj;
};
#define TU_DEFINE_HANDLE_CASTS(__tu_type, __VkType) \
\
static inline struct __tu_type *__tu_type##_from_handle(__VkType _handle) \

src/freedreno/vulkan/tu_tracepoints.py

@ -0,0 +1,115 @@
#
# Copyright © 2021 Igalia S.L.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
import argparse
import sys
#
# TODO can we do this with less boilerplate?
#
parser = argparse.ArgumentParser()
parser.add_argument('-p', '--import-path', required=True)
parser.add_argument('--utrace-src', required=True)
parser.add_argument('--utrace-hdr', required=True)
parser.add_argument('--perfetto-hdr', required=True)
args = parser.parse_args()
sys.path.insert(0, args.import_path)
from u_trace import Header, HeaderScope
from u_trace import ForwardDecl
from u_trace import Tracepoint
from u_trace import TracepointArg as Arg
from u_trace import TracepointArgStruct as ArgStruct
from u_trace import utrace_generate
from u_trace import utrace_generate_perfetto_utils
#
# Tracepoint definitions:
#
Header('util/u_dump.h')
Header('vk_format.h')
Header('freedreno/vulkan/tu_private.h', scope=HeaderScope.SOURCE)
ForwardDecl('struct tu_device')
Tracepoint('start_render_pass',
tp_perfetto='tu_start_render_pass'
)
Tracepoint('end_render_pass',
args=[ArgStruct(type='const struct tu_framebuffer *', var='fb')],
tp_struct=[Arg(type='uint16_t', name='width', var='fb->width', c_format='%u'),
Arg(type='uint16_t', name='height', var='fb->height', c_format='%u'),
Arg(type='uint8_t', name='MRTs', var='fb->attachment_count', c_format='%u'),
# Arg(type='uint8_t', name='samples', var='fb->samples', c_format='%u'),
Arg(type='uint16_t', name='numberOfBins', var='fb->tile_count.width * fb->tile_count.height', c_format='%u'),
Arg(type='uint16_t', name='binWidth', var='fb->tile0.width', c_format='%u'),
Arg(type='uint16_t', name='binHeight', var='fb->tile0.height', c_format='%u')],
tp_perfetto='tu_end_render_pass')
Tracepoint('start_binning_ib',
tp_perfetto='tu_start_binning_ib')
Tracepoint('end_binning_ib',
tp_perfetto='tu_end_binning_ib')
Tracepoint('start_resolve',
tp_perfetto='tu_start_resolve')
Tracepoint('end_resolve',
tp_perfetto='tu_end_resolve')
Tracepoint('start_draw_ib_sysmem',
tp_perfetto='tu_start_draw_ib_sysmem')
Tracepoint('end_draw_ib_sysmem',
tp_perfetto='tu_end_draw_ib_sysmem')
Tracepoint('start_draw_ib_gmem',
tp_perfetto='tu_start_draw_ib_gmem')
Tracepoint('end_draw_ib_gmem',
tp_perfetto='tu_end_draw_ib_gmem')
Tracepoint('start_blit',
tp_perfetto='tu_start_blit',
)
Tracepoint('end_blit',
# TODO: add source megapixels count and target megapixels count arguments
args=[Arg(type='uint8_t', var='uses_3d_blit', c_format='%u'),
Arg(type='enum VkFormat', var='src_format', c_format='%s', to_prim_type='vk_format_description({})->short_name'),
Arg(type='enum VkFormat', var='dst_format', c_format='%s', to_prim_type='vk_format_description({})->short_name'),
Arg(type='uint8_t', var='layers', c_format='%u')],
tp_perfetto='tu_end_blit')
Tracepoint('start_compute',
tp_perfetto='tu_start_compute')
Tracepoint('end_compute',
args=[Arg(type='uint8_t', var='indirect', c_format='%u'),
Arg(type='uint16_t', var='local_size_x', c_format='%u'),
Arg(type='uint16_t', var='local_size_y', c_format='%u'),
Arg(type='uint16_t', var='local_size_z', c_format='%u'),
Arg(type='uint16_t', var='num_groups_x', c_format='%u'),
Arg(type='uint16_t', var='num_groups_y', c_format='%u'),
Arg(type='uint16_t', var='num_groups_z', c_format='%u')],
tp_perfetto='tu_end_compute')
utrace_generate(cpath=args.utrace_src, hpath=args.utrace_hdr, ctx_param='struct tu_device *dev')
utrace_generate_perfetto_utils(hpath=args.perfetto_hdr)
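A rough sketch of the interface the generator is expected to produce, not actual generated output: judging from the Tracepoint() definitions above and the call sites in tu_clear_blit.c (trace_start_blit / trace_end_blit), the generated tu_tracepoints.h should expose start/end pairs roughly like this (the exact signatures are defined by u_trace.py):

/* Hypothetical sketch only -- argument names and types are inferred from the
 * Tracepoint() definitions and call sites earlier in this commit. */
struct u_trace;

void trace_start_blit(struct u_trace *ut);
void trace_end_blit(struct u_trace *ut,
                    uint8_t uses_3d_blit,
                    enum VkFormat src_format,
                    enum VkFormat dst_format,
                    uint8_t layers);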

src/freedreno/vulkan/tu_wsi.c

@ -247,6 +247,9 @@ VKAPI_ATTR VkResult VKAPI_CALL
tu_QueuePresentKHR(VkQueue _queue, const VkPresentInfoKHR *pPresentInfo)
{
TU_FROM_HANDLE(tu_queue, queue, _queue);
u_trace_context_process(&queue->device->trace_context, true);
return wsi_common_queue_present(
&queue->device->physical_device->wsi_device,
tu_device_to_handle(queue->device), _queue, queue->queue_family_index,

src/freedreno/vulkan/vk_format.h

@ -28,6 +28,10 @@
#include <vulkan/vulkan_core.h>
#include "util/format/u_format.h"
#ifdef __cplusplus
extern "C" {
#endif
enum pipe_format
vk_format_to_pipe_format(enum VkFormat vkformat);
@ -84,4 +88,8 @@ vk_format_stencil_only(VkFormat format)
return VK_FORMAT_S8_UINT;
}
#ifdef __cplusplus
}
#endif
#endif