v3dv: Implement VK_KHR_performance_query

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14061>
Author: Ella Stanforth, 2021-11-23 22:29:48 +00:00 (committed by Marge Bot)
parent f2a24fd4a2
commit f392b6c1ad
6 changed files with 713 additions and 85 deletions

@@ -510,7 +510,7 @@ Khronos extensions that are not part of any Vulkan version:
   VK_KHR_get_display_properties2            DONE (anv, lvp, radv, tu, v3dv)
   VK_KHR_get_surface_capabilities2          DONE (anv, lvp, radv, tu, v3dv, vn)
   VK_KHR_incremental_present                DONE (anv, lvp, radv, tu, v3dv, vn)
-  VK_KHR_performance_query                  DONE (anv/gen8+, tu)
+  VK_KHR_performance_query                  DONE (anv/gen8+, tu, v3dv)
   VK_KHR_pipeline_executable_properties     DONE (anv, radv, tu, v3dv)
   VK_KHR_pipeline_library                   DONE (lvp, radv)
   VK_KHR_push_descriptor                    DONE (anv, lvp, radv, tu)
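
For context, this is roughly how an application consumes the extension enabled here. A minimal sketch, assuming valid pdev/dev handles and a graphics queue family; the helper name, the chosen counter indices, and the omission of error handling are all illustrative:

/* Enumerate the counters exposed for a queue family and create a
 * performance query pool that records the first three of them. */
#include <stdlib.h>
#include <vulkan/vulkan.h>

static VkQueryPool
create_perf_pool(VkPhysicalDevice pdev, VkDevice dev, uint32_t queue_family)
{
   uint32_t count = 0;
   vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
      pdev, queue_family, &count, NULL, NULL);

   VkPerformanceCounterKHR *counters = calloc(count, sizeof(*counters));
   VkPerformanceCounterDescriptionKHR *descs = calloc(count, sizeof(*descs));
   for (uint32_t i = 0; i < count; i++) {
      counters[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR;
      descs[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR;
   }
   vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
      pdev, queue_family, &count, counters, descs);

   const uint32_t indices[] = { 0, 1, 2 };
   VkQueryPoolPerformanceCreateInfoKHR perf_info = {
      .sType = VK_STRUCTURE_TYPE_QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR,
      .queueFamilyIndex = queue_family,
      .counterIndexCount = 3,
      .pCounterIndices = indices,
   };
   VkQueryPoolCreateInfo pool_info = {
      .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
      .pNext = &perf_info,
      .queryType = VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR,
      .queryCount = 1,
   };
   VkQueryPool pool;
   vkCreateQueryPool(dev, &pool_info, NULL, &pool);

   free(counters);
   free(descs);
   return pool;
}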

@@ -777,6 +777,8 @@ v3dv_job_init(struct v3dv_job *job,
       job->is_transfer = cmd_buffer->state.is_transfer;
 
       cmd_buffer_serialize_job_if_needed(cmd_buffer, job);
+
+      job->perf = cmd_buffer->state.query.active_query.perf;
    }
 }
@@ -3223,24 +3225,44 @@ v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,
                             uint32_t query,
                             VkQueryControlFlags flags)
 {
-   /* FIXME: we only support one active query for now */
-   assert(cmd_buffer->state.query.active_query.bo == NULL);
    assert(query < pool->query_count);
 
-   cmd_buffer->state.query.active_query.bo = pool->queries[query].bo;
-   cmd_buffer->state.query.active_query.offset = pool->queries[query].offset;
-   cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+   switch (pool->query_type) {
+   case VK_QUERY_TYPE_OCCLUSION:
+      /* FIXME: we only support one active occlusion query for now */
+      assert(cmd_buffer->state.query.active_query.bo == NULL);
+      cmd_buffer->state.query.active_query.bo = pool->queries[query].bo;
+      cmd_buffer->state.query.active_query.offset = pool->queries[query].offset;
+      cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+      break;
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+      assert(cmd_buffer->state.query.active_query.perf == NULL);
+
+      if (cmd_buffer->state.pass)
+         v3dv_cmd_buffer_subpass_finish(cmd_buffer);
+
+      cmd_buffer->state.query.active_query.perf =
+         &pool->queries[query].perf;
+
+      if (cmd_buffer->state.pass) {
+         v3dv_cmd_buffer_subpass_resume(cmd_buffer,
+                                        cmd_buffer->state.subpass_idx);
+      }
+      break;
+   }
+   default:
+      unreachable("Unsupported query type");
+   }
 }
-void
-v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
-                          struct v3dv_query_pool *pool,
-                          uint32_t query)
+static void
+v3dv_cmd_buffer_schedule_end_query(struct v3dv_cmd_buffer *cmd_buffer,
+                                   struct v3dv_query_pool *pool,
+                                   uint32_t query)
 {
    assert(query < pool->query_count);
-   assert(cmd_buffer->state.query.active_query.bo != NULL);
 
-   if (cmd_buffer->state.pass) {
+   if (cmd_buffer->state.pass &&
+       pool->query_type != VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
       /* Queue the EndQuery in the command buffer state, we will create a CPU
        * job to flag all of these queries as possibly available right after the
        * render pass job in which they have been recorded.
@@ -3295,11 +3317,57 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
       list_addtail(&job->list_link, &cmd_buffer->jobs);
    }
+}
+
+static void
+v3dv_cmd_buffer_end_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer,
+                                    struct v3dv_query_pool *pool,
+                                    uint32_t query)
+{
+   assert(query < pool->query_count);
+   assert(cmd_buffer->state.query.active_query.bo != NULL);
+
+   v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
 
    cmd_buffer->state.query.active_query.bo = NULL;
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
 }
+
+static void
+v3dv_cmd_buffer_end_performance_query(struct v3dv_cmd_buffer *cmd_buffer,
+                                      struct v3dv_query_pool *pool,
+                                      uint32_t query)
+{
+   assert(query < pool->query_count);
+   assert(cmd_buffer->state.query.active_query.perf != NULL);
+
+   if (cmd_buffer->state.pass)
+      v3dv_cmd_buffer_subpass_finish(cmd_buffer);
+
+   v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
+
+   cmd_buffer->state.query.active_query.perf = NULL;
+
+   if (cmd_buffer->state.pass)
+      v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
+}
+
+void
+v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
+                          struct v3dv_query_pool *pool,
+                          uint32_t query)
+{
+   switch (pool->query_type) {
+   case VK_QUERY_TYPE_OCCLUSION:
+      v3dv_cmd_buffer_end_occlusion_query(cmd_buffer, pool, query);
+      break;
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+      v3dv_cmd_buffer_end_performance_query(cmd_buffer, pool, query);
+      break;
+   default:
+      unreachable("Unsupported query type");
+   }
+}
 
 void
 v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
                                    struct v3dv_query_pool *pool,
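
The begin/end paths above finish and resume the current subpass around a performance query, because switching kernel perfmons requires splitting the GPU job. From the application's side, recording keeps the usual begin/end shape; a minimal sketch (cmd, rp_begin and pool are placeholder handles, and the pool must be reset before its first use):

/* Record a performance query around a draw. Only one performance query
 * may be active in a command buffer at a time in this implementation. */
vkCmdResetQueryPool(cmd, pool, 0, 1);
vkCmdBeginQuery(cmd, pool, 0, 0);
vkCmdBeginRenderPass(cmd, &rp_begin, VK_SUBPASS_CONTENTS_INLINE);
vkCmdDraw(cmd, 3, 1, 0, 0);
vkCmdEndRenderPass(cmd);
vkCmdEndQuery(cmd, pool, 0);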

@@ -133,6 +133,7 @@ get_device_extensions(const struct v3dv_physical_device *device,
       .KHR_get_memory_requirements2 = true,
       .KHR_image_format_list = true,
       .KHR_imageless_framebuffer = true,
+      .KHR_performance_query = device->caps.perfmon,
       .KHR_relaxed_block_layout = true,
       .KHR_maintenance1 = true,
       .KHR_maintenance2 = true,
@@ -816,6 +817,9 @@ physical_device_init(struct v3dv_physical_device *device,
    device->caps.multisync =
       v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_MULTISYNC_EXT);
 
+   device->caps.perfmon =
+      v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_PERFMON);
+
    result = init_uuids(device);
    if (result != VK_SUCCESS)
       goto fail;
@@ -1144,6 +1148,7 @@ VKAPI_ATTR void VKAPI_CALL
 v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
                                 VkPhysicalDeviceFeatures2 *pFeatures)
 {
+   V3DV_FROM_HANDLE(v3dv_physical_device, physical_device, physicalDevice);
    v3dv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
 
    VkPhysicalDeviceVulkan13Features vk13 = {
@@ -1289,6 +1294,16 @@ v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
          break;
       }
 
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: {
+         VkPhysicalDevicePerformanceQueryFeaturesKHR *features =
+            (void *) ext;
+         features->performanceCounterQueryPools =
+            physical_device->caps.perfmon;
+         features->performanceCounterMultipleQueryPools = false;
+         break;
+      }
+
       default:
          v3dv_debug_ignored_stype(ext->sType);
          break;
@@ -1637,6 +1652,13 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
          props->maxVertexAttribDivisor = 0xffff;
          break;
       }
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR: {
+         VkPhysicalDevicePerformanceQueryPropertiesKHR *props =
+            (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext;
+         props->allowCommandBufferQueryCopies = true;
+         break;
+      }
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: {
         VkPhysicalDeviceDrmPropertiesEXT *props =
            (VkPhysicalDeviceDrmPropertiesEXT *)ext;
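
A sketch of how an application would check the feature bit wired up above (physical_device is a placeholder handle):

VkPhysicalDevicePerformanceQueryFeaturesKHR perf_features = {
   .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR,
};
VkPhysicalDeviceFeatures2 features2 = {
   .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
   .pNext = &perf_features,
};
vkGetPhysicalDeviceFeatures2(physical_device, &features2);

/* On v3dv performanceCounterQueryPools follows the kernel's perfmon
 * support, and performanceCounterMultipleQueryPools is reported as false,
 * so only one performance query pool may be in use at a time. */
if (perf_features.performanceCounterQueryPools) {
   /* Safe to enable VK_KHR_performance_query at device creation. */
}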

@@ -188,6 +188,7 @@ struct v3dv_physical_device {
 
    struct {
       bool multisync;
+      bool perfmon;
    } caps;
 };
 
@@ -263,6 +264,11 @@ struct v3dv_queue {
    struct v3dv_last_job_sync last_job_syncs;
 
    struct v3dv_job *noop_job;
+
+   /* The last active perfmon ID to prevent mixing of counter results when a
+    * job is submitted with a different perfmon id.
+    */
+   uint32_t last_perfmon_id;
 };
 
 VkResult v3dv_queue_driver_submit(struct vk_queue *vk_queue,
@@ -1027,6 +1033,19 @@ struct v3dv_timestamp_query_cpu_job_info {
    uint32_t count;
 };
 
+/* Number of perfmons required to handle all supported performance counters */
+#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_PERFCNT_NUM, \
+                                       DRM_V3D_MAX_PERF_COUNTERS)
+
+struct v3dv_perf_query {
+   uint32_t kperfmon_ids[V3DV_MAX_PERFMONS];
+
+   /* A DRM syncobj to wait on the GPU jobs for which we are collecting
+    * performance data.
+    */
+   struct vk_sync *last_job_sync;
+};
+
 struct v3dv_job {
    struct list_head list_link;
 
@@ -1127,6 +1146,9 @@ struct v3dv_job {
       uint32_t wg_base[3];
       struct drm_v3d_submit_csd submit;
    } csd;
+
+   /* Perfmons with last job sync for CSD and CL jobs */
+   struct v3dv_perf_query *perf;
 };
 
 void v3dv_job_init(struct v3dv_job *job,
@@ -1328,12 +1350,15 @@ struct v3dv_cmd_buffer_state {
          struct v3dv_end_query_cpu_job_info *states;
       } end;
 
-      /* This BO is not NULL if we have an active query, that is, we have
-       * called vkCmdBeginQuery but not vkCmdEndQuery.
-       */
       struct {
+         /* This BO is not NULL if we have an active occlusion query, that
+          * is, we have called vkCmdBeginQuery but not vkCmdEndQuery.
+          */
         struct v3dv_bo *bo;
         uint32_t offset;
+
+         /* This pointer is not NULL if we have an active performance query */
+         struct v3dv_perf_query *perf;
      } active_query;
    } query;
 };
@@ -1375,6 +1400,9 @@ struct v3dv_query {
      };
      /* Used by CPU queries (timestamp) */
      uint64_t value;
+
+      /* Used by performance queries */
+      struct v3dv_perf_query perf;
   };
 };
 
@@ -1383,18 +1411,32 @@ struct v3dv_query_pool {
    struct v3dv_bo *bo; /* Only used with GPU queries (occlusion) */
 
+   /* Only used with performance queries */
+   struct {
+      uint32_t ncounters;
+      uint8_t counters[V3D_PERFCNT_NUM];
+
+      /* V3D has a limit on the number of counters we can track in a
+       * single performance monitor, so if too many counters are requested
+       * we need to create multiple monitors to record all of them. This
+       * field represents the number of monitors required for the number
+       * of counters requested.
+       */
+      uint8_t nperfmons;
+   } perfmon;
+
    VkQueryType query_type;
    uint32_t query_count;
    struct v3dv_query *queries;
 };
 
-VkResult v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
-                                         struct v3dv_query_pool *pool,
-                                         uint32_t first,
-                                         uint32_t count,
-                                         void *data,
-                                         VkDeviceSize stride,
-                                         VkQueryResultFlags flags);
+VkResult v3dv_get_query_pool_results(struct v3dv_device *device,
+                                     struct v3dv_query_pool *pool,
+                                     uint32_t first,
+                                     uint32_t count,
+                                     void *data,
+                                     VkDeviceSize stride,
+                                     VkQueryResultFlags flags);
 
 void v3dv_reset_query_pools(struct v3dv_device *device,
                             struct v3dv_query_pool *query_pool,
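
To make the V3DV_MAX_PERFMONS sizing concrete: the counter table added by this commit in v3dv_query.c has 87 entries (V3D_PERFCNT_NUM), and, assuming the kernel UAPI limit of 32 counters per performance monitor (DRM_V3D_MAX_PERF_COUNTERS at the time), a query that enables every counter needs three kernel perfmons:

/* Worked example of the macro above, under the assumptions just stated:
 * DIV_ROUND_UP(87, 32) == 3, i.e. the counters split into groups of
 * 32 + 32 + 23 across three perfmons. */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
_Static_assert(DIV_ROUND_UP(87, 32) == 3, "three perfmons cover 87 counters");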

@@ -25,6 +25,148 @@
 #include "util/timespec.h"
 
+static const char *v3dv_counters[][3] = {
+   {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
+   {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
+   {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"},
+   {"FEP", "FEP-valid-quads", "[FEP] Valid quads"},
+   {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"},
+   {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"},
+   {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"},
+   {"TLB", "TLB-quads-with-zero-coverage", "[TLB] Quads with all pixels having zero coverage"},
+   {"TLB", "TLB-quads-with-non-zero-coverage", "[TLB] Quads with any pixels having non-zero coverage"},
+   {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"},
+   {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"},
+   {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"},
+   {"PTB", "PTB-primitives-discared-reversed", "[PTB] Primitives that are discarded because they are reversed"},
+   {"QPU", "QPU-total-idle-clk-cycles", "[QPU] Total idle clock cycles for all QPUs"},
+   {"QPU", "QPU-total-active-clk-cycles-vertex-coord-shading", "[QPU] Total active clock cycles for all QPUs doing vertex/coordinate/user shading (counts only when QPU is not stalled)"},
+   {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"},
+   {"QPU", "QPU-total-clk-cycles-executing-valid-instr", "[QPU] Total clock cycles for all QPUs executing valid instructions"},
+   {"QPU", "QPU-total-clk-cycles-waiting-TMU", "[QPU] Total clock cycles for all QPUs stalled waiting for TMUs only (counter won't increment if QPU also stalling for another reason)"},
+   {"QPU", "QPU-total-clk-cycles-waiting-scoreboard", "[QPU] Total clock cycles for all QPUs stalled waiting for Scoreboard only (counter won't increment if QPU also stalling for another reason)"},
+   {"QPU", "QPU-total-clk-cycles-waiting-varyings", "[QPU] Total clock cycles for all QPUs stalled waiting for Varyings only (counter won't increment if QPU also stalling for another reason)"},
+   {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"},
+   {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"},
+   {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"},
+   {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"},
+   {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"},
+   {"TMU", "TMU-total-text-cache-miss", "[TMU] Total texture cache misses (number of fetches from memory/L2cache)"},
+   {"VPM", "VPM-total-clk-cycles-VDW-stalled", "[VPM] Total clock cycles VDW is stalled waiting for VPM access"},
+   {"VPM", "VPM-total-clk-cycles-VCD-stalled", "[VPM] Total clock cycles VCD is stalled waiting for VPM access"},
+   {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"},
+   {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"},
+   {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"},
+   {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"},
+   {"CORE", "cycle-count", "[CORE] Cycle counter"},
+   {"QPU", "QPU-total-clk-cycles-waiting-vertex-coord-shading", "[QPU] Total stalled clock cycles for all QPUs doing vertex/coordinate/user shading"},
+   {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"},
+   {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"},
+   {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"},
+   {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"},
+   {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"},
+   {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"},
+   {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"},
+   {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"},
+   {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"},
+   {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"},
+   {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"},
+   {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"},
+   {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"},
+   {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"},
+   {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"},
+   {"TMU", "TMU-total-config-access", "[TMU] Total config accesses"},
+   {"L2T", "L2T-no-id-stalled", "[L2T] No ID stall"},
+   {"L2T", "L2T-command-queue-stalled", "[L2T] Command queue full stall"},
+   {"L2T", "L2T-TMU-writes", "[L2T] TMU write accesses"},
+   {"TMU", "TMU-active-cycles", "[TMU] Active cycles"},
+   {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"},
+   {"CLE", "CLE-thread-active-cycles", "[CLE] Bin or render thread active cycles"},
+   {"L2T", "L2T-TMU-reads", "[L2T] TMU read accesses"},
+   {"L2T", "L2T-CLE-reads", "[L2T] CLE read accesses"},
+   {"L2T", "L2T-VCD-reads", "[L2T] VCD read accesses"},
+   {"L2T", "L2T-TMU-config-reads", "[L2T] TMU CFG read accesses"},
+   {"L2T", "L2T-SLC0-reads", "[L2T] SLC0 read accesses"},
+   {"L2T", "L2T-SLC1-reads", "[L2T] SLC1 read accesses"},
+   {"L2T", "L2T-SLC2-reads", "[L2T] SLC2 read accesses"},
+   {"L2T", "L2T-TMU-write-miss", "[L2T] TMU write misses"},
+   {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"},
+   {"L2T", "L2T-CLE-read-miss", "[L2T] CLE read misses"},
+   {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"},
+   {"L2T", "L2T-TMU-config-read-miss", "[L2T] TMU CFG read misses"},
+   {"L2T", "L2T-SLC0-read-miss", "[L2T] SLC0 read misses"},
+   {"L2T", "L2T-SLC1-read-miss", "[L2T] SLC1 read misses"},
+   {"L2T", "L2T-SLC2-read-miss", "[L2T] SLC2 read misses"},
+   {"CORE", "core-memory-writes", "[CORE] Total memory writes"},
+   {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"},
+   {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"},
+   {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"},
+   {"CORE", "core-memory-reads", "[CORE] Total memory reads"},
+   {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"},
+   {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"},
+   {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"},
+   {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"},
+   {"GMP", "GMP-memory-reads", "[GMP] Total memory reads"},
+   {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"},
+   {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"},
+   {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"},
+   {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"},
+   {"TMU", "TMU-MRU-hits", "[TMU] Total MRU hits"},
+   {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
+};
+
+static void
+kperfmon_create(struct v3dv_device *device,
+                struct v3dv_query_pool *pool,
+                uint32_t query)
+{
+   for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+      assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters);
+
+      struct drm_v3d_perfmon_create req = {
+         .ncounters = MIN2(pool->perfmon.ncounters -
+                           i * DRM_V3D_MAX_PERF_COUNTERS,
+                           DRM_V3D_MAX_PERF_COUNTERS),
+      };
+      memcpy(req.counters,
+             &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS],
+             req.ncounters);
+
+      int ret = v3dv_ioctl(device->pdevice->render_fd,
+                           DRM_IOCTL_V3D_PERFMON_CREATE,
+                           &req);
+      if (ret)
+         fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret));
+
+      pool->queries[query].perf.kperfmon_ids[i] = req.id;
+   }
+}
+
+static void
+kperfmon_destroy(struct v3dv_device *device,
+                 struct v3dv_query_pool *pool,
+                 uint32_t query)
+{
+   /* Skip destroying if never created */
+   if (!pool->queries[query].perf.kperfmon_ids[0])
+      return;
+
+   for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+      struct drm_v3d_perfmon_destroy req = {
+         .id = pool->queries[query].perf.kperfmon_ids[i]
+      };
+
+      int ret = v3dv_ioctl(device->pdevice->render_fd,
+                           DRM_IOCTL_V3D_PERFMON_DESTROY,
+                           &req);
+      if (ret) {
+         fprintf(stderr, "Failed to destroy perfmon %u: %s\n",
+                 req.id, strerror(ret));
+      }
+   }
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 v3dv_CreateQueryPool(VkDevice _device,
                      const VkQueryPoolCreateInfo *pCreateInfo,
@@ -34,7 +176,8 @@ v3dv_CreateQueryPool(VkDevice _device,
    V3DV_FROM_HANDLE(v3dv_device, device, _device);
 
    assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION ||
-          pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP);
+          pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP ||
+          pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
    assert(pCreateInfo->queryCount > 0);
 
    struct v3dv_query_pool *pool =
@@ -46,6 +189,7 @@ v3dv_CreateQueryPool(VkDevice _device,
    pool->query_type = pCreateInfo->queryType;
    pool->query_count = pCreateInfo->queryCount;
 
+   uint32_t query_idx = 0;
    VkResult result;
 
    const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count;
@@ -56,7 +200,8 @@ v3dv_CreateQueryPool(VkDevice _device,
       goto fail;
    }
 
-   if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+   switch (pool->query_type) {
+   case VK_QUERY_TYPE_OCCLUSION: {
       /* The hardware allows us to setup groups of 16 queries in consecutive
        * 4-byte addresses, requiring only that each group of 16 queries is
        * aligned to a 1024 byte boundary.
@@ -72,22 +217,56 @@ v3dv_CreateQueryPool(VkDevice _device,
         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         goto fail;
      }
+      break;
+   }
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+      const VkQueryPoolPerformanceCreateInfoKHR *pq_info =
+         vk_find_struct_const(pCreateInfo->pNext,
+                              QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
+
+      assert(pq_info);
+      assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM);
+
+      pool->perfmon.ncounters = pq_info->counterIndexCount;
+      for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
+         pool->perfmon.counters[i] = pq_info->pCounterIndices[i];
+
+      pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters,
+                                             DRM_V3D_MAX_PERF_COUNTERS);
+
+      assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS);
+      break;
+   }
+   case VK_QUERY_TYPE_TIMESTAMP:
+      break;
+   default:
+      unreachable("Unsupported query type");
    }
-   uint32_t i;
-   for (i = 0; i < pool->query_count; i++) {
-      pool->queries[i].maybe_available = false;
+   for (; query_idx < pool->query_count; query_idx++) {
+      pool->queries[query_idx].maybe_available = false;
      switch (pool->query_type) {
      case VK_QUERY_TYPE_OCCLUSION: {
-         const uint32_t query_group = i / 16;
-         const uint32_t query_offset = query_group * 1024 + (i % 16) * 4;
-         pool->queries[i].bo = pool->bo;
-         pool->queries[i].offset = query_offset;
+         const uint32_t query_group = query_idx / 16;
+         const uint32_t query_offset = query_group * 1024 + (query_idx % 16) * 4;
+         pool->queries[query_idx].bo = pool->bo;
+         pool->queries[query_idx].offset = query_offset;
         break;
      }
      case VK_QUERY_TYPE_TIMESTAMP:
-         pool->queries[i].value = 0;
+         pool->queries[query_idx].value = 0;
         break;
+      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+         result = vk_sync_create(&device->vk,
+                                 &device->pdevice->drm_syncobj_type, 0, 0,
+                                 &pool->queries[query_idx].perf.last_job_sync);
+         if (result != VK_SUCCESS)
+            goto fail;
+
+         for (uint32_t j = 0; j < pool->perfmon.nperfmons; j++)
+            pool->queries[query_idx].perf.kperfmon_ids[j] = 0;
+         break;
+      }
      default:
         unreachable("Unsupported query type");
      }
@@ -98,6 +277,11 @@ v3dv_CreateQueryPool(VkDevice _device,
    return VK_SUCCESS;
 
 fail:
+   if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+      for (uint32_t j = 0; j < query_idx; j++)
+         vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync);
+   }
+
    if (pool->bo)
       v3dv_bo_free(device, pool->bo);
    if (pool->queries)
@@ -121,6 +305,13 @@ v3dv_DestroyQueryPool(VkDevice _device,
    if (pool->bo)
       v3dv_bo_free(device, pool->bo);
 
+   if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+      for (uint32_t i = 0; i < pool->query_count; i++) {
+         kperfmon_destroy(device, pool, i);
+         vk_sync_destroy(&device->vk, pool->queries[i].perf.last_job_sync);
+      }
+   }
+
    if (pool->queries)
       vk_free2(&device->vk.alloc, pAllocator, pool->queries);
@@ -128,7 +319,7 @@ v3dv_DestroyQueryPool(VkDevice _device,
 }
 
 static void
-write_query_result(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
+write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
 {
    if (do_64bit) {
       uint64_t *dst64 = (uint64_t *) dst;
@@ -177,13 +368,91 @@ query_wait_available(struct v3dv_device *device,
        !v3dv_bo_wait(device, q->bo, 0xffffffffffffffffull))
       return vk_device_set_lost(&device->vk, "Query BO wait failed: %m");
 
+   if (query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
+       vk_sync_wait(&device->vk, q->perf.last_job_sync,
+                    0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS)
+      return vk_device_set_lost(&device->vk, "Query job wait failed");
+
    return VK_SUCCESS;
 }
 
 static VkResult
-query_is_available(struct v3dv_device *device,
-                   struct v3dv_query *q,
-                   VkQueryType query_type)
+write_occlusion_query_result(struct v3dv_device *device,
+                             struct v3dv_query_pool *pool,
+                             uint32_t query,
+                             bool do_64bit,
+                             void *data,
+                             uint32_t slot)
+{
+   assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION);
+
+   if (vk_device_is_lost(&device->vk))
+      return VK_ERROR_DEVICE_LOST;
+
+   struct v3dv_query *q = &pool->queries[query];
+   assert(q->bo && q->bo->map);
+
+   const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset;
+   write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr));
+   return VK_SUCCESS;
+}
+
+static VkResult
+write_timestamp_query_result(struct v3dv_device *device,
+                             struct v3dv_query_pool *pool,
+                             uint32_t query,
+                             bool do_64bit,
+                             void *data,
+                             uint32_t slot)
+{
+   assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
+
+   struct v3dv_query *q = &pool->queries[query];
+
+   write_to_buffer(data, slot, do_64bit, q->value);
+   return VK_SUCCESS;
+}
+
+static VkResult
+write_performance_query_result(struct v3dv_device *device,
+                               struct v3dv_query_pool *pool,
+                               uint32_t query,
+                               bool do_64bit,
+                               void *data,
+                               uint32_t slot)
+{
+   assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+
+   struct v3dv_query *q = &pool->queries[query];
+   uint64_t counter_values[V3D_PERFCNT_NUM];
+
+   for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+      struct drm_v3d_perfmon_get_values req = {
+         .id = q->perf.kperfmon_ids[i],
+         .values_ptr = (uintptr_t)(&counter_values[i *
+                                   DRM_V3D_MAX_PERF_COUNTERS])
+      };
+
+      int ret = v3dv_ioctl(device->pdevice->render_fd,
+                           DRM_IOCTL_V3D_PERFMON_GET_VALUES,
+                           &req);
+      if (ret) {
+         fprintf(stderr, "failed to get perfmon values: %s\n", strerror(ret));
+         return vk_error(device, VK_ERROR_DEVICE_LOST);
+      }
+   }
+
+   for (uint32_t i = 0; i < pool->perfmon.ncounters; i++)
+      write_to_buffer(data, slot + i, do_64bit, counter_values[i]);
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+query_check_available(struct v3dv_device *device,
+                      struct v3dv_query *q,
+                      VkQueryType query_type)
 {
    if (!q->maybe_available)
       return VK_NOT_READY;
@@ -192,70 +461,105 @@ query_is_available(struct v3dv_device *device,
        !v3dv_bo_wait(device, q->bo, 0))
       return VK_NOT_READY;
 
+   if (query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
+       vk_sync_wait(&device->vk, q->perf.last_job_sync,
+                    0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS)
+      return VK_NOT_READY;
+
    return VK_SUCCESS;
 }
 
 static VkResult
-get_query_result(struct v3dv_device *device,
-                 struct v3dv_query_pool *pool,
-                 uint32_t query,
-                 bool do_wait,
-                 bool *available,
-                 uint64_t *value)
+write_query_result(struct v3dv_device *device,
+                   struct v3dv_query_pool *pool,
+                   uint32_t query,
+                   bool do_64bit,
+                   void *data,
+                   uint32_t slot)
+{
+   switch (pool->query_type) {
+   case VK_QUERY_TYPE_OCCLUSION:
+      return write_occlusion_query_result(device, pool, query, do_64bit,
+                                          data, slot);
+   case VK_QUERY_TYPE_TIMESTAMP:
+      return write_timestamp_query_result(device, pool, query, do_64bit,
+                                          data, slot);
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+      return write_performance_query_result(device, pool, query, do_64bit,
+                                            data, slot);
+   default:
+      unreachable("Unsupported query type");
+   }
+}
+
+static VkResult
+query_is_available(struct v3dv_device *device,
+                   struct v3dv_query_pool *pool,
+                   uint32_t query,
+                   bool do_wait,
+                   bool *available)
 {
    struct v3dv_query *q = &pool->queries[query];
 
+   assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION ||
+          (q->bo && q->bo->map));
+
    if (do_wait) {
       VkResult result = query_wait_available(device, q, pool->query_type);
-      if (result != VK_SUCCESS)
+      if (result != VK_SUCCESS) {
+         *available = false;
          return result;
+      }
 
       *available = true;
    } else {
-      VkResult result = query_is_available(device, q, pool->query_type);
+      VkResult result = query_check_available(device, q, pool->query_type);
       assert(result == VK_SUCCESS || result == VK_NOT_READY);
      *available = (result == VK_SUCCESS);
   }
 
+   return VK_SUCCESS;
+}
+
+static uint32_t
+get_query_result_count(struct v3dv_query_pool *pool)
+{
   switch (pool->query_type) {
-   case VK_QUERY_TYPE_OCCLUSION: {
-      const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset;
-      *value = (uint64_t) *((uint32_t *)query_addr);
-      return VK_SUCCESS;
-   }
-
+   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
-      *value = q->value;
-      return VK_SUCCESS;
-
+      return 1;
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+      return pool->perfmon.ncounters;
   default:
      unreachable("Unsupported query type");
   }
 }
 
 VkResult
-v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
-                                struct v3dv_query_pool *pool,
-                                uint32_t first,
-                                uint32_t count,
-                                void *data,
-                                VkDeviceSize stride,
-                                VkQueryResultFlags flags)
+v3dv_get_query_pool_results(struct v3dv_device *device,
+                            struct v3dv_query_pool *pool,
+                            uint32_t first,
+                            uint32_t count,
+                            void *data,
+                            VkDeviceSize stride,
+                            VkQueryResultFlags flags)
 {
    assert(first < pool->query_count);
    assert(first + count <= pool->query_count);
    assert(data);
 
-   const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT;
+   const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT ||
+      pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR;
    const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT;
    const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;
 
+   uint32_t result_count = get_query_result_count(pool);
+
    VkResult result = VK_SUCCESS;
    for (uint32_t i = first; i < first + count; i++) {
       bool available = false;
-      uint64_t value = 0;
       VkResult query_result =
-         get_query_result(device, pool, i, do_wait, &available, &value);
+         query_is_available(device, pool, i, do_wait, &available);
       if (query_result == VK_ERROR_DEVICE_LOST)
          result = VK_ERROR_DEVICE_LOST;
@@ -273,11 +577,11 @@ v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
       const bool write_result = available || do_partial;
       if (write_result)
-         write_query_result(data, slot, do_64bit, value);
-      slot++;
+         write_query_result(device, pool, i, do_64bit, data, slot);
+      slot += result_count;
 
       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
-         write_query_result(data, slot++, do_64bit, available ? 1u : 0u);
+         write_to_buffer(data, slot++, do_64bit, available ? 1u : 0u);
 
       if (!write_result && result != VK_ERROR_DEVICE_LOST)
          result = VK_NOT_READY;
@@ -301,8 +605,8 @@ v3dv_GetQueryPoolResults(VkDevice _device,
    V3DV_FROM_HANDLE(v3dv_device, device, _device);
    V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
 
-   return v3dv_get_query_pool_results_cpu(device, pool, firstQuery, queryCount,
-                                          pData, stride, flags);
+   return v3dv_get_query_pool_results(device, pool, firstQuery, queryCount,
+                                      pData, stride, flags);
 }
 
 VKAPI_ATTR void VKAPI_CALL
@@ -381,6 +685,12 @@ v3dv_reset_query_pools(struct v3dv_device *device,
       case VK_QUERY_TYPE_TIMESTAMP:
          q->value = 0;
          break;
+      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+         kperfmon_destroy(device, pool, i);
+         kperfmon_create(device, pool, i);
+         if (vk_sync_reset(&device->vk, q->perf.last_job_sync) != VK_SUCCESS)
+            fprintf(stderr, "Failed to reset sync");
+         break;
       default:
          unreachable("Unsupported query type");
       }
@@ -400,3 +710,69 @@ v3dv_ResetQueryPool(VkDevice _device,
    v3dv_reset_query_pools(device, pool, firstQuery, queryCount);
 }
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
+   VkPhysicalDevice physicalDevice,
+   uint32_t queueFamilyIndex,
+   uint32_t *pCounterCount,
+   VkPerformanceCounterKHR *pCounters,
+   VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
+{
+   uint32_t desc_count = *pCounterCount;
+
+   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
+                          out, pCounters, pCounterCount);
+   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
+                          out_desc, pCounterDescriptions, &desc_count);
+
+   for (int i = 0; i < ARRAY_SIZE(v3dv_counters); i++) {
+      vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
+         counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
+         counter->scope = VK_QUERY_SCOPE_COMMAND_KHR;
+         counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
+
+         unsigned char sha1_result[20];
+         _mesa_sha1_compute(v3dv_counters[i][1], strlen(v3dv_counters[i][1]),
+                            sha1_result);
+         memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
+      }
+
+      vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
+                               &out_desc, desc) {
+         desc->flags = 0;
+         snprintf(desc->name, sizeof(desc->name), "%s",
+                  v3dv_counters[i][1]);
+         snprintf(desc->category, sizeof(desc->category), "%s",
+                  v3dv_counters[i][0]);
+         snprintf(desc->description, sizeof(desc->description), "%s",
+                  v3dv_counters[i][2]);
+      }
+   }
+
+   return vk_outarray_status(&out);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
+   VkPhysicalDevice physicalDevice,
+   const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
+   uint32_t *pNumPasses)
+{
+   *pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount,
+                              DRM_V3D_MAX_PERF_COUNTERS);
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_AcquireProfilingLockKHR(
+   VkDevice _device,
+   const VkAcquireProfilingLockInfoKHR *pInfo)
+{
+   return VK_SUCCESS;
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_ReleaseProfilingLockKHR(VkDevice device)
+{
+}
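
A sketch of the query-setup calls these entrypoints serve. On v3dv the profiling lock is a no-op (see v3dv_AcquireProfilingLockKHR above), but portable code must still take it before recording; physical_device, device and perf_info are placeholders:

uint32_t num_passes = 0;
vkGetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
   physical_device, &perf_info, &num_passes);

VkAcquireProfilingLockInfoKHR lock_info = {
   .sType = VK_STRUCTURE_TYPE_ACQUIRE_PROFILING_LOCK_INFO_KHR,
   .timeout = UINT64_MAX,
};
vkAcquireProfilingLockKHR(device, &lock_info);
/* ... record and submit num_passes times, then release the lock ... */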

@@ -137,27 +137,129 @@ handle_reset_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job,
    if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION)
       v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE);
 
+   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+      struct vk_sync_wait waits[info->count];
+      unsigned wait_count = 0;
+
+      for (int i = 0; i < info->count; i++) {
+         struct v3dv_query *query = &info->pool->queries[i];
+         /* Only wait for a query if we've used it, otherwise we would be
+          * waiting forever for its fence to become signaled.
+          */
+         if (query->maybe_available) {
+            waits[wait_count] = (struct vk_sync_wait){
+               .sync = info->pool->queries[i].perf.last_job_sync
+            };
+            wait_count++;
+         }
+      }
+
+      VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
+                                          VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
+
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
    v3dv_reset_query_pools(job->device, info->pool, info->first, info->count);
 
    return VK_SUCCESS;
 }
 static VkResult
-handle_end_query_cpu_job(struct v3dv_job *job)
+export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job,
+                             int *fd)
 {
+   int err;
+   if (job->device->pdevice->caps.multisync) {
+      static const enum v3dv_queue_type queues_to_sync[] = {
+         V3DV_QUEUE_CL,
+         V3DV_QUEUE_CSD,
+      };
+
+      for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
+         enum v3dv_queue_type queue_type = queues_to_sync[i];
+         int tmp_fd = -1;
+
+         err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
+                                        queue->last_job_syncs.syncs[queue_type],
+                                        &tmp_fd);
+
+         if (err) {
+            close(*fd);
+            return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+                             "sync file export failed: %m");
+         }
+
+         err = sync_accumulate("v3dv", fd, tmp_fd);
+
+         if (err) {
+            close(tmp_fd);
+            close(*fd);
+            return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+                             "failed to accumulate sync files: %m");
+         }
+      }
+   } else {
+      err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
+                                     queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
+                                     fd);
+      if (err) {
+         return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+                          "sync file export failed: %m");
+      }
+   }
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
+{
+   VkResult result = VK_SUCCESS;
+
    mtx_lock(&job->device->query_mutex);
 
    struct v3dv_end_query_cpu_job_info *info = &job->cpu.query_end;
+   struct v3dv_queue *queue = &job->device->queue;
+
+   int err = 0;
+   int fd = -1;
+
+   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+      result = export_perfmon_last_job_sync(queue, job, &fd);
+
+      if (result != VK_SUCCESS)
+         goto fail;
+
+      assert(fd >= 0);
+   }
+
    for (uint32_t i = 0; i < info->count; i++) {
       assert(info->query + i < info->pool->query_count);
       struct v3dv_query *query = &info->pool->queries[info->query + i];
+
+      if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+         uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
+         err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd,
+                                        syncobj, fd);
+
+         if (err) {
+            result = vk_errorf(queue, VK_ERROR_UNKNOWN,
+                               "sync file import failed: %m");
+            goto fail;
+         }
+      }
+
       query->maybe_available = true;
    }
 
+fail:
+   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
+      close(fd);
+
    cnd_broadcast(&job->device->query_ended);
    mtx_unlock(&job->device->query_mutex);
 
-   return VK_SUCCESS;
+   return result;
 }
 
 static VkResult
@@ -176,13 +278,13 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job)
    uint8_t *offset = ((uint8_t *) bo->map) +
       info->offset + info->dst->mem_offset;
 
-   v3dv_get_query_pool_results_cpu(job->device,
-                                   info->pool,
-                                   info->first,
-                                   info->count,
-                                   offset,
-                                   info->stride,
-                                   info->flags);
+   v3dv_get_query_pool_results(job->device,
+                               info->pool,
+                               info->first,
+                               info->count,
+                               offset,
+                               info->stride,
+                               info->flags);
 
    return VK_SUCCESS;
 }
@@ -635,6 +737,7 @@ fail:
 static VkResult
 handle_cl_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
+              uint32_t counter_pass_idx,
               struct v3dv_submit_sync_info *sync_info,
               bool signal_syncs)
 {
@@ -678,9 +781,15 @@ handle_cl_job(struct v3dv_queue *queue,
    assert(bo_idx == submit.bo_handle_count);
    submit.bo_handles = (uintptr_t)(void *)bo_handles;
 
+   submit.perfmon_id = job->perf ?
+      job->perf->kperfmon_ids[counter_pass_idx] : 0;
+   const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id;
+   queue->last_perfmon_id = submit.perfmon_id;
+
    /* We need a binning sync if we are waiting on a semaphore with a wait stage
     * that involves the geometry pipeline, or if the job comes after a pipeline
-    * barrier that involves geometry stages (needs_bcl_sync).
+    * barrier that involves geometry stages (needs_bcl_sync), or if
+    * performance queries are in use.
     *
     * We need a render sync if the job doesn't need a binning sync but has
     * still been flagged for serialization. It should be noted that RCL jobs
@@ -705,6 +814,7 @@ handle_cl_job(struct v3dv_queue *queue,
                          VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
                          VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT);
    }
+   needs_bcl_sync |= needs_perf_sync;
 
    bool needs_rcl_sync = job->serialize && !needs_bcl_sync;
 
    /* Replace single semaphore settings whenever our kernel-driver supports
@@ -795,6 +905,7 @@ handle_tfu_job(struct v3dv_queue *queue,
 static VkResult
 handle_csd_job(struct v3dv_queue *queue,
                struct v3dv_job *job,
+               uint32_t counter_pass_idx,
                struct v3dv_submit_sync_info *sync_info,
                bool signal_syncs)
 {
@@ -835,6 +946,9 @@ handle_csd_job(struct v3dv_queue *queue,
       submit->in_sync = needs_sync ? last_job_sync : 0;
       submit->out_sync = last_job_sync;
    }
+   submit->perfmon_id = job->perf ?
+      job->perf->kperfmon_ids[counter_pass_idx] : 0;
+   queue->last_perfmon_id = submit->perfmon_id;
 
    int ret = v3dv_ioctl(device->pdevice->render_fd,
                         DRM_IOCTL_V3D_SUBMIT_CSD, submit);
@@ -858,20 +972,21 @@ handle_csd_job(struct v3dv_queue *queue,
 static VkResult
 queue_handle_job(struct v3dv_queue *queue,
                  struct v3dv_job *job,
+                 uint32_t counter_pass_idx,
                  struct v3dv_submit_sync_info *sync_info,
                  bool signal_syncs)
 {
    switch (job->type) {
    case V3DV_JOB_TYPE_GPU_CL:
-      return handle_cl_job(queue, job, sync_info, signal_syncs);
+      return handle_cl_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
    case V3DV_JOB_TYPE_GPU_TFU:
       return handle_tfu_job(queue, job, sync_info, signal_syncs);
    case V3DV_JOB_TYPE_GPU_CSD:
-      return handle_csd_job(queue, job, sync_info, signal_syncs);
+      return handle_csd_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
    case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
       return handle_reset_query_cpu_job(queue, job, sync_info);
    case V3DV_JOB_TYPE_CPU_END_QUERY:
-      return handle_end_query_cpu_job(job);
+      return handle_end_query_cpu_job(job, counter_pass_idx);
    case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
       return handle_copy_query_results_cpu_job(job);
    case V3DV_JOB_TYPE_CPU_SET_EVENT:
@@ -913,6 +1028,7 @@ queue_create_noop_job(struct v3dv_queue *queue)
 static VkResult
 queue_submit_noop_job(struct v3dv_queue *queue,
+                      uint32_t counter_pass_idx,
                       struct v3dv_submit_sync_info *sync_info,
                       bool signal_syncs)
 {
@@ -923,7 +1039,8 @@ queue_submit_noop_job(struct v3dv_queue *queue,
    }
 
    assert(queue->noop_job);
-   return queue_handle_job(queue, queue->noop_job, sync_info, signal_syncs);
+   return queue_handle_job(queue, queue->noop_job, counter_pass_idx,
+                           sync_info, signal_syncs);
 }
 VkResult
@@ -953,7 +1070,8 @@ v3dv_queue_driver_submit(struct vk_queue *vk_queue,
       list_for_each_entry_safe(struct v3dv_job, job,
                                &cmd_buffer->jobs, list_link) {
-         result = queue_handle_job(queue, job, &sync_info, false);
+         result = queue_handle_job(queue, job, submit->perf_pass_index,
+                                   &sync_info, false);
          if (result != VK_SUCCESS)
            return result;
       }
@@ -964,7 +1082,8 @@ v3dv_queue_driver_submit(struct vk_queue *vk_queue,
        * barrier state to limit the queues we serialize against.
        */
       if (cmd_buffer->state.barrier.dst_mask) {
-         result = queue_submit_noop_job(queue, &sync_info, false);
+         result = queue_submit_noop_job(queue, submit->perf_pass_index,
+                                        &sync_info, false);
          if (result != VK_SUCCESS)
            return result;
       }
@@ -976,7 +1095,8 @@ v3dv_queue_driver_submit(struct vk_queue *vk_queue,
        * requirements.
        */
       if (submit->signal_count > 0) {
-         result = queue_submit_noop_job(queue, &sync_info, true);
+         result = queue_submit_noop_job(queue, submit->perf_pass_index,
+                                        &sync_info, true);
          if (result != VK_SUCCESS)
            return result;
       }
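
The perf_pass_index plumbed through the submit path above comes from VkPerformanceQuerySubmitInfoKHR; when a counter set needs more than one pass, the application resubmits the same work once per pass. A minimal sketch (queue, cmd, device, pool and num_passes are placeholders, and the result count assumes the three-counter pool from the earlier sketch):

for (uint32_t pass = 0; pass < num_passes; pass++) {
   VkPerformanceQuerySubmitInfoKHR perf_submit = {
      .sType = VK_STRUCTURE_TYPE_PERFORMANCE_QUERY_SUBMIT_INFO_KHR,
      .counterPassIndex = pass,
   };
   VkSubmitInfo submit = {
      .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
      .pNext = &perf_submit,
      .commandBufferCount = 1,
      .pCommandBuffers = &cmd,
   };
   vkQueueSubmit(queue, 1, &submit, VK_NULL_HANDLE);
   vkQueueWaitIdle(queue);
}

/* One 64-bit VkPerformanceCounterResultKHR per enabled counter. */
VkPerformanceCounterResultKHR results[3];
vkGetQueryPoolResults(device, pool, 0, 1, sizeof(results), results,
                      sizeof(results), VK_QUERY_RESULT_WAIT_BIT);
vkReleaseProfilingLockKHR(device);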