diff --git a/docs/features.txt b/docs/features.txt
index 047f6ee4d86..cd25d33f3ee 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -510,7 +510,7 @@ Khronos extensions that are not part of any Vulkan version:
   VK_KHR_get_display_properties2                        DONE (anv, lvp, radv, tu, v3dv)
   VK_KHR_get_surface_capabilities2                      DONE (anv, lvp, radv, tu, v3dv, vn)
   VK_KHR_incremental_present                            DONE (anv, lvp, radv, tu, v3dv, vn)
-  VK_KHR_performance_query                              DONE (anv/gen8+, tu)
+  VK_KHR_performance_query                              DONE (anv/gen8+, tu, v3dv)
   VK_KHR_pipeline_executable_properties                 DONE (anv, radv, tu, v3dv)
   VK_KHR_pipeline_library                               DONE (lvp, radv)
   VK_KHR_push_descriptor                                DONE (anv, lvp, radv, tu)
diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index 8fd5758ff29..f4e6a9956c7 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -777,6 +777,8 @@ v3dv_job_init(struct v3dv_job *job,
       job->is_transfer = cmd_buffer->state.is_transfer;
 
       cmd_buffer_serialize_job_if_needed(cmd_buffer, job);
+
+      job->perf = cmd_buffer->state.query.active_query.perf;
    }
 }
 
@@ -3223,24 +3225,44 @@ v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,
                             uint32_t query,
                             VkQueryControlFlags flags)
 {
-   /* FIXME: we only support one active query for now */
-   assert(cmd_buffer->state.query.active_query.bo == NULL);
    assert(query < pool->query_count);
 
+   switch (pool->query_type) {
+   case VK_QUERY_TYPE_OCCLUSION:
+      /* FIXME: we only support one active occlusion query for now */
+      assert(cmd_buffer->state.query.active_query.bo == NULL);
 
-   cmd_buffer->state.query.active_query.bo = pool->queries[query].bo;
-   cmd_buffer->state.query.active_query.offset = pool->queries[query].offset;
-   cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+      cmd_buffer->state.query.active_query.bo = pool->queries[query].bo;
+      cmd_buffer->state.query.active_query.offset = pool->queries[query].offset;
+      cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
+      break;
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+      assert(cmd_buffer->state.query.active_query.perf == NULL);
+      if (cmd_buffer->state.pass)
+         v3dv_cmd_buffer_subpass_finish(cmd_buffer);
+
+      cmd_buffer->state.query.active_query.perf =
+         &pool->queries[query].perf;
+
+      if (cmd_buffer->state.pass) {
+         v3dv_cmd_buffer_subpass_resume(cmd_buffer,
+                                        cmd_buffer->state.subpass_idx);
+      }
+      break;
+   }
+   default:
+      unreachable("Unsupported query type");
+   }
 }
 
-void
-v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
-                          struct v3dv_query_pool *pool,
-                          uint32_t query)
+static void
+v3dv_cmd_buffer_schedule_end_query(struct v3dv_cmd_buffer *cmd_buffer,
+                                   struct v3dv_query_pool *pool,
+                                   uint32_t query)
 {
    assert(query < pool->query_count);
-   assert(cmd_buffer->state.query.active_query.bo != NULL);
 
-   if (cmd_buffer->state.pass) {
+   if (cmd_buffer->state.pass &&
+       pool->query_type != VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
       /* Queue the EndQuery in the command buffer state, we will create a CPU
        * job to flag all of these queries as possibly available right after the
        * render pass job in which they have been recorded.
@@ -3295,11 +3317,57 @@ v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
       list_addtail(&job->list_link, &cmd_buffer->jobs);
    }
+}
+
+static void
+v3dv_cmd_buffer_end_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer,
+                                    struct v3dv_query_pool *pool,
+                                    uint32_t query)
+{
+   assert(query < pool->query_count);
+   assert(cmd_buffer->state.query.active_query.bo != NULL);
+
+   v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
 
    cmd_buffer->state.query.active_query.bo = NULL;
    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
 }
 
+static void
+v3dv_cmd_buffer_end_performance_query(struct v3dv_cmd_buffer *cmd_buffer,
+                                      struct v3dv_query_pool *pool,
+                                      uint32_t query)
+{
+   assert(query < pool->query_count);
+   assert(cmd_buffer->state.query.active_query.perf != NULL);
+
+   if (cmd_buffer->state.pass)
+      v3dv_cmd_buffer_subpass_finish(cmd_buffer);
+
+   v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
+
+   cmd_buffer->state.query.active_query.perf = NULL;
+
+   if (cmd_buffer->state.pass)
+      v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
+}
+
+void v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
+                               struct v3dv_query_pool *pool,
+                               uint32_t query)
+{
+   switch (pool->query_type) {
+   case VK_QUERY_TYPE_OCCLUSION:
+      v3dv_cmd_buffer_end_occlusion_query(cmd_buffer, pool, query);
+      break;
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+      v3dv_cmd_buffer_end_performance_query(cmd_buffer, pool, query);
+      break;
+   default:
+      unreachable("Unsupported query type");
+   }
+}
+
 void
 v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
                                    struct v3dv_query_pool *pool,
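For reference, the application-visible sequence these hooks service looks roughly as follows. This is an illustrative sketch, not part of the patch; cmd, pool and rp_begin are assumed handles, and pool is assumed to have been created with VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR (see the v3dv_CreateQueryPool changes further below):

    /* Sketch: cmd, pool and rp_begin are assumed to be valid handles. */
    vkCmdResetQueryPool(cmd, pool, 0, 1);      /* outside the render pass */
    vkCmdBeginRenderPass(cmd, &rp_begin, VK_SUBPASS_CONTENTS_INLINE);
    /* v3dv_cmd_buffer_begin_query() finishes and resumes the current subpass
     * so that the counted draws land in jobs that carry job->perf. */
    vkCmdBeginQuery(cmd, pool, 0, 0);
    vkCmdDraw(cmd, 3, 1, 0, 0);
    vkCmdEndQuery(cmd, pool, 0);   /* -> v3dv_cmd_buffer_end_performance_query() */
    vkCmdEndRenderPass(cmd);

Ending a performance query splits the subpass again, which is why the end path goes through subpass_finish()/subpass_resume() rather than the deferred CPU job used for occlusion queries.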
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index 21ffdbbc07b..6102b0b42cf 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -133,6 +133,7 @@ get_device_extensions(const struct v3dv_physical_device *device,
       .KHR_get_memory_requirements2 = true,
       .KHR_image_format_list = true,
       .KHR_imageless_framebuffer = true,
+      .KHR_performance_query = device->caps.perfmon,
       .KHR_relaxed_block_layout = true,
       .KHR_maintenance1 = true,
       .KHR_maintenance2 = true,
@@ -816,6 +817,9 @@ physical_device_init(struct v3dv_physical_device *device,
    device->caps.multisync =
       v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_MULTISYNC_EXT);
 
+   device->caps.perfmon =
+      v3d_has_feature(device, DRM_V3D_PARAM_SUPPORTS_PERFMON);
+
    result = init_uuids(device);
    if (result != VK_SUCCESS)
       goto fail;
@@ -1144,6 +1148,7 @@ VKAPI_ATTR void VKAPI_CALL
 v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
                                 VkPhysicalDeviceFeatures2 *pFeatures)
 {
+   V3DV_FROM_HANDLE(v3dv_physical_device, physical_device, physicalDevice);
    v3dv_GetPhysicalDeviceFeatures(physicalDevice, &pFeatures->features);
 
    VkPhysicalDeviceVulkan13Features vk13 = {
@@ -1289,6 +1294,16 @@ v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice,
          break;
      }
 
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: {
+         VkPhysicalDevicePerformanceQueryFeaturesKHR *features =
+            (void *) ext;
+
+         features->performanceCounterQueryPools =
+            physical_device->caps.perfmon;
+         features->performanceCounterMultipleQueryPools = false;
+         break;
+      }
+
      default:
         v3dv_debug_ignored_stype(ext->sType);
         break;
@@ -1637,6 +1652,13 @@ v3dv_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
          props->maxVertexAttribDivisor = 0xffff;
          break;
       }
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR: {
+         VkPhysicalDevicePerformanceQueryPropertiesKHR *props =
+            (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext;
+
+         props->allowCommandBufferQueryCopies = true;
+         break;
+      }
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRM_PROPERTIES_EXT: {
          VkPhysicalDeviceDrmPropertiesEXT *props =
            (VkPhysicalDeviceDrmPropertiesEXT *)ext;
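How a client discovers the new capability; a minimal sketch assuming a valid VkPhysicalDevice handle named phys_dev:

    VkPhysicalDevicePerformanceQueryFeaturesKHR perf_features = {
       .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR,
    };
    VkPhysicalDeviceFeatures2 features2 = {
       .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
       .pNext = &perf_features,
    };
    vkGetPhysicalDeviceFeatures2(phys_dev, &features2);
    /* On v3dv this is true only when the kernel reports
     * DRM_V3D_PARAM_SUPPORTS_PERFMON; using several performance query pools
     * in one command buffer stays unsupported, per
     * performanceCounterMultipleQueryPools = false above. */
    bool have_perf = perf_features.performanceCounterQueryPools;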
diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
index 6c1399b04d7..cfd32ec7ad6 100644
--- a/src/broadcom/vulkan/v3dv_private.h
+++ b/src/broadcom/vulkan/v3dv_private.h
@@ -188,6 +188,7 @@ struct v3dv_physical_device {
 
    struct {
       bool multisync;
+      bool perfmon;
    } caps;
 };
 
@@ -263,6 +264,11 @@ struct v3dv_queue {
    struct v3dv_last_job_sync last_job_syncs;
 
    struct v3dv_job *noop_job;
+
+   /* The last active perfmon ID to prevent mixing of counter results when a
+    * job is submitted with a different perfmon id.
+    */
+   uint32_t last_perfmon_id;
 };
 
 VkResult v3dv_queue_driver_submit(struct vk_queue *vk_queue,
@@ -1027,6 +1033,19 @@ struct v3dv_timestamp_query_cpu_job_info {
    uint32_t count;
 };
 
+/* Number of perfmons required to handle all supported performance counters */
+#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_PERFCNT_NUM, \
+                                       DRM_V3D_MAX_PERF_COUNTERS)
+
+struct v3dv_perf_query {
+   uint32_t kperfmon_ids[V3DV_MAX_PERFMONS];
+
+   /* A DRM syncobj to wait on the GPU jobs for which we are collecting
+    * performance data.
+    */
+   struct vk_sync *last_job_sync;
+};
+
 struct v3dv_job {
    struct list_head list_link;
 
@@ -1127,6 +1146,9 @@ struct v3dv_job {
       uint32_t wg_base[3];
       struct drm_v3d_submit_csd submit;
    } csd;
+
+   /* Perfmons with last job sync for CSD and CL jobs */
+   struct v3dv_perf_query *perf;
 };
 
 void v3dv_job_init(struct v3dv_job *job,
@@ -1328,12 +1350,15 @@ struct v3dv_cmd_buffer_state {
             struct v3dv_end_query_cpu_job_info *states;
          } end;
 
-         /* This BO is not NULL if we have an active query, that is, we have
-          * called vkCmdBeginQuery but not vkCmdEndQuery.
-          */
          struct {
+            /* This BO is not NULL if we have an active occlusion query, that is,
+             * we have called vkCmdBeginQuery but not vkCmdEndQuery.
+             */
            struct v3dv_bo *bo;
            uint32_t offset;
+
+            /* This pointer is not NULL if we have an active performance query */
+            struct v3dv_perf_query *perf;
         } active_query;
      } query;
 };
@@ -1375,6 +1400,9 @@ struct v3dv_query {
      };
      /* Used by CPU queries (timestamp) */
      uint64_t value;
+
+      /* Used by performance queries */
+      struct v3dv_perf_query perf;
   };
 };
 
@@ -1383,18 +1411,32 @@ struct v3dv_query_pool {
 
    struct v3dv_bo *bo; /* Only used with GPU queries (occlusion) */
 
+   /* Only used with performance queries */
+   struct {
+      uint32_t ncounters;
+      uint8_t counters[V3D_PERFCNT_NUM];
+
+      /* V3D has a limit on the number of counters we can track in a
+       * single performance monitor, so if too many counters are requested
+       * we need to create multiple monitors to record all of them. This
+       * field represents the number of monitors required for the number
+       * of counters requested.
+       */
+      uint8_t nperfmons;
+   } perfmon;
+
    VkQueryType query_type;
    uint32_t query_count;
    struct v3dv_query *queries;
 };
 
-VkResult v3dv_get_query_pool_results_cpu(struct v3dv_device *device,
-                                         struct v3dv_query_pool *pool,
-                                         uint32_t first,
-                                         uint32_t count,
-                                         void *data,
-                                         VkDeviceSize stride,
-                                         VkQueryResultFlags flags);
+VkResult v3dv_get_query_pool_results(struct v3dv_device *device,
+                                     struct v3dv_query_pool *pool,
+                                     uint32_t first,
+                                     uint32_t count,
+                                     void *data,
+                                     VkDeviceSize stride,
+                                     VkQueryResultFlags flags);
 
 void v3dv_reset_query_pools(struct v3dv_device *device,
                             struct v3dv_query_pool *query_pool,
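A worked instance of the V3DV_MAX_PERFMONS macro above, grounded in the counter table added to v3dv_query.c below: that table lists 87 counters (V3D_PERFCNT_NUM), and the kernel's per-perfmon limit DRM_V3D_MAX_PERF_COUNTERS is 32 in the v3d UAPI, so

    V3DV_MAX_PERFMONS = DIV_ROUND_UP(87, 32) = 3

i.e. a query that enables every supported counter needs three kernel perfmons, and kperfmon_ids[] holds one DRM perfmon handle per required monitor.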
diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c
index 60edfc52442..4e188fd5512 100644
--- a/src/broadcom/vulkan/v3dv_query.c
+++ b/src/broadcom/vulkan/v3dv_query.c
@@ -25,6 +25,148 @@
 
 #include "util/timespec.h"
 
+static const char *v3dv_counters[][3] = {
+   {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
+   {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
+   {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"},
+   {"FEP", "FEP-valid-quads", "[FEP] Valid quads"},
+   {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"},
+   {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"},
+   {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"},
+   {"TLB", "TLB-quads-with-zero-coverage", "[TLB] Quads with all pixels having zero coverage"},
+   {"TLB", "TLB-quads-with-non-zero-coverage", "[TLB] Quads with any pixels having non-zero coverage"},
+   {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"},
+   {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"},
+   {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"},
+   {"PTB", "PTB-primitives-discared-reversed", "[PTB] Primitives that are discarded because they are reversed"},
+   {"QPU", "QPU-total-idle-clk-cycles", "[QPU] Total idle clock cycles for all QPUs"},
+   {"QPU", "QPU-total-active-clk-cycles-vertex-coord-shading", "[QPU] Total active clock cycles for all QPUs doing vertex/coordinate/user shading (counts only when QPU is not stalled)"},
+   {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"},
+   {"QPU", "QPU-total-clk-cycles-executing-valid-instr", "[QPU] Total clock cycles for all QPUs executing valid instructions"},
+   {"QPU", "QPU-total-clk-cycles-waiting-TMU", "[QPU] Total clock cycles for all QPUs stalled waiting for TMUs only (counter won't increment if QPU also stalling for another reason)"},
+   {"QPU", "QPU-total-clk-cycles-waiting-scoreboard", "[QPU] Total clock cycles for all QPUs stalled waiting for Scoreboard only (counter won't increment if QPU also stalling for another reason)"},
+   {"QPU", "QPU-total-clk-cycles-waiting-varyings", "[QPU] Total clock cycles for all QPUs stalled waiting for Varyings only (counter won't increment if QPU also stalling for another reason)"},
+   {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"},
+   {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"},
+   {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"},
+   {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"},
+   {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"},
+   {"TMU", "TMU-total-text-cache-miss", "[TMU] Total texture cache misses (number of fetches from memory/L2cache)"},
+   {"VPM", "VPM-total-clk-cycles-VDW-stalled", "[VPM] Total clock cycles VDW is stalled waiting for VPM access"},
+   {"VPM", "VPM-total-clk-cycles-VCD-stalled", "[VPM] Total clock cycles VCD is stalled waiting for VPM access"},
+   {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"},
+   {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"},
+   {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"},
+   {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"},
+   {"CORE", "cycle-count", "[CORE] Cycle counter"},
+   {"QPU", "QPU-total-clk-cycles-waiting-vertex-coord-shading", "[QPU] Total stalled clock cycles for all QPUs doing vertex/coordinate/user shading"},
+   {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"},
+   {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"},
+   {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"},
+   {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"},
+   {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"},
+   {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"},
+   {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"},
+   {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"},
+   {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"},
+   {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"},
+   {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"},
+   {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"},
+   {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"},
+   {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"},
+   {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"},
+   {"TMU", "TMU-total-config-access", "[TMU] Total config accesses"},
+   {"L2T", "L2T-no-id-stalled", "[L2T] No ID stall"},
+   {"L2T", "L2T-command-queue-stalled", "[L2T] Command queue full stall"},
+   {"L2T", "L2T-TMU-writes", "[L2T] TMU write accesses"},
+   {"TMU", "TMU-active-cycles", "[TMU] Active cycles"},
+   {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"},
+   {"CLE", "CLE-thread-active-cycles", "[CLE] Bin or render thread active cycles"},
+   {"L2T", "L2T-TMU-reads", "[L2T] TMU read accesses"},
+   {"L2T", "L2T-CLE-reads", "[L2T] CLE read accesses"},
+   {"L2T", "L2T-VCD-reads", "[L2T] VCD read accesses"},
+   {"L2T", "L2T-TMU-config-reads", "[L2T] TMU CFG read accesses"},
+   {"L2T", "L2T-SLC0-reads", "[L2T] SLC0 read accesses"},
+   {"L2T", "L2T-SLC1-reads", "[L2T] SLC1 read accesses"},
+   {"L2T", "L2T-SLC2-reads", "[L2T] SLC2 read accesses"},
+   {"L2T", "L2T-TMU-write-miss", "[L2T] TMU write misses"},
+   {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"},
+   {"L2T", "L2T-CLE-read-miss", "[L2T] CLE read misses"},
+   {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"},
+   {"L2T", "L2T-TMU-config-read-miss", "[L2T] TMU CFG read misses"},
+   {"L2T", "L2T-SLC0-read-miss", "[L2T] SLC0 read misses"},
+   {"L2T", "L2T-SLC1-read-miss", "[L2T] SLC1 read misses"},
+   {"L2T", "L2T-SLC2-read-miss", "[L2T] SLC2 read misses"},
+   {"CORE", "core-memory-writes", "[CORE] Total memory writes"},
+   {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"},
+   {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"},
+   {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"},
+   {"CORE", "core-memory-reads", "[CORE] Total memory reads"},
+   {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"},
+   {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"},
+   {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"},
+   {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"},
+   {"GMP", "GMP-memory-reads", "[GMP] Total memory reads"},
+   {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"},
+   {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"},
+   {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"},
+   {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"},
+   {"TMU", "TMU-MRU-hits", "[TMU] Total MRU hits"},
+   {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
+};
+
+static void
+kperfmon_create(struct v3dv_device *device,
+                struct v3dv_query_pool *pool,
+                uint32_t query)
+{
+   for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+      assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters);
+
+      struct drm_v3d_perfmon_create req = {
+         .ncounters = MIN2(pool->perfmon.ncounters -
+                           i * DRM_V3D_MAX_PERF_COUNTERS,
+                           DRM_V3D_MAX_PERF_COUNTERS),
+      };
+      memcpy(req.counters,
+             &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS],
+             req.ncounters);
+
+      int ret = v3dv_ioctl(device->pdevice->render_fd,
+                           DRM_IOCTL_V3D_PERFMON_CREATE,
+                           &req);
+      if (ret)
+         fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret));
+
+      pool->queries[query].perf.kperfmon_ids[i] = req.id;
+   }
+}
+
+static void
+kperfmon_destroy(struct v3dv_device *device,
+                 struct v3dv_query_pool *pool,
+                 uint32_t query)
+{
+   /* Skip destroying if never created */
+   if (!pool->queries[query].perf.kperfmon_ids[0])
+      return;
+
+   for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+      struct drm_v3d_perfmon_destroy req = {
+         .id = pool->queries[query].perf.kperfmon_ids[i]
+      };
+
+      int ret = v3dv_ioctl(device->pdevice->render_fd,
+                           DRM_IOCTL_V3D_PERFMON_DESTROY,
+                           &req);
+
+      if (ret) {
+         fprintf(stderr, "Failed to destroy perfmon %u: %s\n",
+                 req.id, strerror(ret));
+      }
+   }
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 v3dv_CreateQueryPool(VkDevice _device,
                      const VkQueryPoolCreateInfo *pCreateInfo,
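For readers unfamiliar with the kernel side, the perfmon lifecycle used by kperfmon_create()/kperfmon_destroy() reduces to two ioctls. A self-contained sketch, assuming render_fd is an open v3d render node (the helper names are hypothetical); the structs and ioctl numbers are the kernel's v3d_drm.h UAPI:

    #include <stdint.h>
    #include <string.h>
    #include <xf86drm.h>
    #include <drm/v3d_drm.h>

    /* Create one perfmon tracking up to DRM_V3D_MAX_PERF_COUNTERS counters.
     * Returns the kernel handle, or 0 on failure; the driver above relies on
     * 0 never being a valid id (see the "never created" check). */
    static uint32_t perfmon_create_one(int render_fd, const uint8_t *counters,
                                       uint32_t ncounters)
    {
       struct drm_v3d_perfmon_create req = {
          .ncounters = ncounters,
       };
       memcpy(req.counters, counters, ncounters);
       if (drmIoctl(render_fd, DRM_IOCTL_V3D_PERFMON_CREATE, &req) != 0)
          return 0;
       return req.id;  /* written back by the kernel on success */
    }

    static void perfmon_destroy_one(int render_fd, uint32_t id)
    {
       struct drm_v3d_perfmon_destroy req = { .id = id };
       drmIoctl(render_fd, DRM_IOCTL_V3D_PERFMON_DESTROY, &req);
    }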
@@ -34,7 +176,8 @@ v3dv_CreateQueryPool(VkDevice _device,
    V3DV_FROM_HANDLE(v3dv_device, device, _device);
 
    assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION ||
-          pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP);
+          pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP ||
+          pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
    assert(pCreateInfo->queryCount > 0);
 
    struct v3dv_query_pool *pool =
@@ -46,6 +189,7 @@ v3dv_CreateQueryPool(VkDevice _device,
    pool->query_type = pCreateInfo->queryType;
    pool->query_count = pCreateInfo->queryCount;
 
+   uint32_t query_idx = 0;
    VkResult result;
 
    const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count;
@@ -56,7 +200,8 @@ v3dv_CreateQueryPool(VkDevice _device,
       goto fail;
    }
 
-   if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
+   switch (pool->query_type) {
+   case VK_QUERY_TYPE_OCCLUSION: {
       /* The hardware allows us to setup groups of 16 queries in consecutive
        * 4-byte addresses, requiring only that each group of 16 queries is
        * aligned to a 1024 byte boundary.
@@ -72,22 +217,56 @@ v3dv_CreateQueryPool(VkDevice _device,
          result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
          goto fail;
      }
+      break;
+   }
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+      const VkQueryPoolPerformanceCreateInfoKHR *pq_info =
+         vk_find_struct_const(pCreateInfo->pNext,
+                              QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
+
+      assert(pq_info);
+      assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM);
+
+      pool->perfmon.ncounters = pq_info->counterIndexCount;
+      for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
+         pool->perfmon.counters[i] = pq_info->pCounterIndices[i];
+
+      pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters,
+                                             DRM_V3D_MAX_PERF_COUNTERS);
+
+      assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS);
+      break;
+   }
+   case VK_QUERY_TYPE_TIMESTAMP:
+      break;
+   default:
+      unreachable("Unsupported query type");
    }
 
-   uint32_t i;
-   for (i = 0; i < pool->query_count; i++) {
-      pool->queries[i].maybe_available = false;
+   for (; query_idx < pool->query_count; query_idx++) {
+      pool->queries[query_idx].maybe_available = false;
      switch (pool->query_type) {
      case VK_QUERY_TYPE_OCCLUSION: {
-         const uint32_t query_group = i / 16;
-         const uint32_t query_offset = query_group * 1024 + (i % 16) * 4;
-         pool->queries[i].bo = pool->bo;
-         pool->queries[i].offset = query_offset;
+         const uint32_t query_group = query_idx / 16;
+         const uint32_t query_offset = query_group * 1024 + (query_idx % 16) * 4;
+         pool->queries[query_idx].bo = pool->bo;
+         pool->queries[query_idx].offset = query_offset;
         break;
      }
      case VK_QUERY_TYPE_TIMESTAMP:
-         pool->queries[i].value = 0;
+         pool->queries[query_idx].value = 0;
         break;
+      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+         result = vk_sync_create(&device->vk,
+                                 &device->pdevice->drm_syncobj_type, 0, 0,
+                                 &pool->queries[query_idx].perf.last_job_sync);
+         if (result != VK_SUCCESS)
+            goto fail;
+
+         for (uint32_t j = 0; j < pool->perfmon.nperfmons; j++)
+            pool->queries[query_idx].perf.kperfmon_ids[j] = 0;
+         break;
+      }
      default:
         unreachable("Unsupported query type");
      }
@@ -98,6 +277,11 @@ v3dv_CreateQueryPool(VkDevice _device,
    return VK_SUCCESS;
 
 fail:
+   if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+      for (uint32_t j = 0; j < query_idx; j++)
+         vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync);
+   }
+
    if (pool->bo)
       v3dv_bo_free(device, pool->bo);
    if (pool->queries)
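Creating one of these pools from the API side; a sketch assuming device and queue_family_idx are valid, with a hypothetical selection of counter indices obtained from vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR (implemented at the end of this file):

    uint32_t counter_indices[] = { 0, 1, 32, 33 }; /* hypothetical selection */
    VkQueryPoolPerformanceCreateInfoKHR perf_info = {
       .sType = VK_STRUCTURE_TYPE_QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR,
       .queueFamilyIndex = queue_family_idx,
       .counterIndexCount = 4,
       .pCounterIndices = counter_indices,
    };
    VkQueryPoolCreateInfo pool_info = {
       .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
       .pNext = &perf_info,
       .queryType = VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR,
       .queryCount = 1,
    };
    VkQueryPool pool;
    vkCreateQueryPool(device, &pool_info, NULL, &pool);

With four counters this pool needs a single kernel perfmon per query; only above 32 counters would nperfmons exceed one.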
misses"}, + {"L2T", "L2T-SLC0-read-miss", "[L2T] SLC0 read misses"}, + {"L2T", "L2T-SLC1-read-miss", "[L2T] SLC1 read misses"}, + {"L2T", "L2T-SLC2-read-miss", "[L2T] SLC2 read misses"}, + {"CORE", "core-memory-writes", "[CORE] Total memory writes"}, + {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"}, + {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"}, + {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"}, + {"CORE", "core-memory-reads", "[CORE] Total memory reads"}, + {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"}, + {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"}, + {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"}, + {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"}, + {"GMP", "GMP-memory-reads", "[GMP] Total memory reads"}, + {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"}, + {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"}, + {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"}, + {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"}, + {"TMU", "TMU-MRU-hits", "[TMU] Total MRU hits"}, + {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"}, +}; + +static void +kperfmon_create(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query) +{ + for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { + assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters); + + struct drm_v3d_perfmon_create req = { + .ncounters = MIN2(pool->perfmon.ncounters - + i * DRM_V3D_MAX_PERF_COUNTERS, + DRM_V3D_MAX_PERF_COUNTERS), + }; + memcpy(req.counters, + &pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS], + req.ncounters); + + int ret = v3dv_ioctl(device->pdevice->render_fd, + DRM_IOCTL_V3D_PERFMON_CREATE, + &req); + if (ret) + fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret)); + + pool->queries[query].perf.kperfmon_ids[i] = req.id; + } +} + +static void +kperfmon_destroy(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query) +{ + /* Skip destroying if never created */ + if (!pool->queries[query].perf.kperfmon_ids[0]) + return; + + for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { + struct drm_v3d_perfmon_destroy req = { + .id = pool->queries[query].perf.kperfmon_ids[i] + }; + + int ret = v3dv_ioctl(device->pdevice->render_fd, + DRM_IOCTL_V3D_PERFMON_DESTROY, + &req); + + if (ret) { + fprintf(stderr, "Failed to destroy perfmon %u: %s\n", + req.id, strerror(ret)); + } + } +} + VKAPI_ATTR VkResult VKAPI_CALL v3dv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo, @@ -34,7 +176,8 @@ v3dv_CreateQueryPool(VkDevice _device, V3DV_FROM_HANDLE(v3dv_device, device, _device); assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION || - pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP); + pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP || + pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); assert(pCreateInfo->queryCount > 0); struct v3dv_query_pool *pool = @@ -46,6 +189,7 @@ v3dv_CreateQueryPool(VkDevice _device, pool->query_type = pCreateInfo->queryType; pool->query_count = pCreateInfo->queryCount; + uint32_t query_idx = 0; VkResult result; const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count; @@ -56,7 +200,8 @@ v3dv_CreateQueryPool(VkDevice _device, goto fail; } - if (pool->query_type == VK_QUERY_TYPE_OCCLUSION) { + switch (pool->query_type) { + case VK_QUERY_TYPE_OCCLUSION: { /* The hardware allows us to setup groups of 16 queries in consecutive 
@@ -177,13 +368,91 @@ query_wait_available(struct v3dv_device *device,
        !v3dv_bo_wait(device, q->bo, 0xffffffffffffffffull))
       return vk_device_set_lost(&device->vk, "Query BO wait failed: %m");
 
+   if (query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
+       vk_sync_wait(&device->vk, q->perf.last_job_sync,
+                    0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS)
+      return vk_device_set_lost(&device->vk,
+                                "Query job wait failed");
+
    return VK_SUCCESS;
 }
 
 static VkResult
-query_is_available(struct v3dv_device *device,
-                   struct v3dv_query *q,
-                   VkQueryType query_type)
+write_occlusion_query_result(struct v3dv_device *device,
+                             struct v3dv_query_pool *pool,
+                             uint32_t query,
+                             bool do_64bit,
+                             void *data,
+                             uint32_t slot)
+{
+   assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION);
+
+   if (vk_device_is_lost(&device->vk))
+      return VK_ERROR_DEVICE_LOST;
+
+   struct v3dv_query *q = &pool->queries[query];
+   assert(q->bo && q->bo->map);
+
+   const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset;
+   write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr));
+   return VK_SUCCESS;
+}
+
+static VkResult
+write_timestamp_query_result(struct v3dv_device *device,
+                             struct v3dv_query_pool *pool,
+                             uint32_t query,
+                             bool do_64bit,
+                             void *data,
+                             uint32_t slot)
+{
+   assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
+
+   struct v3dv_query *q = &pool->queries[query];
+
+   write_to_buffer(data, slot, do_64bit, q->value);
+   return VK_SUCCESS;
+}
+
+static VkResult
+write_performance_query_result(struct v3dv_device *device,
+                               struct v3dv_query_pool *pool,
+                               uint32_t query,
+                               bool do_64bit,
+                               void *data,
+                               uint32_t slot)
+{
+   assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+
+   struct v3dv_query *q = &pool->queries[query];
+   uint64_t counter_values[V3D_PERFCNT_NUM];
+
+   for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+      struct drm_v3d_perfmon_get_values req = {
+         .id = q->perf.kperfmon_ids[i],
+         .values_ptr = (uintptr_t)(&counter_values[i *
+                                   DRM_V3D_MAX_PERF_COUNTERS])
+      };
+
+      int ret = v3dv_ioctl(device->pdevice->render_fd,
+                           DRM_IOCTL_V3D_PERFMON_GET_VALUES,
+                           &req);
+
+      if (ret) {
+         fprintf(stderr, "failed to get perfmon values: %s\n", strerror(ret));
+         return vk_error(device, VK_ERROR_DEVICE_LOST);
+      }
+   }
+
+   for (uint32_t i = 0; i < pool->perfmon.ncounters; i++)
+      write_to_buffer(data, slot + i, do_64bit, counter_values[i]);
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+query_check_available(struct v3dv_device *device,
+                      struct v3dv_query *q,
+                      VkQueryType query_type)
 {
    if (!q->maybe_available)
       return VK_NOT_READY;
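Continuing the UAPI sketch from earlier: reading counters back, as write_performance_query_result() does above, is a single ioctl per kernel perfmon. The kernel copies one u64 per enabled counter into the user buffer (render_fd and kperfmon_id are assumed; errno/fprintf includes omitted):

    uint64_t values[DRM_V3D_MAX_PERF_COUNTERS];
    struct drm_v3d_perfmon_get_values req = {
       .id = kperfmon_id,
       .values_ptr = (uintptr_t)values,
    };
    if (drmIoctl(render_fd, DRM_IOCTL_V3D_PERFMON_GET_VALUES, &req) != 0)
       fprintf(stderr, "get values failed: %s\n", strerror(errno));

This is also why the driver chunks counter_values[] in strides of DRM_V3D_MAX_PERF_COUNTERS: each perfmon returns at most that many values.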
"Query job wait failed"); + return VK_SUCCESS; } static VkResult -query_is_available(struct v3dv_device *device, - struct v3dv_query *q, - VkQueryType query_type) +write_occlusion_query_result(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query, + bool do_64bit, + void *data, + uint32_t slot) +{ + assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION); + + if (vk_device_is_lost(&device->vk)) + return VK_ERROR_DEVICE_LOST; + + struct v3dv_query *q = &pool->queries[query]; + assert(q->bo && q->bo->map); + + const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset; + write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr)); + return VK_SUCCESS; +} + +static VkResult +write_timestamp_query_result(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query, + bool do_64bit, + void *data, + uint32_t slot) +{ + assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP); + + struct v3dv_query *q = &pool->queries[query]; + + write_to_buffer(data, slot, do_64bit, q->value); + return VK_SUCCESS; +} + +static VkResult +write_performance_query_result(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query, + bool do_64bit, + void *data, + uint32_t slot) +{ + assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR); + + struct v3dv_query *q = &pool->queries[query]; + uint64_t counter_values[V3D_PERFCNT_NUM]; + + for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) { + struct drm_v3d_perfmon_get_values req = { + .id = q->perf.kperfmon_ids[i], + .values_ptr = (uintptr_t)(&counter_values[i * + DRM_V3D_MAX_PERF_COUNTERS]) + }; + + int ret = v3dv_ioctl(device->pdevice->render_fd, + DRM_IOCTL_V3D_PERFMON_GET_VALUES, + &req); + + if (ret) { + fprintf(stderr, "failed to get perfmon values: %s\n", strerror(ret)); + return vk_error(device, VK_ERROR_DEVICE_LOST); + } + } + + for (uint32_t i = 0; i < pool->perfmon.ncounters; i++) + write_to_buffer(data, slot + i, do_64bit, counter_values[i]); + + return VK_SUCCESS; +} + +static VkResult +query_check_available(struct v3dv_device *device, + struct v3dv_query *q, + VkQueryType query_type) { if (!q->maybe_available) return VK_NOT_READY; @@ -192,70 +461,105 @@ query_is_available(struct v3dv_device *device, !v3dv_bo_wait(device, q->bo, 0)) return VK_NOT_READY; + if (query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR && + vk_sync_wait(&device->vk, q->perf.last_job_sync, + 0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS) + return VK_NOT_READY; + return VK_SUCCESS; } static VkResult -get_query_result(struct v3dv_device *device, - struct v3dv_query_pool *pool, - uint32_t query, - bool do_wait, - bool *available, - uint64_t *value) +write_query_result(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query, + bool do_64bit, + void *data, + uint32_t slot) +{ + switch (pool->query_type) { + case VK_QUERY_TYPE_OCCLUSION: + return write_occlusion_query_result(device, pool, query, do_64bit, + data, slot); + case VK_QUERY_TYPE_TIMESTAMP: + return write_timestamp_query_result(device, pool, query, do_64bit, + data, slot); + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: + return write_performance_query_result(device, pool, query, do_64bit, + data, slot); + default: + unreachable("Unsupported query type"); + } +} + +static VkResult +query_is_available(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t query, + bool do_wait, + bool *available) { struct v3dv_query *q = &pool->queries[query]; + assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION || 
+ (q->bo && q->bo->map)); + if (do_wait) { VkResult result = query_wait_available(device, q, pool->query_type); - if (result != VK_SUCCESS) + if (result != VK_SUCCESS) { + *available = false; return result; + } *available = true; } else { - VkResult result = query_is_available(device, q, pool->query_type); + VkResult result = query_check_available(device, q, pool->query_type); assert(result == VK_SUCCESS || result == VK_NOT_READY); *available = (result == VK_SUCCESS); } + return VK_SUCCESS; +} + +static uint32_t +get_query_result_count(struct v3dv_query_pool *pool) +{ switch (pool->query_type) { - case VK_QUERY_TYPE_OCCLUSION: { - const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset; - *value = (uint64_t) *((uint32_t *)query_addr); - return VK_SUCCESS; - } - + case VK_QUERY_TYPE_OCCLUSION: case VK_QUERY_TYPE_TIMESTAMP: - *value = q->value; - return VK_SUCCESS; - + return 1; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: + return pool->perfmon.ncounters; default: unreachable("Unsupported query type"); } } VkResult -v3dv_get_query_pool_results_cpu(struct v3dv_device *device, - struct v3dv_query_pool *pool, - uint32_t first, - uint32_t count, - void *data, - VkDeviceSize stride, - VkQueryResultFlags flags) +v3dv_get_query_pool_results(struct v3dv_device *device, + struct v3dv_query_pool *pool, + uint32_t first, + uint32_t count, + void *data, + VkDeviceSize stride, + VkQueryResultFlags flags) { assert(first < pool->query_count); assert(first + count <= pool->query_count); assert(data); - const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT; + const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT || + pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR; const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT; const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT; + uint32_t result_count = get_query_result_count(pool); + VkResult result = VK_SUCCESS; for (uint32_t i = first; i < first + count; i++) { bool available = false; - uint64_t value = 0; VkResult query_result = - get_query_result(device, pool, i, do_wait, &available, &value); + query_is_available(device, pool, i, do_wait, &available); if (query_result == VK_ERROR_DEVICE_LOST) result = VK_ERROR_DEVICE_LOST; @@ -273,11 +577,11 @@ v3dv_get_query_pool_results_cpu(struct v3dv_device *device, const bool write_result = available || do_partial; if (write_result) - write_query_result(data, slot, do_64bit, value); - slot++; + write_query_result(device, pool, i, do_64bit, data, slot); + slot += result_count; if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) - write_query_result(data, slot++, do_64bit, available ? 1u : 0u); + write_to_buffer(data, slot++, do_64bit, available ? 
@@ -400,3 +710,69 @@ v3dv_ResetQueryPool(VkDevice _device,
 
    v3dv_reset_query_pools(device, pool, firstQuery, queryCount);
 }
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
+   VkPhysicalDevice physicalDevice,
+   uint32_t queueFamilyIndex,
+   uint32_t *pCounterCount,
+   VkPerformanceCounterKHR *pCounters,
+   VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
+{
+   uint32_t desc_count = *pCounterCount;
+
+   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
+                          out, pCounters, pCounterCount);
+   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
+                          out_desc, pCounterDescriptions, &desc_count);
+
+   for (int i = 0; i < ARRAY_SIZE(v3dv_counters); i++) {
+      vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
+         counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
+         counter->scope = VK_QUERY_SCOPE_COMMAND_KHR;
+         counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
+
+         unsigned char sha1_result[20];
+         _mesa_sha1_compute(v3dv_counters[i][1], strlen(v3dv_counters[i][1]),
+                            sha1_result);
+
+         memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
+      }
+
+      vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
+                               &out_desc, desc) {
+         desc->flags = 0;
+         snprintf(desc->name, sizeof(desc->name), "%s",
+                  v3dv_counters[i][1]);
+         snprintf(desc->category, sizeof(desc->category), "%s",
+                  v3dv_counters[i][0]);
+         snprintf(desc->description, sizeof(desc->description), "%s",
+                  v3dv_counters[i][2]);
+      }
+   }
+
+   return vk_outarray_status(&out);
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
+   VkPhysicalDevice physicalDevice,
+   const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
+   uint32_t *pNumPasses)
+{
+   *pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount,
+                              DRM_V3D_MAX_PERF_COUNTERS);
+}
+
+VKAPI_ATTR VkResult VKAPI_CALL
+v3dv_AcquireProfilingLockKHR(
+   VkDevice _device,
+   const VkAcquireProfilingLockInfoKHR *pInfo)
+{
+   return VK_SUCCESS;
+}
+
+VKAPI_ATTR void VKAPI_CALL
+v3dv_ReleaseProfilingLockKHR(VkDevice device)
+{
+}
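Typical discovery sequence for the entry points above; a sketch assuming queue family 0, a valid phys_dev, and stdlib.h for calloc. Since one kernel perfmon covers at most DRM_V3D_MAX_PERF_COUNTERS counters, asking for all 87 counters here yields *pNumPasses = 3:

    uint32_t count = 0;
    vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
       phys_dev, 0, &count, NULL, NULL);
    VkPerformanceCounterKHR *counters = calloc(count, sizeof(*counters));
    VkPerformanceCounterDescriptionKHR *descs = calloc(count, sizeof(*descs));
    for (uint32_t i = 0; i < count; i++) {
       counters[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR;
       descs[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR;
    }
    vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
       phys_dev, 0, &count, counters, descs);

    uint32_t num_passes;
    VkQueryPoolPerformanceCreateInfoKHR ci = { /* as in pool creation above */ };
    vkGetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(phys_dev, &ci,
                                                            &num_passes);

The counter UUIDs are derived by SHA-1 hashing the counter name, so they stay stable across driver builds without a hand-maintained UUID table.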
diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
index 799139b9174..a3d92466d88 100644
--- a/src/broadcom/vulkan/v3dv_queue.c
+++ b/src/broadcom/vulkan/v3dv_queue.c
@@ -137,27 +137,129 @@ handle_reset_query_cpu_job(struct v3dv_queue *queue, struct v3dv_job *job,
    if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION)
       v3dv_bo_wait(job->device, info->pool->bo, PIPE_TIMEOUT_INFINITE);
 
+   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+      struct vk_sync_wait waits[info->count];
+      unsigned wait_count = 0;
+      for (int i = 0; i < info->count; i++) {
+         struct v3dv_query *query = &info->pool->queries[i];
+         /* Only wait for a query if we've used it otherwise we will be
+          * waiting forever for the fence to become signaled.
+          */
+         if (query->maybe_available) {
+            waits[wait_count] = (struct vk_sync_wait){
+               .sync = info->pool->queries[i].perf.last_job_sync
+            };
+            wait_count++;
+         };
+      }
+
+      VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
+                                          VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
+
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
    v3dv_reset_query_pools(job->device, info->pool, info->first, info->count);
 
    return VK_SUCCESS;
 }
 
 static VkResult
-handle_end_query_cpu_job(struct v3dv_job *job)
+export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int *fd)
 {
+   int err;
+   if (job->device->pdevice->caps.multisync) {
+      static const enum v3dv_queue_type queues_to_sync[] = {
+         V3DV_QUEUE_CL,
+         V3DV_QUEUE_CSD,
+      };
+
+      for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
+         enum v3dv_queue_type queue_type = queues_to_sync[i];
+         int tmp_fd = -1;
+
+         err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
+                                        queue->last_job_syncs.syncs[queue_type],
+                                        &tmp_fd);
+
+         if (err) {
+            close(*fd);
+            return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+                             "sync file export failed: %m");
+         }
+
+         err = sync_accumulate("v3dv", fd, tmp_fd);
+
+         if (err) {
+            close(tmp_fd);
+            close(*fd);
+            return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+                             "failed to accumulate sync files: %m");
+         }
+      }
+   } else {
+      err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
+                                     queue->last_job_syncs.syncs[V3DV_QUEUE_ANY],
+                                     fd);
+
+      if (err) {
+         return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
+                          "sync file export failed: %m");
+      }
+   }
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
+{
+   VkResult result = VK_SUCCESS;
+
+   mtx_lock(&job->device->query_mutex);
 
    struct v3dv_end_query_cpu_job_info *info = &job->cpu.query_end;
+   struct v3dv_queue *queue = &job->device->queue;
+
+   int err = 0;
+   int fd = -1;
+
+   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+      result = export_perfmon_last_job_sync(queue, job, &fd);
+
+      if (result != VK_SUCCESS)
+         goto fail;
+
+      assert(fd >= 0);
+   }
+
    for (uint32_t i = 0; i < info->count; i++) {
       assert(info->query + i < info->pool->query_count);
       struct v3dv_query *query = &info->pool->queries[info->query + i];
+
+      if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+         uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
+         err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd,
+                                        syncobj, fd);
+
+         if (err) {
+            result = vk_errorf(queue, VK_ERROR_UNKNOWN,
+                               "sync file import failed: %m");
+            goto fail;
+         }
+      }
+
       query->maybe_available = true;
    }
 
+fail:
+   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
+      close(fd);
+
    cnd_broadcast(&job->device->query_ended);
    mtx_unlock(&job->device->query_mutex);
 
-   return VK_SUCCESS;
+   return result;
 }
 
 static VkResult
@@ -176,13 +278,13 @@ handle_copy_query_results_cpu_job(struct v3dv_job *job)
    uint8_t *offset = ((uint8_t *) bo->map) +
                      info->offset + info->dst->mem_offset;
 
-   v3dv_get_query_pool_results_cpu(job->device,
-                                   info->pool,
-                                   info->first,
-                                   info->count,
-                                   offset,
-                                   info->stride,
-                                   info->flags);
+   v3dv_get_query_pool_results(job->device,
+                               info->pool,
+                               info->first,
+                               info->count,
+                               offset,
+                               info->stride,
+                               info->flags);
 
    return VK_SUCCESS;
 }
@@ -635,6 +737,7 @@ fail:
 static VkResult
 handle_cl_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
+              uint32_t counter_pass_idx,
               struct v3dv_submit_sync_info *sync_info,
               bool signal_syncs)
 {
@@ -678,9 +781,15 @@ handle_cl_job(struct v3dv_queue *queue,
    assert(bo_idx == submit.bo_handle_count);
    submit.bo_handles = (uintptr_t)(void *)bo_handles;
 
+   submit.perfmon_id = job->perf ?
+      job->perf->kperfmon_ids[counter_pass_idx] : 0;
+   const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id;
+   queue->last_perfmon_id = submit.perfmon_id;
+
    /* We need a binning sync if we are waiting on a semaphore with a wait stage
     * that involves the geometry pipeline, or if the job comes after a pipeline
-    * barrier that involves geometry stages (needs_bcl_sync).
+    * barrier that involves geometry stages (needs_bcl_sync), or if
+    * performance queries are in use.
    *
    * We need a render sync if the job doesn't need a binning sync but has
    * still been flagged for serialization. It should be noted that RCL jobs
@@ -705,6 +814,7 @@ handle_cl_job(struct v3dv_queue *queue,
                          VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
                          VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT);
    }
+   needs_bcl_sync |= needs_perf_sync;
 
    bool needs_rcl_sync = job->serialize && !needs_bcl_sync;
 
@@ -795,6 +905,7 @@ handle_tfu_job(struct v3dv_queue *queue,
 static VkResult
 handle_csd_job(struct v3dv_queue *queue,
                struct v3dv_job *job,
+               uint32_t counter_pass_idx,
                struct v3dv_submit_sync_info *sync_info,
                bool signal_syncs)
 {
@@ -835,6 +946,9 @@ handle_csd_job(struct v3dv_queue *queue,
       submit->in_sync = needs_sync ? last_job_sync : 0;
       submit->out_sync = last_job_sync;
    }
+   submit->perfmon_id = job->perf ?
+      job->perf->kperfmon_ids[counter_pass_idx] : 0;
+   queue->last_perfmon_id = submit->perfmon_id;
 
    int ret = v3dv_ioctl(device->pdevice->render_fd,
                         DRM_IOCTL_V3D_SUBMIT_CSD, submit);
@@ -858,20 +972,21 @@ handle_csd_job(struct v3dv_queue *queue,
 static VkResult
 queue_handle_job(struct v3dv_queue *queue,
                  struct v3dv_job *job,
+                 uint32_t counter_pass_idx,
                  struct v3dv_submit_sync_info *sync_info,
                  bool signal_syncs)
 {
    switch (job->type) {
    case V3DV_JOB_TYPE_GPU_CL:
-      return handle_cl_job(queue, job, sync_info, signal_syncs);
+      return handle_cl_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
    case V3DV_JOB_TYPE_GPU_TFU:
       return handle_tfu_job(queue, job, sync_info, signal_syncs);
    case V3DV_JOB_TYPE_GPU_CSD:
-      return handle_csd_job(queue, job, sync_info, signal_syncs);
+      return handle_csd_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
    case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
       return handle_reset_query_cpu_job(queue, job, sync_info);
    case V3DV_JOB_TYPE_CPU_END_QUERY:
-      return handle_end_query_cpu_job(job);
+      return handle_end_query_cpu_job(job, counter_pass_idx);
    case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
       return handle_copy_query_results_cpu_job(job);
    case V3DV_JOB_TYPE_CPU_SET_EVENT:
@@ -913,6 +1028,7 @@ queue_create_noop_job(struct v3dv_queue *queue)
 
 static VkResult
 queue_submit_noop_job(struct v3dv_queue *queue,
+                      uint32_t counter_pass_idx,
                       struct v3dv_submit_sync_info *sync_info,
                       bool signal_syncs)
 {
@@ -923,7 +1039,8 @@ queue_submit_noop_job(struct v3dv_queue *queue,
    }
 
    assert(queue->noop_job);
-   return queue_handle_job(queue, queue->noop_job, sync_info, signal_syncs);
+   return queue_handle_job(queue, queue->noop_job, counter_pass_idx,
+                           sync_info, signal_syncs);
 }
 
 VkResult
@@ -953,7 +1070,8 @@ v3dv_queue_driver_submit(struct vk_queue *vk_queue,
 
       list_for_each_entry_safe(struct v3dv_job, job,
                                &cmd_buffer->jobs, list_link) {
-         result = queue_handle_job(queue, job, &sync_info, false);
+         result = queue_handle_job(queue, job, submit->perf_pass_index,
+                                   &sync_info, false);
         if (result != VK_SUCCESS)
            return result;
      }
@@ -964,7 +1082,8 @@ v3dv_queue_driver_submit(struct vk_queue *vk_queue,
       * barrier state to limit the queues we serialize against.
       */
      if (cmd_buffer->state.barrier.dst_mask) {
-         result = queue_submit_noop_job(queue, &sync_info, false);
+         result = queue_submit_noop_job(queue, submit->perf_pass_index,
+                                        &sync_info, false);
         if (result != VK_SUCCESS)
            return result;
      }
@@ -976,7 +1095,8 @@ v3dv_queue_driver_submit(struct vk_queue *vk_queue,
    * requirements.
    */
   if (submit->signal_count > 0) {
-      result = queue_submit_noop_job(queue, &sync_info, true);
+      result = queue_submit_noop_job(queue, submit->perf_pass_index,
+                                     &sync_info, true);
      if (result != VK_SUCCESS)
         return result;
   }
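Where counter_pass_idx comes from: the common Vulkan runtime propagates VkPerformanceQuerySubmitInfoKHR::counterPassIndex into vk_queue_submit::perf_pass_index, which is what selects kperfmon_ids[counter_pass_index] at submit time. A sketch of the application loop driving one pass per required kernel perfmon (device, queue, cmd and num_passes are assumed; error handling omitted):

    vkAcquireProfilingLockKHR(device, &(VkAcquireProfilingLockInfoKHR){
       .sType = VK_STRUCTURE_TYPE_ACQUIRE_PROFILING_LOCK_INFO_KHR,
    });
    for (uint32_t pass = 0; pass < num_passes; pass++) {
       VkPerformanceQuerySubmitInfoKHR pass_info = {
          .sType = VK_STRUCTURE_TYPE_PERFORMANCE_QUERY_SUBMIT_INFO_KHR,
          .counterPassIndex = pass,   /* selects kperfmon_ids[pass] above */
       };
       VkSubmitInfo submit = {
          .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
          .pNext = &pass_info,
          .commandBufferCount = 1,
          .pCommandBuffers = &cmd,
       };
       vkQueueSubmit(queue, 1, &submit, VK_NULL_HANDLE);
       vkQueueWaitIdle(queue);
    }
    vkReleaseProfilingLockKHR(device);

Note how handle_cl_job() forces a binning sync whenever the submitted perfmon id differs from queue->last_perfmon_id: serializing on the perfmon switch is what keeps counter results from different passes (or different query pools) from bleeding into each other.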