/* Query pool implementation for the V3DV (Broadcom V3D / Raspberry Pi)
 * Vulkan driver.
 */
/*
 * Copyright © 2020 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "v3dv_private.h"

#include <errno.h>

#include "util/timespec.h"
/* Table of performance counters exposed through
 * VK_KHR_performance_query: { category, name, description }.
 *
 * Entries are indexed by the counter index the application selects and
 * that we later program into the kernel perfmon (pool->perfmon.counters).
 * NOTE(review): the ordering is assumed to match the v3d kernel driver's
 * V3D_PERFCNT_* enumeration — confirm against the kernel UAPI before
 * reordering or inserting entries.
 */
static const char *v3dv_counters[][3] = {
   {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
   {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
   {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"},
   {"FEP", "FEP-valid-quads", "[FEP] Valid quads"},
   {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"},
   {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"},
   {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"},
   {"TLB", "TLB-quads-with-zero-coverage", "[TLB] Quads with all pixels having zero coverage"},
   {"TLB", "TLB-quads-with-non-zero-coverage", "[TLB] Quads with any pixels having non-zero coverage"},
   {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"},
   {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"},
   {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"},
   {"PTB", "PTB-primitives-discared-reversed", "[PTB] Primitives that are discarded because they are reversed"},
   {"QPU", "QPU-total-idle-clk-cycles", "[QPU] Total idle clock cycles for all QPUs"},
   {"QPU", "QPU-total-active-clk-cycles-vertex-coord-shading", "[QPU] Total active clock cycles for all QPUs doing vertex/coordinate/user shading (counts only when QPU is not stalled)"},
   {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"},
   {"QPU", "QPU-total-clk-cycles-executing-valid-instr", "[QPU] Total clock cycles for all QPUs executing valid instructions"},
   {"QPU", "QPU-total-clk-cycles-waiting-TMU", "[QPU] Total clock cycles for all QPUs stalled waiting for TMUs only (counter won't increment if QPU also stalling for another reason)"},
   {"QPU", "QPU-total-clk-cycles-waiting-scoreboard", "[QPU] Total clock cycles for all QPUs stalled waiting for Scoreboard only (counter won't increment if QPU also stalling for another reason)"},
   {"QPU", "QPU-total-clk-cycles-waiting-varyings", "[QPU] Total clock cycles for all QPUs stalled waiting for Varyings only (counter won't increment if QPU also stalling for another reason)"},
   {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"},
   {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"},
   {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"},
   {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"},
   {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"},
   {"TMU", "TMU-total-text-cache-miss", "[TMU] Total texture cache misses (number of fetches from memory/L2cache)"},
   {"VPM", "VPM-total-clk-cycles-VDW-stalled", "[VPM] Total clock cycles VDW is stalled waiting for VPM access"},
   {"VPM", "VPM-total-clk-cycles-VCD-stalled", "[VPM] Total clock cycles VCD is stalled waiting for VPM access"},
   {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"},
   {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"},
   {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"},
   {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"},
   {"CORE", "cycle-count", "[CORE] Cycle counter"},
   {"QPU", "QPU-total-clk-cycles-waiting-vertex-coord-shading", "[QPU] Total stalled clock cycles for all QPUs doing vertex/coordinate/user shading"},
   {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"},
   {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"},
   {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"},
   {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"},
   {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"},
   {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"},
   {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"},
   {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"},
   {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"},
   {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"},
   {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"},
   {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"},
   {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"},
   {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"},
   {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"},
   {"TMU", "TMU-total-config-access", "[TMU] Total config accesses"},
   {"L2T", "L2T-no-id-stalled", "[L2T] No ID stall"},
   {"L2T", "L2T-command-queue-stalled", "[L2T] Command queue full stall"},
   {"L2T", "L2T-TMU-writes", "[L2T] TMU write accesses"},
   {"TMU", "TMU-active-cycles", "[TMU] Active cycles"},
   {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"},
   {"CLE", "CLE-thread-active-cycles", "[CLE] Bin or render thread active cycles"},
   {"L2T", "L2T-TMU-reads", "[L2T] TMU read accesses"},
   {"L2T", "L2T-CLE-reads", "[L2T] CLE read accesses"},
   {"L2T", "L2T-VCD-reads", "[L2T] VCD read accesses"},
   {"L2T", "L2T-TMU-config-reads", "[L2T] TMU CFG read accesses"},
   {"L2T", "L2T-SLC0-reads", "[L2T] SLC0 read accesses"},
   {"L2T", "L2T-SLC1-reads", "[L2T] SLC1 read accesses"},
   {"L2T", "L2T-SLC2-reads", "[L2T] SLC2 read accesses"},
   {"L2T", "L2T-TMU-write-miss", "[L2T] TMU write misses"},
   {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"},
   {"L2T", "L2T-CLE-read-miss", "[L2T] CLE read misses"},
   {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"},
   {"L2T", "L2T-TMU-config-read-miss", "[L2T] TMU CFG read misses"},
   {"L2T", "L2T-SLC0-read-miss", "[L2T] SLC0 read misses"},
   {"L2T", "L2T-SLC1-read-miss", "[L2T] SLC1 read misses"},
   {"L2T", "L2T-SLC2-read-miss", "[L2T] SLC2 read misses"},
   {"CORE", "core-memory-writes", "[CORE] Total memory writes"},
   {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"},
   {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"},
   {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"},
   {"CORE", "core-memory-reads", "[CORE] Total memory reads"},
   {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"},
   {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"},
   {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"},
   {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"},
   {"GMP", "GMP-memory-reads", "[GMP] Total memory reads"},
   {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"},
   {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"},
   {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"},
   {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"},
   {"TMU", "TMU-MRU-hits", "[TMU] Total MRU hits"},
   {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
};
static void
|
|
kperfmon_create(struct v3dv_device *device,
|
|
struct v3dv_query_pool *pool,
|
|
uint32_t query)
|
|
{
|
|
for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
|
|
assert(i * DRM_V3D_MAX_PERF_COUNTERS < pool->perfmon.ncounters);
|
|
|
|
struct drm_v3d_perfmon_create req = {
|
|
.ncounters = MIN2(pool->perfmon.ncounters -
|
|
i * DRM_V3D_MAX_PERF_COUNTERS,
|
|
DRM_V3D_MAX_PERF_COUNTERS),
|
|
};
|
|
memcpy(req.counters,
|
|
&pool->perfmon.counters[i * DRM_V3D_MAX_PERF_COUNTERS],
|
|
req.ncounters);
|
|
|
|
int ret = v3dv_ioctl(device->pdevice->render_fd,
|
|
DRM_IOCTL_V3D_PERFMON_CREATE,
|
|
&req);
|
|
if (ret)
|
|
fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret));
|
|
|
|
pool->queries[query].perf.kperfmon_ids[i] = req.id;
|
|
}
|
|
}
|
|
|
|
static void
|
|
kperfmon_destroy(struct v3dv_device *device,
|
|
struct v3dv_query_pool *pool,
|
|
uint32_t query)
|
|
{
|
|
/* Skip destroying if never created */
|
|
if (!pool->queries[query].perf.kperfmon_ids[0])
|
|
return;
|
|
|
|
for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
|
|
struct drm_v3d_perfmon_destroy req = {
|
|
.id = pool->queries[query].perf.kperfmon_ids[i]
|
|
};
|
|
|
|
int ret = v3dv_ioctl(device->pdevice->render_fd,
|
|
DRM_IOCTL_V3D_PERFMON_DESTROY,
|
|
&req);
|
|
|
|
if (ret) {
|
|
fprintf(stderr, "Failed to destroy perfmon %u: %s\n",
|
|
req.id, strerror(ret));
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Creates a query pool for occlusion, timestamp or performance queries.
 *
 * Occlusion pools allocate and map one BO shared by all queries: 16
 * consecutive 4-byte counters per group, each group aligned to 1024 bytes.
 * Performance pools record the requested counter indices and how many
 * kernel perfmons are needed (each holds up to DRM_V3D_MAX_PERF_COUNTERS
 * counters); the perfmons themselves are created later, on query reset.
 * Timestamp queries only need the CPU-side value field.
 *
 * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY /
 * VK_ERROR_OUT_OF_DEVICE_MEMORY (or a vk_sync_create error) after undoing
 * any partial initialization.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_CreateQueryPool(VkDevice _device,
                     const VkQueryPoolCreateInfo *pCreateInfo,
                     const VkAllocationCallbacks *pAllocator,
                     VkQueryPool *pQueryPool)
{
   V3DV_FROM_HANDLE(v3dv_device, device, _device);

   assert(pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION ||
          pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP ||
          pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
   assert(pCreateInfo->queryCount > 0);

   struct v3dv_query_pool *pool =
      vk_object_zalloc(&device->vk, pAllocator, sizeof(*pool),
                       VK_OBJECT_TYPE_QUERY_POOL);
   if (pool == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->query_type = pCreateInfo->queryType;
   pool->query_count = pCreateInfo->queryCount;

   /* Counts queries whose per-query state (notably the perf sync object)
    * has been initialized, so the fail path only tears down those.
    */
   uint32_t query_idx = 0;
   VkResult result;

   const uint32_t pool_bytes = sizeof(struct v3dv_query) * pool->query_count;
   pool->queries = vk_alloc2(&device->vk.alloc, pAllocator, pool_bytes, 8,
                             VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool->queries == NULL) {
      result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
      goto fail;
   }

   switch (pool->query_type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      /* The hardware allows us to setup groups of 16 queries in consecutive
       * 4-byte addresses, requiring only that each group of 16 queries is
       * aligned to a 1024 byte boundary.
       */
      const uint32_t query_groups = DIV_ROUND_UP(pool->query_count, 16);
      const uint32_t bo_size = query_groups * 1024;
      pool->bo = v3dv_bo_alloc(device, bo_size, "query", true);
      if (!pool->bo) {
         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         goto fail;
      }
      /* Keep the BO mapped for the pool's lifetime; results are read
       * directly from the CPU mapping.
       */
      if (!v3dv_bo_map(device, pool->bo, bo_size)) {
         result = vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
         goto fail;
      }
      break;
   }
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      const VkQueryPoolPerformanceCreateInfoKHR *pq_info =
         vk_find_struct_const(pCreateInfo->pNext,
                              QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);

      assert(pq_info);
      assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM);

      pool->perfmon.ncounters = pq_info->counterIndexCount;
      for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
         pool->perfmon.counters[i] = pq_info->pCounterIndices[i];

      /* One kernel perfmon per DRM_V3D_MAX_PERF_COUNTERS counters. */
      pool->perfmon.nperfmons = DIV_ROUND_UP(pool->perfmon.ncounters,
                                             DRM_V3D_MAX_PERF_COUNTERS);

      assert(pool->perfmon.nperfmons <= V3DV_MAX_PERFMONS);
      break;
   }
   case VK_QUERY_TYPE_TIMESTAMP:
      /* No shared pool-level state needed. */
      break;
   default:
      unreachable("Unsupported query type");
   }

   /* Initialize per-query state. */
   for (; query_idx < pool->query_count; query_idx++) {
      pool->queries[query_idx].maybe_available = false;
      switch (pool->query_type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         /* Counter address layout matching the 16-per-1024-byte-group
          * scheme described above.
          */
         const uint32_t query_group = query_idx / 16;
         const uint32_t query_offset = query_group * 1024 + (query_idx % 16) * 4;
         pool->queries[query_idx].bo = pool->bo;
         pool->queries[query_idx].offset = query_offset;
         break;
      }
      case VK_QUERY_TYPE_TIMESTAMP:
         pool->queries[query_idx].value = 0;
         break;
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
         result = vk_sync_create(&device->vk,
                                 &device->pdevice->drm_syncobj_type, 0, 0,
                                 &pool->queries[query_idx].perf.last_job_sync);
         if (result != VK_SUCCESS)
            goto fail;

         /* 0 means "not created yet"; kperfmon_create runs on reset. */
         for (uint32_t j = 0; j < pool->perfmon.nperfmons; j++)
            pool->queries[query_idx].perf.kperfmon_ids[j] = 0;
         break;
      }
      default:
         unreachable("Unsupported query type");
      }
   }

   *pQueryPool = v3dv_query_pool_to_handle(pool);

   return VK_SUCCESS;

fail:
   /* Only queries in [0, query_idx) have a sync object to destroy. */
   if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      for (uint32_t j = 0; j < query_idx; j++)
         vk_sync_destroy(&device->vk, pool->queries[j].perf.last_job_sync);
   }

   if (pool->bo)
      v3dv_bo_free(device, pool->bo);
   if (pool->queries)
      vk_free2(&device->vk.alloc, pAllocator, pool->queries);
   vk_object_free(&device->vk, pAllocator, pool);

   return result;
}
VKAPI_ATTR void VKAPI_CALL
|
|
v3dv_DestroyQueryPool(VkDevice _device,
|
|
VkQueryPool queryPool,
|
|
const VkAllocationCallbacks *pAllocator)
|
|
{
|
|
V3DV_FROM_HANDLE(v3dv_device, device, _device);
|
|
V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
|
|
|
|
if (!pool)
|
|
return;
|
|
|
|
if (pool->bo)
|
|
v3dv_bo_free(device, pool->bo);
|
|
|
|
if (pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
|
|
for (uint32_t i = 0; i < pool->query_count; i++) {
|
|
kperfmon_destroy(device, pool, i);
|
|
vk_sync_destroy(&device->vk, pool->queries[i].perf.last_job_sync);
|
|
}
|
|
}
|
|
|
|
if (pool->queries)
|
|
vk_free2(&device->vk.alloc, pAllocator, pool->queries);
|
|
|
|
vk_object_free(&device->vk, pAllocator, pool);
|
|
}
|
|
|
|
/*
 * Stores a query result into the caller's output buffer at element `idx`,
 * as either a 64-bit value or a truncated 32-bit value depending on the
 * requested result width.
 */
static void
write_to_buffer(void *dst, uint32_t idx, bool do_64bit, uint64_t value)
{
   if (do_64bit)
      ((uint64_t *) dst)[idx] = value;
   else
      ((uint32_t *) dst)[idx] = (uint32_t) value;
}
static VkResult
|
|
query_wait_available(struct v3dv_device *device,
|
|
struct v3dv_query *q,
|
|
VkQueryType query_type)
|
|
{
|
|
if (!q->maybe_available) {
|
|
struct timespec timeout;
|
|
timespec_get(&timeout, TIME_UTC);
|
|
timespec_add_msec(&timeout, &timeout, 2000);
|
|
|
|
VkResult result = VK_SUCCESS;
|
|
|
|
mtx_lock(&device->query_mutex);
|
|
while (!q->maybe_available) {
|
|
if (vk_device_is_lost(&device->vk)) {
|
|
result = VK_ERROR_DEVICE_LOST;
|
|
break;
|
|
}
|
|
|
|
int ret = cnd_timedwait(&device->query_ended,
|
|
&device->query_mutex,
|
|
&timeout);
|
|
if (ret != thrd_success) {
|
|
mtx_unlock(&device->query_mutex);
|
|
result = vk_device_set_lost(&device->vk, "Query wait failed");
|
|
break;
|
|
}
|
|
}
|
|
mtx_unlock(&device->query_mutex);
|
|
|
|
if (result != VK_SUCCESS)
|
|
return result;
|
|
}
|
|
|
|
if (query_type == VK_QUERY_TYPE_OCCLUSION &&
|
|
!v3dv_bo_wait(device, q->bo, 0xffffffffffffffffull))
|
|
return vk_device_set_lost(&device->vk, "Query BO wait failed: %m");
|
|
|
|
if (query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
|
|
vk_sync_wait(&device->vk, q->perf.last_job_sync,
|
|
0, VK_SYNC_WAIT_COMPLETE, UINT64_MAX) != VK_SUCCESS)
|
|
return vk_device_set_lost(&device->vk, "Query job wait failed");
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static VkResult
|
|
write_occlusion_query_result(struct v3dv_device *device,
|
|
struct v3dv_query_pool *pool,
|
|
uint32_t query,
|
|
bool do_64bit,
|
|
void *data,
|
|
uint32_t slot)
|
|
{
|
|
assert(pool && pool->query_type == VK_QUERY_TYPE_OCCLUSION);
|
|
|
|
if (vk_device_is_lost(&device->vk))
|
|
return VK_ERROR_DEVICE_LOST;
|
|
|
|
struct v3dv_query *q = &pool->queries[query];
|
|
assert(q->bo && q->bo->map);
|
|
|
|
const uint8_t *query_addr = ((uint8_t *) q->bo->map) + q->offset;
|
|
write_to_buffer(data, slot, do_64bit, (uint64_t) *((uint32_t *)query_addr));
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static VkResult
|
|
write_timestamp_query_result(struct v3dv_device *device,
|
|
struct v3dv_query_pool *pool,
|
|
uint32_t query,
|
|
bool do_64bit,
|
|
void *data,
|
|
uint32_t slot)
|
|
{
|
|
assert(pool && pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
|
|
|
|
struct v3dv_query *q = &pool->queries[query];
|
|
|
|
write_to_buffer(data, slot, do_64bit, q->value);
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static VkResult
|
|
write_performance_query_result(struct v3dv_device *device,
|
|
struct v3dv_query_pool *pool,
|
|
uint32_t query,
|
|
bool do_64bit,
|
|
void *data,
|
|
uint32_t slot)
|
|
{
|
|
assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
|
|
|
|
struct v3dv_query *q = &pool->queries[query];
|
|
uint64_t counter_values[V3D_PERFCNT_NUM];
|
|
|
|
for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
|
|
struct drm_v3d_perfmon_get_values req = {
|
|
.id = q->perf.kperfmon_ids[i],
|
|
.values_ptr = (uintptr_t)(&counter_values[i *
|
|
DRM_V3D_MAX_PERF_COUNTERS])
|
|
};
|
|
|
|
int ret = v3dv_ioctl(device->pdevice->render_fd,
|
|
DRM_IOCTL_V3D_PERFMON_GET_VALUES,
|
|
&req);
|
|
|
|
if (ret) {
|
|
fprintf(stderr, "failed to get perfmon values: %s\n", strerror(ret));
|
|
return vk_error(device, VK_ERROR_DEVICE_LOST);
|
|
}
|
|
}
|
|
|
|
for (uint32_t i = 0; i < pool->perfmon.ncounters; i++)
|
|
write_to_buffer(data, slot + i, do_64bit, counter_values[i]);
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static VkResult
|
|
query_check_available(struct v3dv_device *device,
|
|
struct v3dv_query *q,
|
|
VkQueryType query_type)
|
|
{
|
|
if (!q->maybe_available)
|
|
return VK_NOT_READY;
|
|
|
|
if (query_type == VK_QUERY_TYPE_OCCLUSION &&
|
|
!v3dv_bo_wait(device, q->bo, 0))
|
|
return VK_NOT_READY;
|
|
|
|
if (query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR &&
|
|
vk_sync_wait(&device->vk, q->perf.last_job_sync,
|
|
0, VK_SYNC_WAIT_COMPLETE, 0) != VK_SUCCESS)
|
|
return VK_NOT_READY;
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static VkResult
|
|
write_query_result(struct v3dv_device *device,
|
|
struct v3dv_query_pool *pool,
|
|
uint32_t query,
|
|
bool do_64bit,
|
|
void *data,
|
|
uint32_t slot)
|
|
{
|
|
switch (pool->query_type) {
|
|
case VK_QUERY_TYPE_OCCLUSION:
|
|
return write_occlusion_query_result(device, pool, query, do_64bit,
|
|
data, slot);
|
|
case VK_QUERY_TYPE_TIMESTAMP:
|
|
return write_timestamp_query_result(device, pool, query, do_64bit,
|
|
data, slot);
|
|
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
|
|
return write_performance_query_result(device, pool, query, do_64bit,
|
|
data, slot);
|
|
default:
|
|
unreachable("Unsupported query type");
|
|
}
|
|
}
|
|
|
|
static VkResult
|
|
query_is_available(struct v3dv_device *device,
|
|
struct v3dv_query_pool *pool,
|
|
uint32_t query,
|
|
bool do_wait,
|
|
bool *available)
|
|
{
|
|
struct v3dv_query *q = &pool->queries[query];
|
|
|
|
assert(pool->query_type != VK_QUERY_TYPE_OCCLUSION ||
|
|
(q->bo && q->bo->map));
|
|
|
|
if (do_wait) {
|
|
VkResult result = query_wait_available(device, q, pool->query_type);
|
|
if (result != VK_SUCCESS) {
|
|
*available = false;
|
|
return result;
|
|
}
|
|
|
|
*available = true;
|
|
} else {
|
|
VkResult result = query_check_available(device, q, pool->query_type);
|
|
assert(result == VK_SUCCESS || result == VK_NOT_READY);
|
|
*available = (result == VK_SUCCESS);
|
|
}
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static uint32_t
|
|
get_query_result_count(struct v3dv_query_pool *pool)
|
|
{
|
|
switch (pool->query_type) {
|
|
case VK_QUERY_TYPE_OCCLUSION:
|
|
case VK_QUERY_TYPE_TIMESTAMP:
|
|
return 1;
|
|
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
|
|
return pool->perfmon.ncounters;
|
|
default:
|
|
unreachable("Unsupported query type");
|
|
}
|
|
}
|
|
|
|
/**
 * Core implementation behind vkGetQueryPoolResults: reads results for
 * queries [first, first + count) into `data`, one query per `stride`
 * bytes, honoring the WAIT / PARTIAL / WITH_AVAILABILITY / 64_BIT flags.
 *
 * Returns VK_SUCCESS, VK_NOT_READY if any query was unavailable and
 * neither WAIT nor PARTIAL was requested, or VK_ERROR_DEVICE_LOST.
 */
VkResult
v3dv_get_query_pool_results(struct v3dv_device *device,
                            struct v3dv_query_pool *pool,
                            uint32_t first,
                            uint32_t count,
                            void *data,
                            VkDeviceSize stride,
                            VkQueryResultFlags flags)
{
   assert(first < pool->query_count);
   assert(first + count <= pool->query_count);
   assert(data);

   /* Performance counters always use 64-bit storage (they are advertised
    * as VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR), so force 64-bit writes
    * for performance pools regardless of the 64_BIT flag.
    */
   const bool do_64bit = flags & VK_QUERY_RESULT_64_BIT ||
      pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR;
   const bool do_wait = flags & VK_QUERY_RESULT_WAIT_BIT;
   const bool do_partial = flags & VK_QUERY_RESULT_PARTIAL_BIT;

   /* Number of result elements each query contributes. */
   uint32_t result_count = get_query_result_count(pool);

   VkResult result = VK_SUCCESS;
   for (uint32_t i = first; i < first + count; i++) {
      bool available = false;
      /* DEVICE_LOST overrides any later VK_NOT_READY. */
      VkResult query_result =
         query_is_available(device, pool, i, do_wait, &available);
      if (query_result == VK_ERROR_DEVICE_LOST)
         result = VK_ERROR_DEVICE_LOST;

      /**
       * From the Vulkan 1.0 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *     both not set then no result values are written to pData for queries
       *     that are in the unavailable state at the time of the call, and
       *     vkGetQueryPoolResults returns VK_NOT_READY. However, availability
       *     state is still written to pData for those queries if
       *     VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      uint32_t slot = 0;

      const bool write_result = available || do_partial;
      if (write_result)
         write_query_result(device, pool, i, do_64bit, data, slot);
      slot += result_count;

      /* Availability flag goes after the result values, per the spec. */
      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         write_to_buffer(data, slot++, do_64bit, available ? 1u : 0u);

      if (!write_result && result != VK_ERROR_DEVICE_LOST)
         result = VK_NOT_READY;

      /* void-pointer arithmetic (GNU extension): advance in bytes. */
      data += stride;
   }

   return result;
}
VKAPI_ATTR VkResult VKAPI_CALL
|
|
v3dv_GetQueryPoolResults(VkDevice _device,
|
|
VkQueryPool queryPool,
|
|
uint32_t firstQuery,
|
|
uint32_t queryCount,
|
|
size_t dataSize,
|
|
void *pData,
|
|
VkDeviceSize stride,
|
|
VkQueryResultFlags flags)
|
|
{
|
|
V3DV_FROM_HANDLE(v3dv_device, device, _device);
|
|
V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
|
|
|
|
return v3dv_get_query_pool_results(device, pool, firstQuery, queryCount,
|
|
pData, stride, flags);
|
|
}
|
|
|
|
VKAPI_ATTR void VKAPI_CALL
|
|
v3dv_CmdResetQueryPool(VkCommandBuffer commandBuffer,
|
|
VkQueryPool queryPool,
|
|
uint32_t firstQuery,
|
|
uint32_t queryCount)
|
|
{
|
|
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
|
|
V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
|
|
|
|
v3dv_cmd_buffer_reset_queries(cmd_buffer, pool, firstQuery, queryCount);
|
|
}
|
|
|
|
VKAPI_ATTR void VKAPI_CALL
|
|
v3dv_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
|
|
VkQueryPool queryPool,
|
|
uint32_t firstQuery,
|
|
uint32_t queryCount,
|
|
VkBuffer dstBuffer,
|
|
VkDeviceSize dstOffset,
|
|
VkDeviceSize stride,
|
|
VkQueryResultFlags flags)
|
|
{
|
|
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
|
|
V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
|
|
V3DV_FROM_HANDLE(v3dv_buffer, dst, dstBuffer);
|
|
|
|
v3dv_cmd_buffer_copy_query_results(cmd_buffer, pool,
|
|
firstQuery, queryCount,
|
|
dst, dstOffset, stride, flags);
|
|
}
|
|
|
|
VKAPI_ATTR void VKAPI_CALL
|
|
v3dv_CmdBeginQuery(VkCommandBuffer commandBuffer,
|
|
VkQueryPool queryPool,
|
|
uint32_t query,
|
|
VkQueryControlFlags flags)
|
|
{
|
|
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
|
|
V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
|
|
|
|
v3dv_cmd_buffer_begin_query(cmd_buffer, pool, query, flags);
|
|
}
|
|
|
|
VKAPI_ATTR void VKAPI_CALL
|
|
v3dv_CmdEndQuery(VkCommandBuffer commandBuffer,
|
|
VkQueryPool queryPool,
|
|
uint32_t query)
|
|
{
|
|
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
|
|
V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
|
|
|
|
v3dv_cmd_buffer_end_query(cmd_buffer, pool, query);
|
|
}
|
|
|
|
void
|
|
v3dv_reset_query_pools(struct v3dv_device *device,
|
|
struct v3dv_query_pool *pool,
|
|
uint32_t first,
|
|
uint32_t count)
|
|
{
|
|
mtx_lock(&device->query_mutex);
|
|
|
|
for (uint32_t i = first; i < first + count; i++) {
|
|
assert(i < pool->query_count);
|
|
struct v3dv_query *q = &pool->queries[i];
|
|
q->maybe_available = false;
|
|
switch (pool->query_type) {
|
|
case VK_QUERY_TYPE_OCCLUSION: {
|
|
const uint8_t *q_addr = ((uint8_t *) q->bo->map) + q->offset;
|
|
uint32_t *counter = (uint32_t *) q_addr;
|
|
*counter = 0;
|
|
break;
|
|
}
|
|
case VK_QUERY_TYPE_TIMESTAMP:
|
|
q->value = 0;
|
|
break;
|
|
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
|
|
kperfmon_destroy(device, pool, i);
|
|
kperfmon_create(device, pool, i);
|
|
if (vk_sync_reset(&device->vk, q->perf.last_job_sync) != VK_SUCCESS)
|
|
fprintf(stderr, "Failed to reset sync");
|
|
break;
|
|
default:
|
|
unreachable("Unsupported query type");
|
|
}
|
|
}
|
|
|
|
mtx_unlock(&device->query_mutex);
|
|
}
|
|
|
|
VKAPI_ATTR void VKAPI_CALL
|
|
v3dv_ResetQueryPool(VkDevice _device,
|
|
VkQueryPool queryPool,
|
|
uint32_t firstQuery,
|
|
uint32_t queryCount)
|
|
{
|
|
V3DV_FROM_HANDLE(v3dv_device, device, _device);
|
|
V3DV_FROM_HANDLE(v3dv_query_pool, pool, queryPool);
|
|
|
|
v3dv_reset_query_pools(device, pool, firstQuery, queryCount);
|
|
}
|
|
|
|
/**
 * Enumerates the performance counters (and their descriptions) available
 * for a queue family, from the static v3dv_counters table.
 *
 * Follows the standard Vulkan two-call idiom via vk_outarray: with NULL
 * output arrays the counts are returned; otherwise up to *pCounterCount /
 * desc_count entries are written and VK_INCOMPLETE is returned if the
 * caller's array was too small.
 */
VKAPI_ATTR VkResult VKAPI_CALL
v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
   VkPhysicalDevice physicalDevice,
   uint32_t queueFamilyIndex,
   uint32_t *pCounterCount,
   VkPerformanceCounterKHR *pCounters,
   VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
{
   /* Snapshot the capacity for the description array before the outarray
    * for pCounters starts updating *pCounterCount.
    */
   uint32_t desc_count = *pCounterCount;

   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
                          out, pCounters, pCounterCount);
   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
                          out_desc, pCounterDescriptions, &desc_count);

   for (int i = 0; i < ARRAY_SIZE(v3dv_counters); i++) {
      vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
         counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
         counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
         counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;

         /* Derive a stable UUID from the counter name (truncated SHA-1). */
         unsigned char sha1_result[20];
         _mesa_sha1_compute(v3dv_counters[i][1], strlen(v3dv_counters[i][1]),
                            sha1_result);

         memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
      }

      vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
                               &out_desc, desc) {
         desc->flags = 0;
         snprintf(desc->name, sizeof(desc->name), "%s",
                  v3dv_counters[i][1]);
         snprintf(desc->category, sizeof(desc->category), "%s",
                  v3dv_counters[i][0]);
         snprintf(desc->description, sizeof(desc->description), "%s",
                  v3dv_counters[i][2]);
      }
   }

   return vk_outarray_status(&out);
}
VKAPI_ATTR void VKAPI_CALL
|
|
v3dv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
|
|
VkPhysicalDevice physicalDevice,
|
|
const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo,
|
|
uint32_t *pNumPasses)
|
|
{
|
|
*pNumPasses = DIV_ROUND_UP(pPerformanceQueryCreateInfo->counterIndexCount,
|
|
DRM_V3D_MAX_PERF_COUNTERS);
|
|
}
|
|
|
|
VKAPI_ATTR VkResult VKAPI_CALL
|
|
v3dv_AcquireProfilingLockKHR(
|
|
VkDevice _device,
|
|
const VkAcquireProfilingLockInfoKHR *pInfo)
|
|
{
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
/**
 * vkReleaseProfilingLockKHR entry point. Acquire is a no-op on this
 * implementation (see v3dv_AcquireProfilingLockKHR), so there is nothing
 * to release.
 */
VKAPI_ATTR void VKAPI_CALL
v3dv_ReleaseProfilingLockKHR(VkDevice device)
{
}