anv: implement VK_INTEL_performance_query

v2: Introduce the appropriate pipe controls
    Properly deal with changes in metric sets (using execbuf parameter)
    Record marker at query end

v3: Fill out PerfCntr1&2

v4: Introduce vkUninitializePerformanceApiINTEL

v5: Use new execbuf extension mechanism

v6: Fix comments in genX_query.c (Rafael)
    Use PIPE_CONTROL workarounds (Rafael)
    Refactor on the last kernel series update (Lionel)

v7: Only I915_PERF_IOCTL_CONFIG when perf stream is already opened (Lionel)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Rafael Antognolli <rafael.antognolli@intel.com>
This commit is contained in:
Lionel Landwerlin 2018-06-07 18:02:03 +01:00
parent 5ba6d9941b
commit 2b5f30b1d9
9 changed files with 536 additions and 19 deletions

View File

@ -305,6 +305,7 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \
libmesa_compiler \
libmesa_intel_common \
libmesa_intel_dev \
libmesa_intel_perf \
libmesa_vulkan_common \
libmesa_vulkan_util \
libmesa_anv_gen7 \

View File

@ -259,6 +259,7 @@ VULKAN_FILES := \
vulkan/anv_nir_lower_push_constants.c \
vulkan/anv_nir_lower_ycbcr_textures.c \
vulkan/anv_pass.c \
vulkan/anv_perf.c \
vulkan/anv_pipeline.c \
vulkan/anv_pipeline_cache.c \
vulkan/anv_private.h \

View File

@ -604,6 +604,8 @@ anv_physical_device_init(struct anv_physical_device *device,
goto fail;
}
device->perf = anv_get_perf(&device->info, fd);
anv_physical_device_get_supported_extensions(device,
&device->supported_extensions);
@ -625,6 +627,7 @@ anv_physical_device_finish(struct anv_physical_device *device)
anv_finish_wsi(device);
anv_physical_device_free_disk_cache(device);
ralloc_free(device->compiler);
ralloc_free(device->perf);
close(device->local_fd);
if (device->master_fd >= 0)
close(device->master_fd);
@ -2657,6 +2660,8 @@ VkResult anv_CreateDevice(
anv_device_init_border_colors(device);
anv_device_perf_init(device);
*pDevice = anv_device_to_handle(device);
return VK_SUCCESS;

View File

@ -165,6 +165,7 @@ EXTENSIONS = [
Extension('VK_ANDROID_native_buffer', 7, 'ANDROID'),
Extension('VK_GOOGLE_decorate_string', 1, True),
Extension('VK_GOOGLE_hlsl_functionality1', 1, True),
Extension('VK_INTEL_performance_query', 1, 'device->perf'),
Extension('VK_NV_compute_shader_derivatives', 1, True),
]

224
src/intel/vulkan/anv_perf.c Normal file
View File

@ -0,0 +1,224 @@
/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include "anv_private.h"
#include "perf/gen_perf.h"
#include "perf/gen_perf_mdapi.h"
/* Build the i915-perf metrics configuration for this physical device.
 *
 * Returns a ralloc'ed gen_perf_config owned by the caller (freed with
 * ralloc_free(), see anv_physical_device_finish), or NULL when the kernel's
 * perf interface is too old to support the extension.
 */
struct gen_perf_config *
anv_get_perf(const struct gen_device_info *devinfo, int fd)
{
   struct gen_perf_config *perf = gen_perf_new(NULL);

   gen_perf_init_metrics(perf, devinfo, fd);

   /* We need DRM_I915_PERF_PROP_HOLD_PREEMPTION support, only available in
    * perf revision 3 (the i915 perf ioctl version that introduced the
    * HOLD_PREEMPTION property), hence the < 3 check below.
    */
   if (anv_gem_get_param(fd, I915_PARAM_PERF_REVISION) < 3)
      goto err;

   return perf;

err:
   ralloc_free(perf);
   return NULL;
}
/* Initialize the logical device's i915-perf state.
 *
 * No perf stream is opened here; that happens lazily in
 * anv_QueueSetPerformanceConfigurationINTEL.
 */
void
anv_device_perf_init(struct anv_device *device)
{
   device->perf_fd = -1;
   /* anv_private.h documents perf_metric as "0 if unset"; establish that
    * invariant explicitly since the device struct is not zero-allocated.
    */
   device->perf_metric = 0;
}
static int
anv_device_perf_open(struct anv_device *device, uint64_t metric_id)
{
uint64_t properties[DRM_I915_PERF_PROP_MAX * 2];
struct drm_i915_perf_open_param param;
int p = 0, stream_fd;
properties[p++] = DRM_I915_PERF_PROP_SAMPLE_OA;
properties[p++] = true;
properties[p++] = DRM_I915_PERF_PROP_OA_METRICS_SET;
properties[p++] = metric_id;
properties[p++] = DRM_I915_PERF_PROP_OA_FORMAT;
properties[p++] = device->info.gen >= 8 ?
I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
I915_OA_FORMAT_A45_B8_C8;
properties[p++] = DRM_I915_PERF_PROP_OA_EXPONENT;
properties[p++] = 31; /* slowest sampling period */
properties[p++] = DRM_I915_PERF_PROP_CTX_HANDLE;
properties[p++] = device->context_id;
properties[p++] = DRM_I915_PERF_PROP_HOLD_PREEMPTION;
properties[p++] = true;
memset(&param, 0, sizeof(param));
param.flags = 0;
param.flags |= I915_PERF_FLAG_FD_CLOEXEC | I915_PERF_FLAG_FD_NONBLOCK;
param.properties_ptr = (uintptr_t)properties;
param.num_properties = p / 2;
stream_fd = gen_ioctl(device->fd, DRM_IOCTL_I915_PERF_OPEN, &param);
return stream_fd;
}
/* Nothing to set up: the perf stream is opened lazily by
 * vkQueueSetPerformanceConfigurationINTEL. Just report whether the
 * extension is usable on this physical device.
 */
VkResult anv_InitializePerformanceApiINTEL(
    VkDevice                                    _device,
    const VkInitializePerformanceApiInfoINTEL*  pInitializeInfo)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice =
      &device->instance->physicalDevice;

   return pdevice->perf ? VK_SUCCESS : VK_ERROR_EXTENSION_NOT_PRESENT;
}
/* Report implementation parameters of VK_INTEL_performance_query. */
VkResult anv_GetPerformanceParameterINTEL(
    VkDevice                                    _device,
    VkPerformanceParameterTypeINTEL             parameter,
    VkPerformanceValueINTEL*                    pValue)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice =
      &device->instance->physicalDevice;

   if (!pdevice->perf)
      return VK_ERROR_EXTENSION_NOT_PRESENT;

   switch (parameter) {
   case VK_PERFORMANCE_PARAMETER_TYPE_HW_COUNTERS_SUPPORTED_INTEL:
      pValue->type = VK_PERFORMANCE_VALUE_TYPE_BOOL_INTEL;
      pValue->data.valueBool = VK_TRUE;
      return VK_SUCCESS;

   case VK_PERFORMANCE_PARAMETER_TYPE_STREAM_MARKER_VALID_BITS_INTEL:
      pValue->type = VK_PERFORMANCE_VALUE_TYPE_UINT32_INTEL;
      pValue->data.value32 = 25;
      return VK_SUCCESS;

   default:
      return VK_ERROR_FEATURE_NOT_PRESENT;
   }
}
/* Record the application's marker value on the command buffer; it is later
 * written into the query slot by genX(CmdEndQueryIndexedEXT) for
 * VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL queries.
 */
VkResult anv_CmdSetPerformanceMarkerINTEL(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceMarkerInfoINTEL*         pMarkerInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   cmd_buffer->intel_perf_marker = pMarkerInfo->marker;

   return VK_SUCCESS;
}
/* Load the MDAPI register configuration and register it with the kernel.
 *
 * The returned configuration handle is the kernel metric set id, passed
 * back verbatim in Release/QueueSet below.
 */
VkResult anv_AcquirePerformanceConfigurationINTEL(
    VkDevice                                    _device,
    const VkPerformanceConfigurationAcquireInfoINTEL* pAcquireInfo,
    VkPerformanceConfigurationINTEL*            pConfiguration)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice =
      &device->instance->physicalDevice;

   struct gen_perf_registers *perf_config =
      gen_perf_load_configuration(pdevice->perf, device->fd,
                                  GEN_PERF_QUERY_GUID_MDAPI);
   if (!perf_config)
      return VK_INCOMPLETE;

   int ret = gen_perf_store_configuration(pdevice->perf, device->fd,
                                          perf_config, NULL /* guid */);

   /* The registers have been handed to the kernel; the CPU-side copy is no
    * longer needed. Previously it was only freed on the error path, leaking
    * it on success.
    */
   ralloc_free(perf_config);

   if (ret < 0)
      return VK_INCOMPLETE;

   *pConfiguration = (VkPerformanceConfigurationINTEL) (uint64_t) ret;

   return VK_SUCCESS;
}
/* Drop the kernel metric set acquired in
 * anv_AcquirePerformanceConfigurationINTEL. The handle is the kernel's
 * metric set id, so it can be passed straight to the remove-config ioctl.
 * The ioctl result is deliberately ignored (best-effort cleanup).
 */
VkResult anv_ReleasePerformanceConfigurationINTEL(
    VkDevice                                    _device,
    VkPerformanceConfigurationINTEL             _configuration)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   uint64_t config = (uint64_t) _configuration;

   gen_ioctl(device->fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &config);

   return VK_SUCCESS;
}
/* Select the metric set used by subsequent submissions on this queue.
 *
 * Opens the i915-perf stream on first use; afterwards switches the already
 * opened stream to the new metric set with I915_PERF_IOCTL_CONFIG.
 */
VkResult anv_QueueSetPerformanceConfigurationINTEL(
    VkQueue                                     _queue,
    VkPerformanceConfigurationINTEL             _configuration)
{
   ANV_FROM_HANDLE(anv_queue, queue, _queue);
   struct anv_device *device = queue->device;
   uint64_t configuration = (uint64_t) _configuration;

   if (device->perf_fd < 0) {
      device->perf_fd = anv_device_perf_open(device, configuration);
      if (device->perf_fd < 0)
         return VK_ERROR_INITIALIZATION_FAILED;
   } else {
      int ret = gen_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG,
                          (void *)(uintptr_t) configuration);
      if (ret < 0) {
         /* gen_ioctl() returns -1 and sets errno on failure; report errno
          * rather than passing the -1 return value to strerror().
          */
         return anv_device_set_lost(device,
                                    "i915-perf config failed: %s",
                                    strerror(errno));
      }
   }

   return VK_SUCCESS;
}
/* Close the i915-perf stream (if one was opened) and reset the device's
 * perf state back to its initial "no stream" value.
 */
void anv_UninitializePerformanceApiINTEL(
    VkDevice                                    _device)
{
   ANV_FROM_HANDLE(anv_device, device, _device);

   if (device->perf_fd < 0)
      return;

   close(device->perf_fd);
   device->perf_fd = -1;
}

View File

@ -74,6 +74,7 @@ struct anv_image_view;
struct anv_instance;
struct gen_l3_config;
struct gen_perf_config;
#include <vulkan/vulkan.h>
#include <vulkan/vulkan_intel.h>
@ -948,6 +949,7 @@ struct anv_physical_device {
bool supports_48bit_addresses;
struct brw_compiler * compiler;
struct isl_device isl_dev;
struct gen_perf_config * perf;
int cmd_parser_version;
bool has_exec_async;
bool has_exec_capture;
@ -1169,6 +1171,9 @@ struct anv_device {
* the cmd_buffer's list.
*/
struct anv_cmd_buffer *cmd_buffer_being_decoded;
int perf_fd; /* -1 if no opened */
uint64_t perf_metric; /* 0 if unset */
};
static inline struct anv_state_pool *
@ -2530,6 +2535,9 @@ struct anv_cmd_buffer {
VkCommandBufferLevel level;
struct anv_cmd_state state;
/* Set by SetPerformanceMarkerINTEL, written into queries by CmdBeginQuery */
uint64_t intel_perf_marker;
};
VkResult anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer);
@ -3750,6 +3758,9 @@ anv_get_subpass_id(const struct anv_cmd_state * const cmd_state)
return subpass_id;
}
struct gen_perf_config *anv_get_perf(const struct gen_device_info *devinfo, int fd);
void anv_device_perf_init(struct anv_device *device);
#define ANV_DEFINE_HANDLE_CASTS(__anv_type, __VkType) \
\
static inline struct __anv_type * \

View File

@ -5091,3 +5091,57 @@ void genX(CmdWaitEvents)(
bufferMemoryBarrierCount, pBufferMemoryBarriers,
imageMemoryBarrierCount, pImageMemoryBarriers);
}
/* Apply a VK_INTEL_performance_query override to the command stream. */
VkResult genX(CmdSetPerformanceOverrideINTEL)(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceOverrideInfoINTEL*       pOverrideInfo)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);

   switch (pOverrideInfo->type) {
   case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
      uint32_t dw;

      /* Toggle 3D & media instruction execution via LRI. Gen9+ exposes
       * these disable bits in CS_DEBUG_MODE2; older gens use INSTPM. The
       * *Mask fields make the write affect only the targeted bits.
       */
#if GEN_GEN >= 9
      anv_pack_struct(&dw, GENX(CS_DEBUG_MODE2),
                      ._3DRenderingInstructionDisable = pOverrideInfo->enable,
                      .MediaInstructionDisable = pOverrideInfo->enable,
                      ._3DRenderingInstructionDisableMask = true,
                      .MediaInstructionDisableMask = true);
      emit_lri(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2_num), dw);
#else
      anv_pack_struct(&dw, GENX(INSTPM),
                      ._3DRenderingInstructionDisable = pOverrideInfo->enable,
                      .MediaInstructionDisable = pOverrideInfo->enable,
                      ._3DRenderingInstructionDisableMask = true,
                      .MediaInstructionDisableMask = true);
      emit_lri(&cmd_buffer->batch, GENX(INSTPM_num), dw);
#endif
      break;
   }

   case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
      if (pOverrideInfo->enable) {
         /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
         cmd_buffer->state.pending_pipe_bits |=
            ANV_PIPE_FLUSH_BITS |
            ANV_PIPE_INVALIDATE_BITS;
         genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
      }
      break;

   default:
      unreachable("Invalid override");
   }

   return VK_SUCCESS;
}
/* Stub: stream markers are not yet emitted into the i915-perf stream. */
VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
    VkCommandBuffer                             commandBuffer,
    const VkPerformanceStreamMarkerInfoINTEL*   pMarkerInfo)
{
   /* TODO: Waiting on the register to write, might depend on generation. */
   return VK_SUCCESS;
}

View File

@ -37,6 +37,10 @@
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#include "common/gen_mi_builder.h"
#include "perf/gen_perf.h"
#include "perf/gen_perf_mdapi.h"
#define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))
VkResult genX(CreateQueryPool)(
VkDevice _device,
@ -52,9 +56,14 @@ VkResult genX(CreateQueryPool)(
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
/* Query pool slots are made up of some number of 64-bit values packed
* tightly together. The first 64-bit value is always the "available" bit
* which is 0 when the query is unavailable and 1 when it is available.
* The 64-bit values that follow are determined by the type of query.
tightly together. For most query types, the first 64-bit value is
* the "available" bit which is 0 when the query is unavailable and 1 when
* it is available. The 64-bit values that follow are determined by the
* type of query.
*
* For performance queries, we have a requirement to align OA reports at
* 64 bytes, so we put those first and place the "available" bit at the end
* of the slot, together with some other counters.
*/
uint32_t uint64s_per_slot = 1;
@ -84,6 +93,15 @@ VkResult genX(CreateQueryPool)(
*/
uint64s_per_slot += 4;
break;
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
uint64s_per_slot = 2 * OA_REPORT_N_UINT64; /* begin & end OA reports */
uint64s_per_slot += 4; /* PerfCounter 1 & 2 */
uint64s_per_slot++; /* 2 * 32bit RPSTAT register */
uint64s_per_slot++; /* 64bit marker */
uint64s_per_slot++; /* availability */
uint64s_per_slot = align_u32(uint64s_per_slot, 8); /* OA reports must be aligned to 64 bytes */
break;
}
default:
assert(!"Invalid query type");
}
@ -160,6 +178,57 @@ anv_query_address(struct anv_query_pool *pool, uint32_t query)
};
}
/**
* VK_INTEL_performance_query layout:
*
* ------------------------------
* | end MI_RPC (256b) |
* |----------------------------|
* | begin MI_RPC (256b) |
* |----------------------------|
* | begin perfcntr 1 & 2 (16b) |
* |----------------------------|
* | end perfcntr 1 & 2 (16b) |
* |----------------------------|
* | begin RPSTAT register (4b) |
* |----------------------------|
* | end RPSTAT register (4b) |
* |----------------------------|
* | marker (8b) |
* |----------------------------|
* | availability (8b) |
* ------------------------------
*/
/* Byte offset of the begin/end MI_REPORT_PERF_COUNT report in a performance
 * query slot. The end report sits at the start of the slot so its first
 * dword can be checked for readiness (see CmdEndQueryIndexedEXT).
 */
static uint32_t
intel_perf_mi_rpc_offset(bool end)
{
   if (end)
      return 0;
   return 256;
}
/* Byte offset of the begin/end PerfCntr1&2 values in a performance query
 * slot: right after the two 256-byte OA reports, begin values first.
 */
static uint32_t
intel_perf_counter(bool end)
{
   const uint32_t base = 2 * 256; /* two MI_RPC reports */
   return base + (end ? 2 * sizeof(uint64_t) : 0);
}
/* Byte offset of the begin/end 32-bit RPSTAT snapshot in a performance
 * query slot, placed after the 4 PerfCntr 1&2 64-bit values.
 */
static uint32_t
intel_perf_rpstart_offset(bool end)
{
   return intel_perf_counter(false) +
          4 * sizeof(uint64_t) +
          (end ? sizeof(uint32_t) : 0);
}
/* Byte offset of the 64-bit marker value in a performance query slot,
 * directly after the two 32-bit RPSTAT snapshots.
 */
static uint32_t
intel_perf_marker_offset(void)
{
   return intel_perf_rpstart_offset(false) + 2 * sizeof(uint32_t);
}
static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
uint32_t value_index, uint64_t result)
@ -173,18 +242,28 @@ cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
}
}
static bool
query_is_available(uint64_t *slot)
static void *
query_slot(struct anv_query_pool *pool, uint32_t query)
{
return *(volatile uint64_t *)slot;
return pool->bo.map + query * pool->stride;
}
static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
return *(volatile uint64_t *)((uint8_t *)query_slot(pool, query) +
pool->stride - 8);
} else
return *(volatile uint64_t *)query_slot(pool, query);
}
static VkResult
wait_for_available(struct anv_device *device,
struct anv_query_pool *pool, uint64_t *slot)
struct anv_query_pool *pool, uint32_t query)
{
while (true) {
if (query_is_available(slot))
if (query_is_available(pool, query))
return VK_SUCCESS;
int ret = anv_gem_busy(device, pool->bo.gem_handle);
@ -197,7 +276,7 @@ wait_for_available(struct anv_device *device,
} else {
assert(ret == 0);
/* The BO is no longer busy. */
if (query_is_available(slot)) {
if (query_is_available(pool, query)) {
return VK_SUCCESS;
} else {
VkResult status = anv_device_query_status(device);
@ -233,7 +312,8 @@ VkResult genX(GetQueryPoolResults)(
assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
pool->type == VK_QUERY_TYPE_TIMESTAMP ||
pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT);
pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
if (anv_device_is_lost(device))
return VK_ERROR_DEVICE_LOST;
@ -245,13 +325,10 @@ VkResult genX(GetQueryPoolResults)(
VkResult status = VK_SUCCESS;
for (uint32_t i = 0; i < queryCount; i++) {
uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;
/* Availability is always at the start of the slot */
bool available = slot[0];
bool available = query_is_available(pool, firstQuery + i);
if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
status = wait_for_available(device, pool, slot);
status = wait_for_available(device, pool, firstQuery + i);
if (status != VK_SUCCESS)
return status;
@ -271,13 +348,16 @@ VkResult genX(GetQueryPoolResults)(
uint32_t idx = 0;
switch (pool->type) {
case VK_QUERY_TYPE_OCCLUSION:
case VK_QUERY_TYPE_OCCLUSION: {
uint64_t *slot = query_slot(pool, firstQuery + i);
if (write_results)
cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
idx++;
break;
}
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
uint64_t *slot = query_slot(pool, firstQuery + i);
uint32_t statistics = pool->pipeline_statistics;
while (statistics) {
uint32_t stat = u_bit_scan(&statistics);
@ -297,7 +377,8 @@ VkResult genX(GetQueryPoolResults)(
break;
}
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
uint64_t *slot = query_slot(pool, firstQuery + i);
if (write_results)
cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
idx++;
@ -305,12 +386,54 @@ VkResult genX(GetQueryPoolResults)(
cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
idx++;
break;
}
case VK_QUERY_TYPE_TIMESTAMP:
case VK_QUERY_TYPE_TIMESTAMP: {
uint64_t *slot = query_slot(pool, firstQuery + i);
if (write_results)
cpu_write_query_result(pData, flags, idx, slot[1]);
idx++;
break;
}
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
   if (!write_results)
      break;
   const void *query_data = query_slot(pool, firstQuery + i);
   const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
   const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
   const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
   /* Fixed: this previously read intel_perf_mi_rpc_offset(true) — the
    * first dword of the end OA report (the 0xdeadbeef report id) — instead
    * of the end RPSTAT value that CmdEndQueryIndexedEXT stores at
    * intel_perf_rpstart_offset(true).
    */
   const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
   struct gen_perf_query_result result;
   struct gen_perf_query_info metric = {
      .oa_format = (GEN_GEN >= 8 ?
                    I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
                    I915_OA_FORMAT_A45_B8_C8),
   };
   /* Decode the GT frequency from the RPSTAT snapshots; the field moved
    * between generations.
    */
   uint32_t core_freq[2];
#if GEN_GEN < 9
   core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
   core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
#else
   core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
   core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
#endif
   gen_perf_query_result_clear(&result);
   gen_perf_query_result_accumulate(&result, &metric,
                                    oa_begin, oa_end);
   gen_perf_query_result_read_frequencies(&result, &device->info,
                                          oa_begin, oa_end);
   gen_perf_query_result_write_mdapi(pData, stride,
                                     &device->info,
                                     &result,
                                     core_freq[0], core_freq[1]);
   gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
                                       query_data + intel_perf_counter(false),
                                       query_data + intel_perf_counter(true));
   const uint64_t *marker = query_data + intel_perf_marker_offset();
   gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
   break;
}
default:
unreachable("invalid pool type");
@ -406,6 +529,16 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
}
break;
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
for (uint32_t i = 0; i < num_queries; i++) {
struct anv_address slot_addr =
anv_query_address(pool, first_index + i);
gen_mi_memset(b, slot_addr, 0, pool->stride - 8);
emit_query_mi_availability(b, anv_address_add(slot_addr,
pool->stride - 8), true);
}
break;
default:
unreachable("Unsupported query type");
}
@ -440,6 +573,21 @@ void genX(CmdResetQueryPool)(
break;
}
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
struct gen_mi_builder b;
gen_mi_builder_init(&b, &cmd_buffer->batch);
for (uint32_t i = 0; i < queryCount; i++) {
emit_query_mi_availability(
&b,
anv_address_add(
anv_query_address(pool, firstQuery + i),
pool->stride - 8),
false);
}
break;
}
default:
unreachable("Unsupported query type");
}
@ -550,6 +698,37 @@ void genX(CmdBeginQueryIndexedEXT)(
emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
break;
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
pc.CommandStreamerStallEnable = true;
pc.StallAtPixelScoreboard = true;
}
anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
rpc.MemoryAddress =
anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
}
#if GEN_GEN < 9
gen_mi_store(&b,
gen_mi_mem32(anv_address_add(query_addr,
intel_perf_rpstart_offset(false))),
gen_mi_reg32(GENX(RPSTAT1_num)));
#else
gen_mi_store(&b,
gen_mi_mem32(anv_address_add(query_addr,
intel_perf_rpstart_offset(false))),
gen_mi_reg32(GENX(RPSTAT0_num)));
#endif
#if GEN_GEN >= 8 && GEN_GEN <= 11
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
intel_perf_counter(false))),
gen_mi_reg64(GENX(PERFCNT1_num)));
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
intel_perf_counter(false) + 8)),
gen_mi_reg64(GENX(PERFCNT2_num)));
#endif
break;
}
default:
unreachable("");
}
@ -611,6 +790,45 @@ void genX(CmdEndQueryIndexedEXT)(
emit_query_mi_availability(&b, query_addr, true);
break;
case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
pc.CommandStreamerStallEnable = true;
pc.StallAtPixelScoreboard = true;
}
uint32_t marker_offset = intel_perf_marker_offset();
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
gen_mi_imm(cmd_buffer->intel_perf_marker));
#if GEN_GEN >= 8 && GEN_GEN <= 11
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
gen_mi_reg64(GENX(PERFCNT1_num)));
gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
gen_mi_reg64(GENX(PERFCNT2_num)));
#endif
#if GEN_GEN < 9
gen_mi_store(&b,
gen_mi_mem32(anv_address_add(query_addr,
intel_perf_rpstart_offset(true))),
gen_mi_reg32(GENX(RPSTAT1_num)));
#else
gen_mi_store(&b,
gen_mi_mem32(anv_address_add(query_addr,
intel_perf_rpstart_offset(true))),
gen_mi_reg32(GENX(RPSTAT0_num)));
#endif
/* Position the last OA snapshot at the beginning of the query so that
* we can tell whether it's ready.
*/
anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
rpc.MemoryAddress = anv_address_add(query_addr,
intel_perf_mi_rpc_offset(true));
rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
}
emit_query_mi_availability(&b,
anv_address_add(query_addr, pool->stride - 8),
true);
break;
}
default:
unreachable("");
}

View File

@ -118,6 +118,7 @@ libanv_files = files(
'anv_nir_lower_push_constants.c',
'anv_nir_lower_ycbcr_textures.c',
'anv_pass.c',
'anv_perf.c',
'anv_pipeline.c',
'anv_pipeline_cache.c',
'anv_private.h',
@ -194,6 +195,7 @@ libvulkan_intel = shared_library(
link_whole : [libanv_common, libanv_gen_libs],
link_with : [
libintel_compiler, libintel_dev, libisl, libblorp, libvulkan_wsi,
libintel_perf,
],
dependencies : [
dep_thread, dep_dl, dep_m, anv_deps, idep_libintel_common,
@ -227,7 +229,7 @@ if with_tests
link_whole : libanv_common,
link_with : [
libanv_gen_libs, libintel_compiler, libintel_common, libintel_dev,
libisl, libblorp, libvulkan_wsi,
libisl, libblorp, libvulkan_wsi, libintel_perf,
],
dependencies : [
dep_thread, dep_dl, dep_m, anv_deps,