radv: Implement support for querying performance counters.
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16879>
This commit is contained in:
parent
439e4b5c88
commit
eb669b94ee
|
@ -37,6 +37,19 @@ radv_perfcounter_emit_shaders(struct radeon_cmdbuf *cs, unsigned shaders)
|
|||
radeon_emit(cs, 0xffffffff);
|
||||
}
|
||||
|
||||
static void
|
||||
radv_emit_windowed_counters(struct radv_device *device, struct radeon_cmdbuf *cs, int family,
|
||||
bool enable)
|
||||
{
|
||||
if (family == RADV_QUEUE_GENERAL) {
|
||||
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
|
||||
radeon_emit(cs, EVENT_TYPE(enable ? V_028A90_PERFCOUNTER_START : V_028A90_PERFCOUNTER_STOP) |
|
||||
EVENT_INDEX(0));
|
||||
}
|
||||
|
||||
radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(enable));
|
||||
}
|
||||
|
||||
void
|
||||
radv_perfcounter_emit_spm_reset(struct radeon_cmdbuf *cs)
|
||||
{
|
||||
|
@ -53,25 +66,13 @@ radv_perfcounter_emit_spm_start(struct radv_device *device, struct radeon_cmdbuf
|
|||
S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) |
|
||||
S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING));
|
||||
|
||||
/* Start windowed performance counters. */
|
||||
if (family == RADV_QUEUE_GENERAL) {
|
||||
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
|
||||
radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0));
|
||||
}
|
||||
radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(1));
|
||||
radv_emit_windowed_counters(device, cs, family, true);
|
||||
}
|
||||
|
||||
void
|
||||
radv_perfcounter_emit_spm_stop(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
|
||||
{
|
||||
/* Stop windowed performance counters. */
|
||||
if (family == RADV_QUEUE_GENERAL) {
|
||||
if (!device->physical_device->rad_info.never_send_perfcounter_stop) {
|
||||
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
|
||||
radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0));
|
||||
}
|
||||
}
|
||||
radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(0));
|
||||
radv_emit_windowed_counters(device, cs, family, false);
|
||||
|
||||
/* Stop SPM counters. */
|
||||
radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
|
||||
|
@ -391,3 +392,446 @@ radv_get_num_counter_passes(const struct radv_physical_device *pdevice, unsigned
|
|||
|
||||
return passes_needed;
|
||||
}
|
||||
|
||||
void
|
||||
radv_pc_deinit_query_pool(struct radv_pc_query_pool *pool)
|
||||
{
|
||||
free(pool->counters);
|
||||
free(pool->pc_regs);
|
||||
}
|
||||
|
||||
VkResult
|
||||
radv_pc_init_query_pool(struct radv_physical_device *pdevice,
|
||||
const VkQueryPoolCreateInfo *pCreateInfo, struct radv_pc_query_pool *pool)
|
||||
{
|
||||
const VkQueryPoolPerformanceCreateInfoKHR *perf_info =
|
||||
vk_find_struct_const(pCreateInfo->pNext, QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
|
||||
VkResult result;
|
||||
|
||||
if (!radv_init_perfcounter_descs(pdevice))
|
||||
return VK_ERROR_OUT_OF_HOST_MEMORY;
|
||||
|
||||
result =
|
||||
radv_get_counter_registers(pdevice, perf_info->counterIndexCount, perf_info->pCounterIndices,
|
||||
&pool->num_pc_regs, &pool->pc_regs);
|
||||
if (result != VK_SUCCESS)
|
||||
return result;
|
||||
|
||||
pool->num_passes = radv_get_num_counter_passes(pdevice, pool->num_pc_regs, pool->pc_regs);
|
||||
|
||||
uint32_t *pc_reg_offsets = malloc(pool->num_pc_regs * sizeof(uint32_t));
|
||||
if (!pc_reg_offsets)
|
||||
return VK_ERROR_OUT_OF_HOST_MEMORY;
|
||||
|
||||
unsigned offset = 0;
|
||||
for (unsigned i = 0; i < pool->num_pc_regs; ++i) {
|
||||
enum ac_pc_gpu_block block = pool->pc_regs[i] >> 16;
|
||||
struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
|
||||
unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block);
|
||||
|
||||
pc_reg_offsets[i] = S_REG_OFFSET(offset) | S_REG_INSTANCES(num_instances);
|
||||
offset += sizeof(uint64_t) * 2 * num_instances;
|
||||
}
|
||||
|
||||
/* allow an uint32_t per pass to signal completion. */
|
||||
pool->b.stride = offset + 8 * pool->num_passes;
|
||||
|
||||
pool->num_counters = perf_info->counterIndexCount;
|
||||
pool->counters = malloc(pool->num_counters * sizeof(struct radv_perfcounter_impl));
|
||||
if (!pool->counters) {
|
||||
free(pc_reg_offsets);
|
||||
return VK_ERROR_OUT_OF_HOST_MEMORY;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < pool->num_counters; ++i) {
|
||||
pool->counters[i] = pdevice->perfcounters[perf_info->pCounterIndices[i]].impl;
|
||||
|
||||
for (unsigned j = 0; j < ARRAY_SIZE(pool->counters[i].regs); ++j) {
|
||||
uint32_t reg = pool->counters[i].regs[j];
|
||||
if (!reg || G_REG_CONSTANT(reg))
|
||||
continue;
|
||||
|
||||
unsigned k;
|
||||
for (k = 0; k < pool->num_pc_regs; ++k)
|
||||
if (pool->pc_regs[k] == reg)
|
||||
break;
|
||||
pool->counters[i].regs[j] = pc_reg_offsets[k];
|
||||
}
|
||||
}
|
||||
|
||||
free(pc_reg_offsets);
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
static void
|
||||
radv_emit_instance(struct radv_cmd_buffer *cmd_buffer, int se, int instance)
|
||||
{
|
||||
struct radeon_cmdbuf *cs = cmd_buffer->cs;
|
||||
unsigned value = S_030800_SH_BROADCAST_WRITES(1);
|
||||
|
||||
if (se >= 0) {
|
||||
value |= S_030800_SE_INDEX(se);
|
||||
} else {
|
||||
value |= S_030800_SE_BROADCAST_WRITES(1);
|
||||
}
|
||||
|
||||
if (instance >= 0) {
|
||||
value |= S_030800_INSTANCE_INDEX(instance);
|
||||
} else {
|
||||
value |= S_030800_INSTANCE_BROADCAST_WRITES(1);
|
||||
}
|
||||
|
||||
radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value);
|
||||
}
|
||||
|
||||
static void
|
||||
radv_emit_select(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count,
|
||||
unsigned *selectors)
|
||||
{
|
||||
struct ac_pc_block_base *regs = block->b->b;
|
||||
struct radeon_cmdbuf *cs = cmd_buffer->cs;
|
||||
unsigned idx;
|
||||
|
||||
assert(count <= regs->num_counters);
|
||||
|
||||
/* Fake counters. */
|
||||
if (!regs->select0)
|
||||
return;
|
||||
|
||||
for (idx = 0; idx < count; ++idx) {
|
||||
radeon_set_perfctr_reg(cmd_buffer, regs->select0[idx],
|
||||
G_REG_SEL(selectors[idx]) | regs->select_or);
|
||||
}
|
||||
|
||||
for (idx = 0; idx < regs->num_spm_counters; idx++) {
|
||||
radeon_set_uconfig_reg_seq(cs, regs->select1[idx], 1);
|
||||
radeon_emit(cs, 0);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
radv_pc_emit_block_instance_read(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block,
|
||||
unsigned count, uint64_t va)
|
||||
{
|
||||
struct ac_pc_block_base *regs = block->b->b;
|
||||
struct radeon_cmdbuf *cs = cmd_buffer->cs;
|
||||
unsigned reg = regs->counter0_lo;
|
||||
unsigned reg_delta = 8;
|
||||
|
||||
assert(regs->select0);
|
||||
for (unsigned idx = 0; idx < count; ++idx) {
|
||||
if (regs->counters)
|
||||
reg = regs->counters[idx];
|
||||
|
||||
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
|
||||
radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
|
||||
COPY_DATA_WR_CONFIRM | COPY_DATA_COUNT_SEL); /* 64 bits */
|
||||
radeon_emit(cs, reg >> 2);
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
|
||||
va += sizeof(uint64_t) * 2 *
|
||||
radv_pc_get_num_instances(cmd_buffer->device->physical_device, block);
|
||||
reg += reg_delta;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
radv_pc_sample_block(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count,
|
||||
uint64_t va)
|
||||
{
|
||||
unsigned se_end = 1;
|
||||
if (block->b->b->flags & AC_PC_BLOCK_SE)
|
||||
se_end = cmd_buffer->device->physical_device->rad_info.max_se;
|
||||
|
||||
for (unsigned se = 0; se < se_end; ++se) {
|
||||
for (unsigned instance = 0; instance < block->num_instances; ++instance) {
|
||||
radv_emit_instance(cmd_buffer, se, instance);
|
||||
radv_pc_emit_block_instance_read(cmd_buffer, block, count, va);
|
||||
va += sizeof(uint64_t) * 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
radv_pc_wait_idle(struct radv_cmd_buffer *cmd_buffer)
|
||||
{
|
||||
struct radeon_cmdbuf *cs = cmd_buffer->cs;
|
||||
|
||||
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
|
||||
radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4)));
|
||||
|
||||
radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
|
||||
radeon_emit(cs, 0); /* CP_COHER_CNTL */
|
||||
radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
|
||||
radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */
|
||||
radeon_emit(cs, 0); /* CP_COHER_BASE */
|
||||
radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
|
||||
radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
|
||||
radeon_emit(cs, 0); /* GCR_CNTL */
|
||||
|
||||
radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
|
||||
radeon_emit(cs, 0);
|
||||
}
|
||||
|
||||
/* Stop the performance counters and read their current values into the
 * query's result buffer at `va`.
 *
 * Used both at begin (end = false, sampling the "begin" slots) and at end
 * (end = true, sampling the "end" slots at +8 and writing per-pass
 * completion markers at the tail of the result stride).
 *
 * Each pass's reads are wrapped in a COND_EXEC predicated on the per-pass
 * word in perf_counter_bo, so only the counters belonging to the pass being
 * replayed are sampled.
 */
static void
radv_pc_stop_and_sample(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool,
                        uint64_t va, bool end)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;

   /* Latch the current counter values so they can be read via COPY_DATA. */
   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0));

   radv_pc_wait_idle(cmd_buffer);

   /* Broadcast to all SEs/instances before touching global counter state. */
   radv_emit_instance(cmd_buffer, -1, -1);
   radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, false);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) |
                             S_036020_PERFMON_SAMPLE_ENABLE(1));

   for (unsigned pass = 0; pass < pool->num_passes; ++pass) {
      /* Predicate word for this pass, written by the submission logic. */
      uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) +
                         PERF_CTR_BO_PASS_OFFSET + 8 * pass;
      /* End samples live 8 bytes after the matching begin sample. */
      uint64_t reg_va = va + (end ? 8 : 0);

      radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);
      radeon_emit(cs, 0); /* Cache policy */

      /* Placeholder for the number of dwords COND_EXEC skips; patched once
       * the pass body has been emitted. */
      uint32_t *skip_dwords = cs->buf + cs->cdw;
      radeon_emit(cs, 0);

      /* Walk pc_regs in runs that share the same hardware block. */
      for (unsigned i = 0; i < pool->num_pc_regs;) {
         enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]);
         struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
         /* Index of the first register of this block handled by this pass
          * — NOTE(review): scaled by num_instances of the block; verify
          * against radv_get_num_counter_passes' pass assignment. */
         unsigned offset = ac_block->num_instances * pass;
         unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block);

         unsigned cnt = 1;
         while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt]))
            ++cnt;

         if (offset < cnt) {
            /* Sample only the registers this pass is responsible for,
             * capped by how many counters the block exposes. */
            unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters);
            radv_pc_sample_block(cmd_buffer, ac_block, pass_reg_cnt,
                                 reg_va + offset * num_instances * sizeof(uint64_t));
         }

         i += cnt;
         reg_va += num_instances * sizeof(uint64_t) * 2 * cnt;
      }

      if (end) {
         /* Mark this pass's results as available; slots are laid out from
          * the end of the stride backwards (see pool->b.stride setup). */
         uint64_t signal_va = va + pool->b.stride - 8 - 8 * pass;
         radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
         radeon_emit(cs,
                     S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
         radeon_emit(cs, signal_va);
         radeon_emit(cs, signal_va >> 32);
         radeon_emit(cs, 1); /* value */
      }

      /* Patch the COND_EXEC skip count now that the pass body size is known. */
      *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1;
   }

   /* Restore broadcast selection. */
   radv_emit_instance(cmd_buffer, -1, -1);
}
|
||||
|
||||
/* Emit the command-stream prologue of a performance query: reset the fence,
 * quiesce the GPU, program the counter selectors for every pass (each
 * guarded by a COND_EXEC on the per-pass predicate), take the "begin"
 * sample, and start counting.
 */
void
radv_pc_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool,
                    uint64_t va)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
   ASSERTED unsigned cdw_max;

   cmd_buffer->state.uses_perf_counters = true;

   /* Worst-case space estimate for everything emitted below. */
   cdw_max = radeon_check_space(cmd_buffer->device->ws, cs,
                                256 + /* Random one time stuff */
                                   10 * pool->num_passes + /* COND_EXECs */
                                   pool->b.stride / 8 * (5 + 8));

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo);
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo);

   /* Clear the fence word; radv_pc_end_query waits on it via an EOP write. */
   uint64_t perf_ctr_va =
      radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET;
   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
   radeon_emit(cs, perf_ctr_va);
   radeon_emit(cs, perf_ctr_va >> 32);
   radeon_emit(cs, 0); /* value */

   radv_pc_wait_idle(cmd_buffer);

   /* Fully reset the perfmon block before reprogramming it. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));

   /* Stable clocks and counting enabled for all shader stages (0x7f). */
   radv_emit_inhibit_clockgating(cmd_buffer->device, cs, true);
   radv_emit_spi_config_cntl(cmd_buffer->device, cs, true);
   radv_perfcounter_emit_shaders(cs, 0x7f);

   /* Program the counter selectors for each pass; only the pass actually
    * being replayed executes, thanks to COND_EXEC on its predicate word. */
   for (unsigned pass = 0; pass < pool->num_passes; ++pass) {
      uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) +
                         PERF_CTR_BO_PASS_OFFSET + 8 * pass;

      radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0));
      radeon_emit(cs, pred_va);
      radeon_emit(cs, pred_va >> 32);
      radeon_emit(cs, 0); /* Cache policy */

      /* Placeholder for the skip-dword count, patched after emission. */
      uint32_t *skip_dwords = cs->buf + cs->cdw;
      radeon_emit(cs, 0);

      /* Walk pc_regs in runs sharing the same hardware block, mirroring
       * the traversal in radv_pc_stop_and_sample. */
      for (unsigned i = 0; i < pool->num_pc_regs;) {
         enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]);
         struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block);
         /* First register of this block handled in this pass — NOTE(review):
          * same pass/offset convention as radv_pc_stop_and_sample. */
         unsigned offset = ac_block->num_instances * pass;

         unsigned cnt = 1;
         while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt]))
            ++cnt;

         if (offset < cnt) {
            unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters);
            radv_emit_select(cmd_buffer, ac_block, pass_reg_cnt, pool->pc_regs + i + offset);
         }

         i += cnt;
      }

      /* Patch the COND_EXEC skip count. */
      *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1;
   }

   /* Back to broadcast selection. */
   radv_emit_instance(cmd_buffer, -1, -1);

   /* The following sequence actually starts the perfcounters. */

   /* Takes the "begin" sample (end = false). */
   radv_pc_stop_and_sample(cmd_buffer, pool, va, false);

   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));

   radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, true);

   assert(cmd_buffer->cs->cdw <= cdw_max);
}
|
||||
|
||||
/* Emit the command-stream epilogue of a performance query: wait for all
 * prior work via a bottom-of-pipe fence, take the "end" sample, then
 * disable the counters and restore clock gating / SPI config.
 */
void
radv_pc_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   ASSERTED unsigned cdw_max;

   /* Worst-case space estimate for everything emitted below. */
   cdw_max =
      radeon_check_space(cmd_buffer->device->ws, cs,
                         256 + /* Reserved for things that don't scale with passes/counters */
                            5 * pool->num_passes + /* COND_EXECs */
                            pool->b.stride / 8 * 8);

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo);
   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo);

   /* Write 1 to the fence word at bottom-of-pipe, then make the CP wait for
    * it — ensures all work in the query window has finished before the
    * counters are sampled.  The fence was zeroed in radv_pc_begin_query. */
   uint64_t perf_ctr_va =
      radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET;
   si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
                              radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 0,
                              EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, perf_ctr_va, 1,
                              cmd_buffer->gfx9_fence_va);
   radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, perf_ctr_va, 1, 0xffffffff);

   radv_pc_wait_idle(cmd_buffer);
   /* Takes the "end" sample and writes per-pass availability markers. */
   radv_pc_stop_and_sample(cmd_buffer, pool, va, true);

   /* Shut the perfmon block down and undo the begin-query GPU state. */
   radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL,
                          S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
   radv_emit_spi_config_cntl(cmd_buffer->device, cs, false);
   radv_emit_inhibit_clockgating(cmd_buffer->device, cs, false);

   assert(cmd_buffer->cs->cdw <= cdw_max);
}
|
||||
|
||||
/* Sum the (end - begin) deltas of a counter register across all of its
 * instances.  Constant pseudo-registers return their embedded value.
 */
static uint64_t
radv_pc_sum_reg(uint32_t reg, const uint64_t *data)
{
   if (G_REG_CONSTANT(reg))
      return reg & 0x7fffffffu;

   const unsigned instances = G_REG_INSTANCES(reg);
   const unsigned base = G_REG_OFFSET(reg) / 8;
   uint64_t sum = 0;

   /* Each instance stores a begin sample at [2i] and an end sample at [2i+1]. */
   for (unsigned i = 0; i < instances; ++i)
      sum += data[base + 2 * i + 1] - data[base + 2 * i];

   return sum;
}
|
||||
|
||||
/* Take the maximum of a counter register's end samples across all of its
 * instances.  Constant pseudo-registers return their embedded value.
 */
static uint64_t
radv_pc_max_reg(uint32_t reg, const uint64_t *data)
{
   if (G_REG_CONSTANT(reg))
      return reg & 0x7fffffffu;

   const unsigned instances = G_REG_INSTANCES(reg);
   const unsigned base = G_REG_OFFSET(reg) / 8;
   uint64_t max_val = 0;

   /* Only the end sample at [2i+1] participates; begin samples are ignored. */
   for (unsigned i = 0; i < instances; ++i)
      max_val = MAX2(max_val, data[base + 2 * i + 1]);

   return max_val;
}
|
||||
|
||||
/* Evaluate one counter's final value from the raw begin/end samples in
 * `data`, according to the counter's operation.
 *
 * All results are reported through the float64 member.  Note the mixed
 * integer/double arithmetic: operand conversion points are significant
 * (e.g. SUM_WEIGHTED_4 multiplies in uint64 before converting), so the
 * expressions must not be reassociated.
 */
static union VkPerformanceCounterResultKHR
radv_pc_get_result(const struct radv_perfcounter_impl *impl, const uint64_t *data)
{
   union VkPerformanceCounterResultKHR result;

   switch (impl->op) {
   case RADV_PC_OP_MAX:
      result.float64 = radv_pc_max_reg(impl->regs[0], data);
      break;
   case RADV_PC_OP_SUM:
      result.float64 = radv_pc_sum_reg(impl->regs[0], data);
      break;
   case RADV_PC_OP_RATIO_DIVSCALE:
      /* regs[0] / regs[1] / regs[2], as a percentage.  The (double) cast on
       * the first divisor promotes the whole chain to floating point. */
      result.float64 = radv_pc_sum_reg(impl->regs[0], data) /
                          (double)radv_pc_sum_reg(impl->regs[1], data) /
                          radv_pc_sum_reg(impl->regs[2], data) * 100.0;
      break;
   case RADV_PC_OP_REVERSE_RATIO: {
      /* (regs[1] - regs[0]) / regs[1], as a percentage. */
      double tmp = radv_pc_sum_reg(impl->regs[1], data);
      result.float64 = (tmp - radv_pc_sum_reg(impl->regs[0], data)) / tmp * 100.0;
      break;
   }
   case RADV_PC_OP_SUM_WEIGHTED_4:
      /* Sum of four value*weight pairs taken from regs[0..7]. */
      result.float64 = 0.0;
      for (unsigned i = 0; i < 4; ++i)
         result.float64 +=
            radv_pc_sum_reg(impl->regs[2 * i], data) * radv_pc_sum_reg(impl->regs[2 * i + 1], data);
      break;
   default:
      unreachable("unhandled performance counter operation");
   }
   return result;
}
|
||||
|
||||
void
|
||||
radv_pc_get_results(const struct radv_pc_query_pool *pc_pool, const uint64_t *data, void *out)
|
||||
{
|
||||
union VkPerformanceCounterResultKHR *pc_result = out;
|
||||
|
||||
for (unsigned i = 0; i < pc_pool->num_counters; ++i) {
|
||||
pc_result[i] = radv_pc_get_result(pc_pool->counters + i, data);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2811,6 +2811,30 @@ struct radv_query_pool {
|
|||
bool uses_gds; /* For NGG GS on GFX10+ */
|
||||
};
|
||||
|
||||
struct radv_perfcounter_impl;

/* Query pool for VK_KHR_performance_query queries. */
struct radv_pc_query_pool {
   struct radv_query_pool b; /* Must come first; callers cast from radv_query_pool. */

   /* Hardware counter registers selected for this pool, and their count.
    * Owned by the pool; freed in radv_pc_deinit_query_pool(). */
   uint32_t *pc_regs;
   unsigned num_pc_regs;

   /* Number of submission passes needed to sample all selected counters. */
   unsigned num_passes;

   /* Per-counter evaluation descriptors (operation + register references),
    * one per counter requested at creation.  Owned by the pool. */
   unsigned num_counters;
   struct radv_perfcounter_impl *counters;
};

void radv_pc_deinit_query_pool(struct radv_pc_query_pool *pool);
VkResult radv_pc_init_query_pool(struct radv_physical_device *pdevice,
                                 const VkQueryPoolCreateInfo *pCreateInfo,
                                 struct radv_pc_query_pool *pool);
void radv_pc_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool,
                         uint64_t va);
void radv_pc_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool,
                       uint64_t va);
void radv_pc_get_results(const struct radv_pc_query_pool *pc_pool, const uint64_t *data, void *out);
|
||||
|
||||
bool radv_queue_internal_submit(struct radv_queue *queue, struct radeon_cmdbuf *cs);
|
||||
|
||||
int radv_queue_init(struct radv_device *device, struct radv_queue *queue, int idx,
|
||||
|
@ -2874,6 +2898,7 @@ bool radv_is_instruction_timing_enabled(void);
|
|||
|
||||
void radv_emit_inhibit_clockgating(struct radv_device *device, struct radeon_cmdbuf *cs,
|
||||
bool inhibit);
|
||||
void radv_emit_spi_config_cntl(struct radv_device *device, struct radeon_cmdbuf *cs, bool enable);
|
||||
|
||||
bool radv_sdma_copy_image(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
|
||||
struct radv_buffer *buffer, const VkBufferImageCopy2 *region);
|
||||
|
|
|
@ -30,6 +30,7 @@
|
|||
|
||||
#include "nir/nir_builder.h"
|
||||
#include "util/u_atomic.h"
|
||||
#include "vulkan/vulkan_core.h"
|
||||
#include "radv_acceleration_structure.h"
|
||||
#include "radv_cs.h"
|
||||
#include "radv_meta.h"
|
||||
|
@ -1066,6 +1067,9 @@ static void
|
|||
radv_destroy_query_pool(struct radv_device *device, const VkAllocationCallbacks *pAllocator,
|
||||
struct radv_query_pool *pool)
|
||||
{
|
||||
if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
|
||||
radv_pc_deinit_query_pool((struct radv_pc_query_pool *)pool);
|
||||
|
||||
if (pool->bo)
|
||||
device->ws->buffer_destroy(device->ws, pool->bo);
|
||||
vk_object_base_finish(&pool->base);
|
||||
|
@ -1077,8 +1081,13 @@ radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo,
|
|||
const VkAllocationCallbacks *pAllocator, VkQueryPool *pQueryPool)
|
||||
{
|
||||
RADV_FROM_HANDLE(radv_device, device, _device);
|
||||
struct radv_query_pool *pool =
|
||||
vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
|
||||
VkResult result;
|
||||
size_t pool_struct_size = pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR
|
||||
? sizeof(struct radv_pc_query_pool)
|
||||
: sizeof(struct radv_query_pool);
|
||||
|
||||
struct radv_query_pool *pool = vk_alloc2(&device->vk.alloc, pAllocator, pool_struct_size, 8,
|
||||
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
|
||||
|
||||
if (!pool)
|
||||
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
|
||||
|
@ -1126,6 +1135,16 @@ radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo,
|
|||
pool->stride += 8 * 4;
|
||||
}
|
||||
break;
|
||||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
|
||||
result = radv_pc_init_query_pool(device->physical_device, pCreateInfo,
|
||||
(struct radv_pc_query_pool *)pool);
|
||||
|
||||
if (result != VK_SUCCESS) {
|
||||
radv_destroy_query_pool(device, pAllocator, pool);
|
||||
return vk_error(device, result);
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
unreachable("creating unhandled query type");
|
||||
}
|
||||
|
@ -1135,9 +1154,9 @@ radv_CreateQueryPool(VkDevice _device, const VkQueryPoolCreateInfo *pCreateInfo,
|
|||
if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS)
|
||||
pool->size += 4 * pCreateInfo->queryCount;
|
||||
|
||||
VkResult result = device->ws->buffer_create(device->ws, pool->size, 64, RADEON_DOMAIN_GTT,
|
||||
RADEON_FLAG_NO_INTERPROCESS_SHARING,
|
||||
RADV_BO_PRIORITY_QUERY_POOL, 0, &pool->bo);
|
||||
result = device->ws->buffer_create(device->ws, pool->size, 64, RADEON_DOMAIN_GTT,
|
||||
RADEON_FLAG_NO_INTERPROCESS_SHARING,
|
||||
RADV_BO_PRIORITY_QUERY_POOL, 0, &pool->bo);
|
||||
if (result != VK_SUCCESS) {
|
||||
radv_destroy_query_pool(device, pAllocator, pool);
|
||||
return vk_error(device, result);
|
||||
|
@ -1393,6 +1412,23 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first
|
|||
}
|
||||
break;
|
||||
}
|
||||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
|
||||
struct radv_pc_query_pool *pc_pool = (struct radv_pc_query_pool *)pool;
|
||||
const uint64_t *src64 = (const uint64_t *)src;
|
||||
bool avail;
|
||||
do {
|
||||
avail = true;
|
||||
for (unsigned i = 0; i < pc_pool->num_passes; ++i)
|
||||
if (!p_atomic_read(src64 + pool->stride / 8 - i - 1))
|
||||
avail = false;
|
||||
} while (!avail && (flags & VK_QUERY_RESULT_WAIT_BIT));
|
||||
|
||||
available = avail;
|
||||
|
||||
radv_pc_get_results(pc_pool, src64, dest);
|
||||
dest += pc_pool->num_counters * sizeof(union VkPerformanceCounterResultKHR);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
unreachable("trying to get results of unhandled query type");
|
||||
}
|
||||
|
@ -1813,6 +1849,10 @@ emit_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *poo
|
|||
}
|
||||
break;
|
||||
}
|
||||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
|
||||
radv_pc_begin_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
unreachable("beginning unhandled query type");
|
||||
}
|
||||
|
@ -1899,6 +1939,10 @@ emit_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_query_pool *pool,
|
|||
}
|
||||
break;
|
||||
}
|
||||
case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
|
||||
radv_pc_end_query(cmd_buffer, (struct radv_pc_query_pool *)pool, va);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
unreachable("ending unhandled query type");
|
||||
}
|
||||
|
|
|
@ -364,7 +364,7 @@ radv_emit_thread_trace_userdata(struct radv_cmd_buffer *cmd_buffer, const void *
|
|||
}
|
||||
}
|
||||
|
||||
static void
|
||||
void
|
||||
radv_emit_spi_config_cntl(struct radv_device *device, struct radeon_cmdbuf *cs, bool enable)
|
||||
{
|
||||
if (device->physical_device->rad_info.gfx_level >= GFX9) {
|
||||
|
|
Loading…
Reference in New Issue