/* * Copyright © 2021 Valve Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include #include "ac_perfcounter.h" #include "amdgfxregs.h" #include "radv_cs.h" #include "radv_private.h" #include "sid.h" void radv_perfcounter_emit_shaders(struct radeon_cmdbuf *cs, unsigned shaders) { radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2); radeon_emit(cs, shaders & 0x7f); radeon_emit(cs, 0xffffffff); } static void radv_emit_windowed_counters(struct radv_device *device, struct radeon_cmdbuf *cs, int family, bool enable) { if (family == RADV_QUEUE_GENERAL) { radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(enable ? V_028A90_PERFCOUNTER_START : V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0)); } radeon_set_sh_reg(cs, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, S_00B82C_PERFCOUNT_ENABLE(enable)); } void radv_perfcounter_emit_spm_reset(struct radeon_cmdbuf *cs) { radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) | S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_DISABLE_AND_RESET)); } void radv_perfcounter_emit_spm_start(struct radv_device *device, struct radeon_cmdbuf *cs, int family) { /* Start SPM counters. */ radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) | S_036020_SPM_PERFMON_STATE(V_036020_STRM_PERFMON_STATE_START_COUNTING)); radv_emit_windowed_counters(device, cs, family, true); } void radv_perfcounter_emit_spm_stop(struct radv_device *device, struct radeon_cmdbuf *cs, int family) { radv_emit_windowed_counters(device, cs, family, false); /* Stop SPM counters. */ radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET) | S_036020_SPM_PERFMON_STATE(device->physical_device->rad_info.never_stop_sq_perf_counters ? V_036020_STRM_PERFMON_STATE_START_COUNTING : V_036020_STRM_PERFMON_STATE_STOP_COUNTING)); } enum radv_perfcounter_op { RADV_PC_OP_SUM, RADV_PC_OP_MAX, RADV_PC_OP_RATIO_DIVSCALE, RADV_PC_OP_REVERSE_RATIO, /* (reg1 - reg0) / reg1 */ RADV_PC_OP_SUM_WEIGHTED_4, }; #define S_REG_SEL(x) ((x)&0xFFFF) #define G_REG_SEL(x) ((x)&0xFFFF) #define S_REG_BLOCK(x) ((x) << 16) #define G_REG_BLOCK(x) (((x) >> 16) & 0x7FFF) #define S_REG_OFFSET(x) ((x)&0xFFFF) #define G_REG_OFFSET(x) ((x)&0xFFFF) #define S_REG_INSTANCES(x) ((x) << 16) #define G_REG_INSTANCES(x) (((x) >> 16) & 0x7FFF) #define S_REG_CONSTANT(x) ((x) << 31) #define G_REG_CONSTANT(x) ((x) >> 31) struct radv_perfcounter_impl { enum radv_perfcounter_op op; uint32_t regs[8]; }; /* Only append to this list, never insert into the middle or remove (but can rename). * * The invariant we're trying to get here is counters that have the same meaning, so * these can be shared between counters that have different implementations on different * GPUs, but should be unique within a GPU. */ enum radv_perfcounter_uuid { RADV_PC_UUID_GPU_CYCLES, RADV_PC_UUID_SHADER_WAVES, RADV_PC_UUID_SHADER_INSTRUCTIONS, RADV_PC_UUID_SHADER_INSTRUCTIONS_VALU, RADV_PC_UUID_SHADER_INSTRUCTIONS_SALU, RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_LOAD, RADV_PC_UUID_SHADER_INSTRUCTIONS_SMEM_LOAD, RADV_PC_UUID_SHADER_INSTRUCTIONS_VMEM_STORE, RADV_PC_UUID_SHADER_INSTRUCTIONS_LDS, RADV_PC_UUID_SHADER_INSTRUCTIONS_GDS, RADV_PC_UUID_SHADER_VALU_BUSY, RADV_PC_UUID_SHADER_SALU_BUSY, RADV_PC_UUID_VRAM_READ_SIZE, RADV_PC_UUID_VRAM_WRITE_SIZE, RADV_PC_UUID_L0_CACHE_HIT_RATIO, RADV_PC_UUID_L1_CACHE_HIT_RATIO, RADV_PC_UUID_L2_CACHE_HIT_RATIO, }; struct radv_perfcounter_desc { struct radv_perfcounter_impl impl; VkPerformanceCounterUnitKHR unit; char name[VK_MAX_DESCRIPTION_SIZE]; char category[VK_MAX_DESCRIPTION_SIZE]; char description[VK_MAX_DESCRIPTION_SIZE]; enum radv_perfcounter_uuid uuid; }; #define PC_DESC(arg_op, arg_unit, arg_name, arg_category, arg_description, arg_uuid, ...) \ (struct radv_perfcounter_desc) \ { \ .impl = {.op = arg_op, .regs = {__VA_ARGS__}}, \ .unit = VK_PERFORMANCE_COUNTER_UNIT_##arg_unit##_KHR, .name = arg_name, \ .category = arg_category, .description = arg_description, .uuid = RADV_PC_UUID_##arg_uuid \ } #define ADD_PC(op, unit, name, category, description, uuid, ...) \ do { \ if (descs) { \ descs[*count] = PC_DESC((op), unit, name, category, description, uuid, __VA_ARGS__); \ } \ ++*count; \ } while (0) #define CTR(block, ctr) (S_REG_BLOCK(block) | S_REG_SEL(ctr)) #define CONSTANT(v) (S_REG_CONSTANT(1) | (uint32_t)(v)) enum { GRBM_PERF_SEL_GUI_ACTIVE = CTR(GRBM, 2) }; enum { CPF_PERF_SEL_CPF_STAT_BUSY_GFX10 = CTR(CPF, 0x18) }; enum { GL1C_PERF_SEL_REQ = CTR(GL1C, 0xe), GL1C_PERF_SEL_REQ_MISS = CTR(GL1C, 0x12), }; enum { GL2C_PERF_SEL_REQ = CTR(GL2C, 0x3), GL2C_PERF_SEL_MISS_GFX101 = CTR(GL2C, 0x23), GL2C_PERF_SEL_MC_WRREQ_GFX101 = CTR(GL2C, 0x4b), GL2C_PERF_SEL_EA_WRREQ_64B_GFX101 = CTR(GL2C, 0x4c), GL2C_PERF_SEL_EA_RDREQ_32B_GFX101 = CTR(GL2C, 0x59), GL2C_PERF_SEL_EA_RDREQ_64B_GFX101 = CTR(GL2C, 0x5a), GL2C_PERF_SEL_EA_RDREQ_96B_GFX101 = CTR(GL2C, 0x5b), GL2C_PERF_SEL_EA_RDREQ_128B_GFX101 = CTR(GL2C, 0x5c), GL2C_PERF_SEL_MISS_GFX103 = CTR(GL2C, 0x2b), GL2C_PERF_SEL_MC_WRREQ_GFX103 = CTR(GL2C, 0x53), GL2C_PERF_SEL_EA_WRREQ_64B_GFX103 = CTR(GL2C, 0x55), GL2C_PERF_SEL_EA_RDREQ_32B_GFX103 = CTR(GL2C, 0x63), GL2C_PERF_SEL_EA_RDREQ_64B_GFX103 = CTR(GL2C, 0x64), GL2C_PERF_SEL_EA_RDREQ_96B_GFX103 = CTR(GL2C, 0x65), GL2C_PERF_SEL_EA_RDREQ_128B_GFX103 = CTR(GL2C, 0x66), }; enum { SQ_PERF_SEL_WAVES = CTR(SQ, 0x4), SQ_PERF_SEL_INSTS_ALL_GFX10 = CTR(SQ, 0x31), SQ_PERF_SEL_INSTS_GDS_GFX10 = CTR(SQ, 0x37), SQ_PERF_SEL_INSTS_LDS_GFX10 = CTR(SQ, 0x3b), SQ_PERF_SEL_INSTS_SALU_GFX10 = CTR(SQ, 0x3c), SQ_PERF_SEL_INSTS_SMEM_GFX10 = CTR(SQ, 0x3d), SQ_PERF_SEL_INSTS_VALU_GFX10 = CTR(SQ, 0x40), SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10 = CTR(SQ, 0x45), SQ_PERF_SEL_INSTS_TEX_STORE_GFX10 = CTR(SQ, 0x46), SQ_PERF_SEL_INST_CYCLES_VALU_GFX10 = CTR(SQ, 0x75), }; enum { TCP_PERF_SEL_REQ_GFX10 = CTR(TCP, 0x9), TCP_PERF_SEL_REQ_MISS_GFX10 = CTR(TCP, 0x12), }; #define CTR_NUM_SIMD \ CONSTANT(pdev->rad_info.num_simd_per_compute_unit * pdev->rad_info.num_cu) #define CTR_NUM_CUS CONSTANT(pdev->rad_info.num_cu) static void radv_query_perfcounter_descs(struct radv_physical_device *pdev, uint32_t *count, struct radv_perfcounter_desc *descs) { *count = 0; ADD_PC(RADV_PC_OP_MAX, CYCLES, "GPU active cycles", "GRBM", "cycles the GPU is active processing a command buffer.", GPU_CYCLES, GRBM_PERF_SEL_GUI_ACTIVE); ADD_PC(RADV_PC_OP_SUM, GENERIC, "Waves", "Shaders", "Number of waves executed", SHADER_WAVES, SQ_PERF_SEL_WAVES); ADD_PC(RADV_PC_OP_SUM, GENERIC, "Instructions", "Shaders", "Number of Instructions executed", SHADER_INSTRUCTIONS, SQ_PERF_SEL_INSTS_ALL_GFX10); ADD_PC(RADV_PC_OP_SUM, GENERIC, "VALU Instructions", "Shaders", "Number of VALU Instructions executed", SHADER_INSTRUCTIONS_VALU, SQ_PERF_SEL_INSTS_VALU_GFX10); ADD_PC(RADV_PC_OP_SUM, GENERIC, "SALU Instructions", "Shaders", "Number of SALU Instructions executed", SHADER_INSTRUCTIONS_SALU, SQ_PERF_SEL_INSTS_SALU_GFX10); ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Load Instructions", "Shaders", "Number of VMEM load instructions executed", SHADER_INSTRUCTIONS_VMEM_LOAD, SQ_PERF_SEL_INSTS_TEX_LOAD_GFX10); ADD_PC(RADV_PC_OP_SUM, GENERIC, "SMEM Load Instructions", "Shaders", "Number of SMEM load instructions executed", SHADER_INSTRUCTIONS_SMEM_LOAD, SQ_PERF_SEL_INSTS_SMEM_GFX10); ADD_PC(RADV_PC_OP_SUM, GENERIC, "VMEM Store Instructions", "Shaders", "Number of VMEM store instructions executed", SHADER_INSTRUCTIONS_VMEM_STORE, SQ_PERF_SEL_INSTS_TEX_STORE_GFX10); ADD_PC(RADV_PC_OP_SUM, GENERIC, "LDS Instructions", "Shaders", "Number of LDS Instructions executed", SHADER_INSTRUCTIONS_LDS, SQ_PERF_SEL_INSTS_LDS_GFX10); ADD_PC(RADV_PC_OP_SUM, GENERIC, "GDS Instructions", "Shaders", "Number of GDS Instructions executed", SHADER_INSTRUCTIONS_GDS, SQ_PERF_SEL_INSTS_GDS_GFX10); ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "VALU Busy", "Shader Utilization", "Percentage of time the VALU units are busy", SHADER_VALU_BUSY, SQ_PERF_SEL_INST_CYCLES_VALU_GFX10, CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_SIMD); ADD_PC(RADV_PC_OP_RATIO_DIVSCALE, PERCENTAGE, "SALU Busy", "Shader Utilization", "Percentage of time the SALU units are busy", SHADER_SALU_BUSY, SQ_PERF_SEL_INSTS_SALU_GFX10, CPF_PERF_SEL_CPF_STAT_BUSY_GFX10, CTR_NUM_CUS); if (pdev->rad_info.gfx_level >= GFX10_3) { ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory", "Number of bytes read from VRAM", VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX103, CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX103, CONSTANT(64), GL2C_PERF_SEL_EA_RDREQ_96B_GFX103, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX103, CONSTANT(128)); ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory", "Number of bytes written to VRAM", VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX103, CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX103, CONSTANT(64), CONSTANT(0), CONSTANT(0), CONSTANT(0), CONSTANT(0)); } else { ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM read size", "Memory", "Number of bytes read from VRAM", VRAM_READ_SIZE, GL2C_PERF_SEL_EA_RDREQ_32B_GFX101, CONSTANT(32), GL2C_PERF_SEL_EA_RDREQ_64B_GFX101, CONSTANT(64), GL2C_PERF_SEL_EA_RDREQ_96B_GFX101, CONSTANT(96), GL2C_PERF_SEL_EA_RDREQ_128B_GFX101, CONSTANT(128)); ADD_PC(RADV_PC_OP_SUM_WEIGHTED_4, BYTES, "VRAM write size", "Memory", "Number of bytes written to VRAM", VRAM_WRITE_SIZE, GL2C_PERF_SEL_MC_WRREQ_GFX101, CONSTANT(32), GL2C_PERF_SEL_EA_WRREQ_64B_GFX101, CONSTANT(32), CONSTANT(0), CONSTANT(0), CONSTANT(0), CONSTANT(0)); } ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L0 cache hit ratio", "Memory", "Hit ratio of L0 cache", L0_CACHE_HIT_RATIO, TCP_PERF_SEL_REQ_MISS_GFX10, TCP_PERF_SEL_REQ_GFX10); ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L1 cache hit ratio", "Memory", "Hit ratio of L1 cache", L1_CACHE_HIT_RATIO, GL1C_PERF_SEL_REQ_MISS, GL1C_PERF_SEL_REQ); if (pdev->rad_info.gfx_level >= GFX10_3) { ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory", "Hit ratio of L2 cache", L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX103, GL2C_PERF_SEL_REQ); } else { ADD_PC(RADV_PC_OP_REVERSE_RATIO, BYTES, "L2 cache hit ratio", "Memory", "Hit ratio of L2 cache", L2_CACHE_HIT_RATIO, GL2C_PERF_SEL_MISS_GFX101, GL2C_PERF_SEL_REQ); } } static bool radv_init_perfcounter_descs(struct radv_physical_device *pdev) { if (pdev->perfcounters) return true; uint32_t count; radv_query_perfcounter_descs(pdev, &count, NULL); struct radv_perfcounter_desc *descs = malloc(sizeof(*descs) * count); if (!descs) return false; radv_query_perfcounter_descs(pdev, &count, descs); pdev->num_perfcounters = count; pdev->perfcounters = descs; return true; } static int cmp_uint32_t(const void *a, const void *b) { uint32_t l = *(const uint32_t *)a; uint32_t r = *(const uint32_t *)b; return (l < r) ? -1 : (l > r) ? 1 : 0; } static VkResult radv_get_counter_registers(const struct radv_physical_device *pdevice, uint32_t num_indices, const uint32_t *indices, unsigned *out_num_regs, uint32_t **out_regs) { ASSERTED uint32_t num_counters = pdevice->num_perfcounters; const struct radv_perfcounter_desc *descs = pdevice->perfcounters; unsigned full_reg_cnt = num_indices * ARRAY_SIZE(descs->impl.regs); uint32_t *regs = malloc(full_reg_cnt * sizeof(uint32_t)); if (!regs) return VK_ERROR_OUT_OF_HOST_MEMORY; unsigned reg_cnt = 0; for (unsigned i = 0; i < num_indices; ++i) { uint32_t index = indices[i]; assert(index < num_counters); for (unsigned j = 0; j < ARRAY_SIZE(descs[index].impl.regs) && descs[index].impl.regs[j]; ++j) { if (!G_REG_CONSTANT(descs[index].impl.regs[j])) regs[reg_cnt++] = descs[index].impl.regs[j]; } } qsort(regs, reg_cnt, sizeof(uint32_t), cmp_uint32_t); unsigned deduped_reg_cnt = 0; for (unsigned i = 1; i < reg_cnt; ++i) { if (regs[i] != regs[deduped_reg_cnt]) regs[++deduped_reg_cnt] = regs[i]; } ++deduped_reg_cnt; *out_num_regs = deduped_reg_cnt; *out_regs = regs; return VK_SUCCESS; } static unsigned radv_pc_get_num_instances(const struct radv_physical_device *pdevice, struct ac_pc_block *ac_block) { return ac_block->num_instances * ((ac_block->b->b->flags & AC_PC_BLOCK_SE) ? pdevice->rad_info.max_se : 1); } static unsigned radv_get_num_counter_passes(const struct radv_physical_device *pdevice, unsigned num_regs, const uint32_t *regs) { enum ac_pc_gpu_block prev_block = NUM_GPU_BLOCK; unsigned block_reg_count = 0; struct ac_pc_block *ac_block = NULL; unsigned passes_needed = 1; for (unsigned i = 0; i < num_regs; ++i) { enum ac_pc_gpu_block block = G_REG_BLOCK(regs[i]); if (block != prev_block) { block_reg_count = 0; prev_block = block; ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block); } ++block_reg_count; passes_needed = MAX2(passes_needed, DIV_ROUND_UP(block_reg_count, ac_block->b->b->num_counters)); } return passes_needed; } void radv_pc_deinit_query_pool(struct radv_pc_query_pool *pool) { free(pool->counters); free(pool->pc_regs); } VkResult radv_pc_init_query_pool(struct radv_physical_device *pdevice, const VkQueryPoolCreateInfo *pCreateInfo, struct radv_pc_query_pool *pool) { const VkQueryPoolPerformanceCreateInfoKHR *perf_info = vk_find_struct_const(pCreateInfo->pNext, QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); VkResult result; if (!radv_init_perfcounter_descs(pdevice)) return VK_ERROR_OUT_OF_HOST_MEMORY; result = radv_get_counter_registers(pdevice, perf_info->counterIndexCount, perf_info->pCounterIndices, &pool->num_pc_regs, &pool->pc_regs); if (result != VK_SUCCESS) return result; pool->num_passes = radv_get_num_counter_passes(pdevice, pool->num_pc_regs, pool->pc_regs); uint32_t *pc_reg_offsets = malloc(pool->num_pc_regs * sizeof(uint32_t)); if (!pc_reg_offsets) return VK_ERROR_OUT_OF_HOST_MEMORY; unsigned offset = 0; for (unsigned i = 0; i < pool->num_pc_regs; ++i) { enum ac_pc_gpu_block block = pool->pc_regs[i] >> 16; struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block); unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block); pc_reg_offsets[i] = S_REG_OFFSET(offset) | S_REG_INSTANCES(num_instances); offset += sizeof(uint64_t) * 2 * num_instances; } /* allow an uint32_t per pass to signal completion. */ pool->b.stride = offset + 8 * pool->num_passes; pool->num_counters = perf_info->counterIndexCount; pool->counters = malloc(pool->num_counters * sizeof(struct radv_perfcounter_impl)); if (!pool->counters) { free(pc_reg_offsets); return VK_ERROR_OUT_OF_HOST_MEMORY; } for (unsigned i = 0; i < pool->num_counters; ++i) { pool->counters[i] = pdevice->perfcounters[perf_info->pCounterIndices[i]].impl; for (unsigned j = 0; j < ARRAY_SIZE(pool->counters[i].regs); ++j) { uint32_t reg = pool->counters[i].regs[j]; if (!reg || G_REG_CONSTANT(reg)) continue; unsigned k; for (k = 0; k < pool->num_pc_regs; ++k) if (pool->pc_regs[k] == reg) break; pool->counters[i].regs[j] = pc_reg_offsets[k]; } } free(pc_reg_offsets); return VK_SUCCESS; } static void radv_emit_instance(struct radv_cmd_buffer *cmd_buffer, int se, int instance) { struct radeon_cmdbuf *cs = cmd_buffer->cs; unsigned value = S_030800_SH_BROADCAST_WRITES(1); if (se >= 0) { value |= S_030800_SE_INDEX(se); } else { value |= S_030800_SE_BROADCAST_WRITES(1); } if (instance >= 0) { value |= S_030800_INSTANCE_INDEX(instance); } else { value |= S_030800_INSTANCE_BROADCAST_WRITES(1); } radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value); } static void radv_emit_select(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count, unsigned *selectors) { struct ac_pc_block_base *regs = block->b->b; struct radeon_cmdbuf *cs = cmd_buffer->cs; unsigned idx; assert(count <= regs->num_counters); /* Fake counters. */ if (!regs->select0) return; for (idx = 0; idx < count; ++idx) { radeon_set_perfctr_reg(cmd_buffer, regs->select0[idx], G_REG_SEL(selectors[idx]) | regs->select_or); } for (idx = 0; idx < regs->num_spm_counters; idx++) { radeon_set_uconfig_reg_seq(cs, regs->select1[idx], 1); radeon_emit(cs, 0); } } static void radv_pc_emit_block_instance_read(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count, uint64_t va) { struct ac_pc_block_base *regs = block->b->b; struct radeon_cmdbuf *cs = cmd_buffer->cs; unsigned reg = regs->counter0_lo; unsigned reg_delta = 8; assert(regs->select0); for (unsigned idx = 0; idx < count; ++idx) { if (regs->counters) reg = regs->counters[idx]; radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM | COPY_DATA_COUNT_SEL); /* 64 bits */ radeon_emit(cs, reg >> 2); radeon_emit(cs, 0); /* unused */ radeon_emit(cs, va); radeon_emit(cs, va >> 32); va += sizeof(uint64_t) * 2 * radv_pc_get_num_instances(cmd_buffer->device->physical_device, block); reg += reg_delta; } } static void radv_pc_sample_block(struct radv_cmd_buffer *cmd_buffer, struct ac_pc_block *block, unsigned count, uint64_t va) { unsigned se_end = 1; if (block->b->b->flags & AC_PC_BLOCK_SE) se_end = cmd_buffer->device->physical_device->rad_info.max_se; for (unsigned se = 0; se < se_end; ++se) { for (unsigned instance = 0; instance < block->num_instances; ++instance) { radv_emit_instance(cmd_buffer, se, instance); radv_pc_emit_block_instance_read(cmd_buffer, block, count, va); va += sizeof(uint64_t) * 2; } } } static void radv_pc_wait_idle(struct radv_cmd_buffer *cmd_buffer) { struct radeon_cmdbuf *cs = cmd_buffer->cs; radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4))); radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); radeon_emit(cs, 0); /* CP_COHER_CNTL */ radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ radeon_emit(cs, 0); /* CP_COHER_BASE */ radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ radeon_emit(cs, 0); /* GCR_CNTL */ radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0)); radeon_emit(cs, 0); } static void radv_pc_stop_and_sample(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va, bool end) { struct radeon_cmdbuf *cs = cmd_buffer->cs; struct radv_physical_device *pdevice = cmd_buffer->device->physical_device; radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0)); radv_pc_wait_idle(cmd_buffer); radv_emit_instance(cmd_buffer, -1, -1); radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, false); radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_STOP_COUNTING) | S_036020_PERFMON_SAMPLE_ENABLE(1)); for (unsigned pass = 0; pass < pool->num_passes; ++pass) { uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_PASS_OFFSET + 8 * pass; uint64_t reg_va = va + (end ? 8 : 0); radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0)); radeon_emit(cs, pred_va); radeon_emit(cs, pred_va >> 32); radeon_emit(cs, 0); /* Cache policy */ uint32_t *skip_dwords = cs->buf + cs->cdw; radeon_emit(cs, 0); for (unsigned i = 0; i < pool->num_pc_regs;) { enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]); struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block); unsigned offset = ac_block->num_instances * pass; unsigned num_instances = radv_pc_get_num_instances(pdevice, ac_block); unsigned cnt = 1; while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt])) ++cnt; if (offset < cnt) { unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters); radv_pc_sample_block(cmd_buffer, ac_block, pass_reg_cnt, reg_va + offset * num_instances * sizeof(uint64_t)); } i += cnt; reg_va += num_instances * sizeof(uint64_t) * 2 * cnt; } if (end) { uint64_t signal_va = va + pool->b.stride - 8 - 8 * pass; radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME)); radeon_emit(cs, signal_va); radeon_emit(cs, signal_va >> 32); radeon_emit(cs, 1); /* value */ } *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1; } radv_emit_instance(cmd_buffer, -1, -1); } void radv_pc_begin_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va) { struct radeon_cmdbuf *cs = cmd_buffer->cs; struct radv_physical_device *pdevice = cmd_buffer->device->physical_device; ASSERTED unsigned cdw_max; cmd_buffer->state.uses_perf_counters = true; cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 256 + /* Random one time stuff */ 10 * pool->num_passes + /* COND_EXECs */ pool->b.stride / 8 * (5 + 8)); radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo); radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo); uint64_t perf_ctr_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET; radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME)); radeon_emit(cs, perf_ctr_va); radeon_emit(cs, perf_ctr_va >> 32); radeon_emit(cs, 0); /* value */ radv_pc_wait_idle(cmd_buffer); radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET)); radv_emit_inhibit_clockgating(cmd_buffer->device, cs, true); radv_emit_spi_config_cntl(cmd_buffer->device, cs, true); radv_perfcounter_emit_shaders(cs, 0x7f); for (unsigned pass = 0; pass < pool->num_passes; ++pass) { uint64_t pred_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_PASS_OFFSET + 8 * pass; radeon_emit(cs, PKT3(PKT3_COND_EXEC, 3, 0)); radeon_emit(cs, pred_va); radeon_emit(cs, pred_va >> 32); radeon_emit(cs, 0); /* Cache policy */ uint32_t *skip_dwords = cs->buf + cs->cdw; radeon_emit(cs, 0); for (unsigned i = 0; i < pool->num_pc_regs;) { enum ac_pc_gpu_block block = G_REG_BLOCK(pool->pc_regs[i]); struct ac_pc_block *ac_block = ac_pc_get_block(&pdevice->ac_perfcounters, block); unsigned offset = ac_block->num_instances * pass; unsigned cnt = 1; while (cnt < pool->num_pc_regs - i && block == G_REG_BLOCK(pool->pc_regs[i + cnt])) ++cnt; if (offset < cnt) { unsigned pass_reg_cnt = MIN2(cnt - offset, ac_block->b->b->num_counters); radv_emit_select(cmd_buffer, ac_block, pass_reg_cnt, pool->pc_regs + i + offset); } i += cnt; } *skip_dwords = cs->buf + cs->cdw - skip_dwords - 1; } radv_emit_instance(cmd_buffer, -1, -1); /* The following sequence actually starts the perfcounters. */ radv_pc_stop_and_sample(cmd_buffer, pool, va, false); radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING)); radv_emit_windowed_counters(cmd_buffer->device, cs, cmd_buffer->qf, true); assert(cmd_buffer->cs->cdw <= cdw_max); } void radv_pc_end_query(struct radv_cmd_buffer *cmd_buffer, struct radv_pc_query_pool *pool, uint64_t va) { struct radeon_cmdbuf *cs = cmd_buffer->cs; ASSERTED unsigned cdw_max; cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 256 + /* Reserved for things that don't scale with passes/counters */ 5 * pool->num_passes + /* COND_EXECs */ pool->b.stride / 8 * 8); radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->b.bo); radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->device->perf_counter_bo); uint64_t perf_ctr_va = radv_buffer_get_va(cmd_buffer->device->perf_counter_bo) + PERF_CTR_BO_FENCE_OFFSET; si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.gfx_level, radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, perf_ctr_va, 1, cmd_buffer->gfx9_fence_va); radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, perf_ctr_va, 1, 0xffffffff); radv_pc_wait_idle(cmd_buffer); radv_pc_stop_and_sample(cmd_buffer, pool, va, true); radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET)); radv_emit_spi_config_cntl(cmd_buffer->device, cs, false); radv_emit_inhibit_clockgating(cmd_buffer->device, cs, false); assert(cmd_buffer->cs->cdw <= cdw_max); } static uint64_t radv_pc_sum_reg(uint32_t reg, const uint64_t *data) { unsigned instances = G_REG_INSTANCES(reg); unsigned offset = G_REG_OFFSET(reg) / 8; uint64_t result = 0; if (G_REG_CONSTANT(reg)) return reg & 0x7fffffffu; for (unsigned i = 0; i < instances; ++i) { result += data[offset + 2 * i + 1] - data[offset + 2 * i]; } return result; } static uint64_t radv_pc_max_reg(uint32_t reg, const uint64_t *data) { unsigned instances = G_REG_INSTANCES(reg); unsigned offset = G_REG_OFFSET(reg) / 8; uint64_t result = 0; if (G_REG_CONSTANT(reg)) return reg & 0x7fffffffu; for (unsigned i = 0; i < instances; ++i) { result = MAX2(result, data[offset + 2 * i + 1]); } return result; } static union VkPerformanceCounterResultKHR radv_pc_get_result(const struct radv_perfcounter_impl *impl, const uint64_t *data) { union VkPerformanceCounterResultKHR result; switch (impl->op) { case RADV_PC_OP_MAX: result.float64 = radv_pc_max_reg(impl->regs[0], data); break; case RADV_PC_OP_SUM: result.float64 = radv_pc_sum_reg(impl->regs[0], data); break; case RADV_PC_OP_RATIO_DIVSCALE: result.float64 = radv_pc_sum_reg(impl->regs[0], data) / (double)radv_pc_sum_reg(impl->regs[1], data) / radv_pc_sum_reg(impl->regs[2], data) * 100.0; break; case RADV_PC_OP_REVERSE_RATIO: { double tmp = radv_pc_sum_reg(impl->regs[1], data); result.float64 = (tmp - radv_pc_sum_reg(impl->regs[0], data)) / tmp * 100.0; break; } case RADV_PC_OP_SUM_WEIGHTED_4: result.float64 = 0.0; for (unsigned i = 0; i < 4; ++i) result.float64 += radv_pc_sum_reg(impl->regs[2 * i], data) * radv_pc_sum_reg(impl->regs[2 * i + 1], data); break; default: unreachable("unhandled performance counter operation"); } return result; } void radv_pc_get_results(const struct radv_pc_query_pool *pc_pool, const uint64_t *data, void *out) { union VkPerformanceCounterResultKHR *pc_result = out; for (unsigned i = 0; i < pc_pool->num_counters; ++i) { pc_result[i] = radv_pc_get_result(pc_pool->counters + i, data); } } VkResult radv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( VkPhysicalDevice physicalDevice, uint32_t queueFamilyIndex, uint32_t *pCounterCount, VkPerformanceCounterKHR *pCounters, VkPerformanceCounterDescriptionKHR *pCounterDescriptions) { RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); if (vk_queue_to_radv(pdevice, queueFamilyIndex) != RADV_QUEUE_GENERAL) { *pCounterCount = 0; return VK_SUCCESS; } if (!radv_init_perfcounter_descs(pdevice)) return VK_ERROR_OUT_OF_HOST_MEMORY; uint32_t counter_cnt = pdevice->num_perfcounters; const struct radv_perfcounter_desc *descs = pdevice->perfcounters; if (!pCounters && !pCounterDescriptions) { *pCounterCount = counter_cnt; return VK_SUCCESS; } VkResult result = counter_cnt > *pCounterCount ? VK_INCOMPLETE : VK_SUCCESS; counter_cnt = MIN2(counter_cnt, *pCounterCount); *pCounterCount = counter_cnt; for (uint32_t i = 0; i < counter_cnt; ++i) { if (pCounters) { pCounters[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR; pCounters[i].unit = descs[i].unit; pCounters[i].scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR; pCounters[i].storage = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR; memset(&pCounters[i].uuid, 0, sizeof(pCounters[i].uuid)); strcpy((char*)&pCounters[i].uuid, "RADV"); const uint32_t uuid = descs[i].uuid; memcpy(&pCounters[i].uuid[12], &uuid, sizeof(uuid)); } if (pCounterDescriptions) { pCounterDescriptions[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR; pCounterDescriptions[i].flags = VK_PERFORMANCE_COUNTER_DESCRIPTION_CONCURRENTLY_IMPACTED_BIT_KHR; strcpy(pCounterDescriptions[i].name, descs[i].name); strcpy(pCounterDescriptions[i].category, descs[i].category); strcpy(pCounterDescriptions[i].description, descs[i].description); } } return result; } void radv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( VkPhysicalDevice physicalDevice, const VkQueryPoolPerformanceCreateInfoKHR *pPerformanceQueryCreateInfo, uint32_t *pNumPasses) { RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice); if (pPerformanceQueryCreateInfo->counterIndexCount == 0) { *pNumPasses = 0; return; } if (!radv_init_perfcounter_descs(pdevice)) { /* Can't return an error, so log */ fprintf(stderr, "radv: Failed to init perf counters\n"); *pNumPasses = 1; return; } assert(vk_queue_to_radv(pdevice, pPerformanceQueryCreateInfo->queueFamilyIndex) == RADV_QUEUE_GENERAL); unsigned num_regs = 0; uint32_t *regs = NULL; VkResult result = radv_get_counter_registers(pdevice, pPerformanceQueryCreateInfo->counterIndexCount, pPerformanceQueryCreateInfo->pCounterIndices, &num_regs, ®s); if (result != VK_SUCCESS) { /* Can't return an error, so log */ fprintf(stderr, "radv: Failed to allocate memory for perf counters\n"); } *pNumPasses = radv_get_num_counter_passes(pdevice, num_regs, regs); free(regs); }