radv: Add perf counter lock/unlock commandbuffers.

These set the pass and make sure we don't have multiple submissions
touching the perf counters/pass at the same time.
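
Conceptually, a submission that uses the counters gets bracketed by a lock
commandbuffer (spin on a dword in the perf counter BO with an atomic
compare-and-swap loop, then select the pass) and an unlock commandbuffer
(deselect the pass, release the dword). A minimal host-side model of that
GPU mutex, using C11 atomics purely as an illustration (the helper names
below are made up, this is not radv code):

   #include <stdatomic.h>
   #include <stdint.h>

   /* Stands in for the dword at PERF_CTR_BO_LOCK_OFFSET. */
   static _Atomic uint32_t perf_mutex;

   static void lock_pass(void)
   {
      uint32_t expected = 0;
      /* Like PKT3_ATOMIC_MEM with TC_OP_ATOMIC_CMPSWAP_32 and
       * ATOMIC_COMMAND_LOOP: retry until the value goes 0 -> 1. */
      while (!atomic_compare_exchange_weak(&perf_mutex, &expected, 1))
         expected = 0;
   }

   static void unlock_pass(void)
   {
      /* The unlock CS does the equivalent with a COPY_DATA write of 0. */
      atomic_store(&perf_mutex, 0);
   }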

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16879>
Bas Nieuwenhuizen 2022-06-04 19:54:21 +02:00 committed by Marge Bot
parent 5d52f12419
commit 10211913e1
3 changed files with 139 additions and 8 deletions

@@ -115,6 +115,11 @@
#define PKT3_DISPATCH_DIRECT 0x15
#define PKT3_DISPATCH_INDIRECT 0x16
#define PKT3_ATOMIC_MEM 0x1E
#define ATOMIC_OP(x) ((unsigned)((x)&0x7f) << 0)
#define TC_OP_ATOMIC_CMPSWAP_32 0x48
#define ATOMIC_COMMAND(x) ((unsigned)((x)&0x3) << 8)
#define ATOMIC_COMMAND_SINGLE_PASS 0x0
#define ATOMIC_COMMAND_LOOP 0x1
#define PKT3_OCCLUSION_QUERY 0x1F /* new for CIK */
#define PKT3_SET_PREDICATION 0x20
#define PKT3_COND_EXEC 0x22

@@ -3160,6 +3160,20 @@ radv_device_finish_notifier(struct radv_device *device)
#endif
}
static void
radv_device_finish_perf_counter_lock_cs(struct radv_device *device)
{
if (!device->perf_counter_lock_cs)
return;
for (unsigned i = 0; i < 2 * PERF_CTR_MAX_PASSES; ++i) {
if (device->perf_counter_lock_cs[i])
device->ws->cs_destroy(device->perf_counter_lock_cs[i]);
}
free(device->perf_counter_lock_cs);
}
VKAPI_ATTR VkResult VKAPI_CALL
radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCreateInfo,
const VkAllocationCallbacks *pAllocator, VkDevice *pDevice)
@@ -3548,6 +3562,13 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &device->perf_counter_bo);
if (result != VK_SUCCESS)
goto fail_cache;
device->perf_counter_lock_cs =
calloc(sizeof(struct radeon_cmdbuf *), 2 * PERF_CTR_MAX_PASSES);
if (!device->perf_counter_lock_cs) {
result = VK_ERROR_OUT_OF_HOST_MEMORY;
goto fail_cache;
}
}
*pDevice = radv_device_to_handle(device);
@@ -3565,6 +3586,7 @@ fail:
radv_trap_handler_finish(device);
radv_finish_trace(device);
radv_device_finish_perf_counter_lock_cs(device);
if (device->perf_counter_bo)
device->ws->buffer_destroy(device->ws, device->perf_counter_bo);
if (device->gfx_init)
@@ -3603,6 +3625,7 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
if (!device)
return;
radv_device_finish_perf_counter_lock_cs(device);
if (device->perf_counter_bo)
device->ws->buffer_destroy(device->ws, device->perf_counter_bo);
@@ -4510,6 +4533,84 @@ fail:
return vk_error(queue, result);
}
static struct radeon_cmdbuf *
radv_create_perf_counter_lock_cs(struct radv_device *device, unsigned pass, bool unlock)
{
struct radeon_cmdbuf **cs_ref = &device->perf_counter_lock_cs[pass * 2 + (unlock ? 1 : 0)];
struct radeon_cmdbuf *cs;
if (*cs_ref)
return *cs_ref;
cs = device->ws->cs_create(device->ws, AMD_IP_GFX);
if (!cs)
return NULL;
ASSERTED unsigned cdw = radeon_check_space(device->ws, cs, 21);
if (!unlock) {
uint64_t mutex_va = radv_buffer_get_va(device->perf_counter_bo) + PERF_CTR_BO_LOCK_OFFSET;
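/* GPU-side spinlock: keep retrying the 32-bit compare-and-swap (compare 0,
 * swap in 1) until it succeeds, so only one submission at a time owns the
 * perf counters. */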
radeon_emit(cs, PKT3(PKT3_ATOMIC_MEM, 7, 0));
radeon_emit(cs, ATOMIC_OP(TC_OP_ATOMIC_CMPSWAP_32) | ATOMIC_COMMAND(ATOMIC_COMMAND_LOOP));
radeon_emit(cs, mutex_va); /* addr lo */
radeon_emit(cs, mutex_va >> 32); /* addr hi */
radeon_emit(cs, 1); /* data lo */
radeon_emit(cs, 0); /* data hi */
radeon_emit(cs, 0); /* compare data lo */
radeon_emit(cs, 0); /* compare data hi */
radeon_emit(cs, 10); /* loop interval */
}
uint64_t va = radv_buffer_get_va(device->perf_counter_bo) + PERF_CTR_BO_PASS_OFFSET;
uint64_t unset_va = va + (unlock ? 8 * pass : 0);
uint64_t set_va = va + (unlock ? 0 : 8 * pass);
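/* Update the per-pass bool array in the BO with two 64-bit immediate writes:
 * locking clears slot 0 and sets slot `pass`, unlocking clears slot `pass`
 * and sets slot 0 again. */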
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM);
radeon_emit(cs, 0); /* immediate */
radeon_emit(cs, 0);
radeon_emit(cs, unset_va);
radeon_emit(cs, unset_va >> 32);
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM);
radeon_emit(cs, 1); /* immediate */
radeon_emit(cs, 0);
radeon_emit(cs, set_va);
radeon_emit(cs, set_va >> 32);
if (unlock) {
uint64_t mutex_va = radv_buffer_get_va(device->perf_counter_bo) + PERF_CTR_BO_LOCK_OFFSET;
radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM);
radeon_emit(cs, 0); /* immediate */
radeon_emit(cs, 0);
radeon_emit(cs, mutex_va);
radeon_emit(cs, mutex_va >> 32);
}
assert(cs->cdw <= cdw);
VkResult result = device->ws->cs_finalize(cs);
if (result != VK_SUCCESS) {
device->ws->cs_destroy(cs);
return NULL;
}
/* All the casts are to avoid MSVC errors around pointer truncation in a non-taken
* alternative.
*/
if (p_atomic_cmpxchg((uintptr_t*)cs_ref, 0, (uintptr_t)cs) != 0) {
device->ws->cs_destroy(cs);
}
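/* Whether we won the race or another thread did, return whatever ended up
 * in the cache. */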
return *cs_ref;
}
static VkResult
radv_sparse_buffer_bind_memory(struct radv_device *device, const VkSparseBufferMemoryBindInfo *bind)
{
@@ -4625,7 +4726,8 @@ radv_sparse_image_bind_memory(struct radv_device *device, const VkSparseImageMem
static VkResult
radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device,
-struct vk_command_buffer *const *cmd_buffers, uint32_t cmd_buffer_count)
+struct vk_command_buffer *const *cmd_buffers, uint32_t cmd_buffer_count,
+bool *uses_perf_counters)
{
if (queue->qf == RADV_QUEUE_TRANSFER)
return VK_SUCCESS;
@@ -4637,6 +4739,7 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
* - Allocate the max size and reuse it, but don't free it until the queue is destroyed
*/
struct radv_queue_ring_info needs = queue->ring_info;
*uses_perf_counters = false;
for (uint32_t j = 0; j < cmd_buffer_count; j++) {
struct radv_cmd_buffer *cmd_buffer = container_of(cmd_buffers[j], struct radv_cmd_buffer, vk);
@@ -4655,6 +4758,7 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
needs.gds |= cmd_buffer->gds_needed;
needs.gds_oa |= cmd_buffer->gds_oa_needed;
needs.sample_positions |= cmd_buffer->sample_positions_needed;
*uses_perf_counters |= cmd_buffer->state.uses_perf_counters;
}
/* Sanitize scratch size information. */
@@ -4767,17 +4871,21 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
bool can_patch = true;
uint32_t advance;
VkResult result;
bool uses_perf_counters = false;
unsigned cmd_buffer_count = submission->command_buffer_count;
result = radv_update_preambles(&queue->state, queue->device, submission->command_buffers,
-submission->command_buffer_count);
+submission->command_buffer_count, &uses_perf_counters);
if (result != VK_SUCCESS)
return result;
if (queue->device->trace_bo)
simple_mtx_lock(&queue->device->trace_mtx);
-struct radeon_cmdbuf **cs_array =
-malloc(sizeof(struct radeon_cmdbuf *) * (submission->command_buffer_count));
+if (uses_perf_counters)
+cmd_buffer_count += 2;
+struct radeon_cmdbuf **cs_array = malloc(sizeof(struct radeon_cmdbuf *) * cmd_buffer_count);
if (!cs_array)
goto fail;
@@ -4785,13 +4893,25 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
struct radv_cmd_buffer *cmd_buffer = (struct radv_cmd_buffer *)submission->command_buffers[j];
assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
-cs_array[j] = cmd_buffer->cs;
+cs_array[j + (uses_perf_counters ? 1 : 0)] = cmd_buffer->cs;
if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
can_patch = false;
cmd_buffer->status = RADV_CMD_BUFFER_STATUS_PENDING;
}
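/* Slot 0 holds the lock CS and the last slot the unlock CS; the user
 * commandbuffers were stored shifted by one above. */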
if (uses_perf_counters) {
cs_array[0] =
radv_create_perf_counter_lock_cs(queue->device, submission->perf_pass_index, false);
cs_array[cmd_buffer_count - 1] =
radv_create_perf_counter_lock_cs(queue->device, submission->perf_pass_index, true);
can_patch = false;
if (!cs_array[0] || !cs_array[cmd_buffer_count - 1]) {
result = VK_ERROR_OUT_OF_HOST_MEMORY;
goto fail;
}
}
/* For fences on the same queue/vm amdgpu doesn't wait till all processing is finished
* before starting the next cmdbuffer, so we need to do it here. */
bool need_wait = submission->wait_count > 0;
@@ -4806,9 +4926,9 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
.continue_preamble_cs = queue->state.continue_preamble_cs,
};
-for (uint32_t j = 0; j < submission->command_buffer_count; j += advance) {
-advance = MIN2(max_cs_submission, submission->command_buffer_count - j);
-bool last_submit = j + advance == submission->command_buffer_count;
+for (uint32_t j = 0; j < cmd_buffer_count; j += advance) {
+advance = MIN2(max_cs_submission, cmd_buffer_count - j);
+bool last_submit = j + advance == cmd_buffer_count;
if (queue->device->trace_bo)
*queue->device->trace_id_ptr = 0;

@@ -918,6 +918,9 @@ struct radv_device {
* - the pass to use for profiling. (as an array of bools)
*/
struct radeon_winsys_bo *perf_counter_bo;
/* Interleaved lock/unlock commandbuffers for perfcounter passes. */
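/* Indexed as pass * 2 + (0 = lock, 1 = unlock); entries are created on
 * demand at submit time. */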
struct radeon_cmdbuf **perf_counter_lock_cs;
};
bool radv_device_acquire_performance_counters(struct radv_device *device);
@@ -1532,6 +1535,9 @@ struct radv_cmd_state {
/* Whether to suspend streamout for internal driver operations. */
bool suspend_streamout;
/* Whether this commandbuffer uses performance counters. */
bool uses_perf_counters;
};
struct radv_cmd_pool {