radv: add support for exporting pipelines with RGP

This is still experimental and is only enabled when the
RADV_THREAD_TRACE_PIPELINE environment variable is set, to avoid
breaking existing support.
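
For example, running an application with both RADV_THREAD_TRACE=100 and
RADV_THREAD_TRACE_PIPELINE=1 set should capture frame 100 with the new
pipeline/code-object records included (assuming RADV_THREAD_TRACE selects
the capture frame as in the existing SQTT support; the new variable only
needs to be present, any value works).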

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9130>

@@ -22,6 +22,7 @@
*/
#include "radv_private.h"
#include "radv_shader.h"
#include "ac_rgp.h"
#include "ac_sqtt.h"
@@ -723,6 +724,12 @@ void sqtt_CmdCopyQueryPoolResults(
   radv_Cmd##cmd_name(__VA_ARGS__); \
   radv_write_end_general_api_marker(cmd_buffer, ApiCmd##cmd_name);

static bool
radv_sqtt_dump_pipeline(void)
{
   /* Only the variable's presence matters; any value enables dumping. */
   return getenv("RADV_THREAD_TRACE_PIPELINE");
}

void sqtt_CmdBindPipeline(
   VkCommandBuffer                             commandBuffer,
   VkPipelineBindPoint                         pipelineBindPoint,
@@ -957,4 +964,339 @@ VkResult sqtt_DebugMarkerSetObjectTagEXT(
   return VK_SUCCESS;
}

/* Pipelines */
static enum rgp_hardware_stages
radv_mesa_to_rgp_shader_stage(struct radv_pipeline *pipeline,
                              gl_shader_stage stage)
{
   struct radv_shader_variant *shader = pipeline->shaders[stage];

   switch (stage) {
   case MESA_SHADER_VERTEX:
      if (shader->info.vs.as_ls)
         return RGP_HW_STAGE_LS;
      else if (shader->info.vs.as_es)
         return RGP_HW_STAGE_ES;
      else if (shader->info.is_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case MESA_SHADER_TESS_CTRL:
      return RGP_HW_STAGE_HS;
   case MESA_SHADER_TESS_EVAL:
      if (shader->info.tes.as_es)
         return RGP_HW_STAGE_ES;
      else if (shader->info.is_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case MESA_SHADER_GEOMETRY:
      return RGP_HW_STAGE_GS;
   case MESA_SHADER_FRAGMENT:
      return RGP_HW_STAGE_PS;
   case MESA_SHADER_COMPUTE:
      return RGP_HW_STAGE_CS;
   default:
      unreachable("invalid mesa shader stage");
   }
}
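
The mapping reflects how the hardware actually schedules API stages: on
GFX9+ the vertex stage can run merged into the hardware HS stage (as_ls)
or the hardware GS stage (as_es), tessellation evaluation behaves like a
second vertex stage, and with NGG the final pre-rasterization stage runs
on the hardware GS stage, which is why is_ngg maps to RGP_HW_STAGE_GS.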

static VkResult
radv_add_pso_correlation(struct radv_device *device,
                         struct radv_pipeline *pipeline)
{
   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
   struct rgp_pso_correlation *pso_correlation = &thread_trace_data->rgp_pso_correlation;
   struct rgp_pso_correlation_record *record;

   record = malloc(sizeof(struct rgp_pso_correlation_record));
   if (!record)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   record->api_pso_hash = 0;
   record->pipeline_hash[0] = pipeline->pipeline_hash;
   record->pipeline_hash[1] = pipeline->pipeline_hash;
   memset(record->api_level_obj_name, 0, sizeof(record->api_level_obj_name));

   simple_mtx_lock(&pso_correlation->lock);
   list_addtail(&record->list, &pso_correlation->record);
   pso_correlation->record_count++;
   simple_mtx_unlock(&pso_correlation->lock);

   return VK_SUCCESS;
}
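
For reference, this is roughly the record layout implied by the stores
above; it is a sketch, the authoritative definition lives in ac_rgp.h,
and the name-buffer size is an assumption:

   struct rgp_pso_correlation_record {
      uint64_t api_pso_hash;        /* 0: no API-level PSO hash is tracked yet */
      uint64_t pipeline_hash[2];    /* internal pipeline hash, duplicated */
      char api_level_obj_name[64];  /* zeroed: no debug name attached; size assumed */
      struct list_head list;        /* linkage into rgp_pso_correlation.record */
   };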

static VkResult
radv_add_code_object_loader_event(struct radv_device *device,
                                  struct radv_pipeline *pipeline)
{
   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
   struct rgp_loader_events *loader_events = &thread_trace_data->rgp_loader_events;
   struct rgp_loader_events_record *record;
   uint64_t base_va = ~0;

   record = malloc(sizeof(struct rgp_loader_events_record));
   if (!record)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   /* Find the lowest shader BO VA. */
   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
      struct radv_shader_variant *shader = pipeline->shaders[i];
      uint64_t va;

      if (!shader)
         continue;

      va = radv_buffer_get_va(shader->bo) + shader->bo_offset;
      base_va = MIN2(base_va, va);
   }

   record->loader_event_type = RGP_LOAD_TO_GPU_MEMORY;
   record->reserved = 0;
   record->base_address = base_va & 0xffffffffffff;
   record->code_object_hash[0] = pipeline->pipeline_hash;
   record->code_object_hash[1] = pipeline->pipeline_hash;
   record->time_stamp = os_time_get_nano();

   simple_mtx_lock(&loader_events->lock);
   list_addtail(&record->list, &loader_events->record);
   loader_events->record_count++;
   simple_mtx_unlock(&loader_events->lock);

   return VK_SUCCESS;
}
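
The base address is masked to its low 48 bits because GPU virtual
addresses on these parts fit in 48 bits, and the CPU timestamp from
os_time_get_nano() lets the tool order load events against the trace.
The record layout inferred from the stores (again a sketch, with
ac_rgp.h authoritative and the field types assumed):

   struct rgp_loader_events_record {
      uint32_t loader_event_type;   /* RGP_LOAD_TO_GPU_MEMORY */
      uint32_t reserved;
      uint64_t base_address;        /* low 48 bits of the lowest shader VA */
      uint64_t code_object_hash[2]; /* matches the pipeline hash above */
      uint64_t time_stamp;          /* CPU time in nanoseconds */
      struct list_head list;        /* linkage into rgp_loader_events.record */
   };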

static VkResult
radv_add_code_object(struct radv_device *device,
                     struct radv_pipeline *pipeline)
{
   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
   struct rgp_code_object *code_object = &thread_trace_data->rgp_code_object;
   struct rgp_code_object_record *record;

   record = malloc(sizeof(struct rgp_code_object_record));
   if (!record)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   record->shader_stages_mask = 0;
   record->num_shaders_combined = 0;
   record->pipeline_hash[0] = pipeline->pipeline_hash;
   record->pipeline_hash[1] = pipeline->pipeline_hash;

   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
      struct radv_shader_variant *shader = pipeline->shaders[i];
      uint8_t *code;
      uint64_t va;

      if (!shader)
         continue;

      code = malloc(shader->code_size);
      if (!code) {
         /* Free the code copies already made for earlier stages. */
         uint32_t mask = record->shader_stages_mask;
         while (mask)
            free(record->shader_data[u_bit_scan(&mask)].code);
         free(record);
         return VK_ERROR_OUT_OF_HOST_MEMORY;
      }
      memcpy(code, shader->code_ptr, shader->code_size);

      va = radv_buffer_get_va(shader->bo) + shader->bo_offset;

      record->shader_data[i].hash[0] = (uint64_t)(uintptr_t)shader;
      record->shader_data[i].hash[1] = (uint64_t)(uintptr_t)shader >> 32;
      record->shader_data[i].code_size = shader->code_size;
      record->shader_data[i].code = code;
      record->shader_data[i].vgpr_count = shader->config.num_vgprs;
      record->shader_data[i].sgpr_count = shader->config.num_sgprs;
      record->shader_data[i].base_address = va & 0xffffffffffff;
      record->shader_data[i].elf_symbol_offset = 0;
      record->shader_data[i].hw_stage = radv_mesa_to_rgp_shader_stage(pipeline, i);
      record->shader_data[i].is_combined = false;

      record->shader_stages_mask |= (1 << i);
      record->num_shaders_combined++;
   }

   simple_mtx_lock(&code_object->lock);
   list_addtail(&record->list, &code_object->record);
   code_object->record_count++;
   simple_mtx_unlock(&code_object->lock);

   return VK_SUCCESS;
}
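
Note that the per-shader hash is derived from the shader variant's CPU
pointer rather than from the code contents, so it is only unique within
the lifetime of the process; that is sufficient for correlating shaders
inside a single capture.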

static VkResult
radv_register_pipeline(struct radv_device *device,
                       struct radv_pipeline *pipeline)
{
   VkResult result;

   result = radv_add_pso_correlation(device, pipeline);
   if (result != VK_SUCCESS)
      return result;

   result = radv_add_code_object_loader_event(device, pipeline);
   if (result != VK_SUCCESS)
      return result;

   result = radv_add_code_object(device, pipeline);
   if (result != VK_SUCCESS)
      return result;

   return VK_SUCCESS;
}

static void
radv_unregister_pipeline(struct radv_device *device,
                         struct radv_pipeline *pipeline)
{
   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
   struct rgp_pso_correlation *pso_correlation = &thread_trace_data->rgp_pso_correlation;
   struct rgp_loader_events *loader_events = &thread_trace_data->rgp_loader_events;
   struct rgp_code_object *code_object = &thread_trace_data->rgp_code_object;

   /* Destroy the PSO correlation record. */
   simple_mtx_lock(&pso_correlation->lock);
   list_for_each_entry_safe(struct rgp_pso_correlation_record, record,
                            &pso_correlation->record, list) {
      if (record->pipeline_hash[0] == pipeline->pipeline_hash) {
         pso_correlation->record_count--;
         list_del(&record->list);
         free(record);
         break;
      }
   }
   simple_mtx_unlock(&pso_correlation->lock);

   /* Destroy the code object loader record. */
   simple_mtx_lock(&loader_events->lock);
   list_for_each_entry_safe(struct rgp_loader_events_record, record,
                            &loader_events->record, list) {
      if (record->code_object_hash[0] == pipeline->pipeline_hash) {
         loader_events->record_count--;
         list_del(&record->list);
         free(record);
         break;
      }
   }
   simple_mtx_unlock(&loader_events->lock);

   /* Destroy the code object record. */
   simple_mtx_lock(&code_object->lock);
   list_for_each_entry_safe(struct rgp_code_object_record, record,
                            &code_object->record, list) {
      if (record->pipeline_hash[0] == pipeline->pipeline_hash) {
         uint32_t mask = record->shader_stages_mask;

         /* Free the disassembly. */
         while (mask) {
            int i = u_bit_scan(&mask);
            free(record->shader_data[i].code);
         }

         code_object->record_count--;
         list_del(&record->list);
         free(record);
         break;
      }
   }
   simple_mtx_unlock(&code_object->lock);
}
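
The _safe iterator variants are required because the matching node is
unlinked and freed while walking the list, and each loop can stop at the
first hit since a pipeline registers exactly one record per list.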

VkResult sqtt_CreateGraphicsPipelines(
   VkDevice                                    _device,
   VkPipelineCache                             pipelineCache,
   uint32_t                                    count,
   const VkGraphicsPipelineCreateInfo*         pCreateInfos,
   const VkAllocationCallbacks*                pAllocator,
   VkPipeline*                                 pPipelines)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   VkResult result;

   result = radv_CreateGraphicsPipelines(_device, pipelineCache, count,
                                         pCreateInfos, pAllocator,
                                         pPipelines);
   if (result != VK_SUCCESS)
      return result;

   if (radv_sqtt_dump_pipeline()) {
      for (unsigned i = 0; i < count; i++) {
         RADV_FROM_HANDLE(radv_pipeline, pipeline, pPipelines[i]);

         if (!pipeline)
            continue;

         result = radv_register_pipeline(device, pipeline);
         if (result != VK_SUCCESS)
            goto fail;
      }
   }

   return VK_SUCCESS;

fail:
   for (unsigned i = 0; i < count; i++) {
      sqtt_DestroyPipeline(_device, pPipelines[i], pAllocator);
      pPipelines[i] = VK_NULL_HANDLE;
   }
   return result;
}

VkResult sqtt_CreateComputePipelines(
   VkDevice                                    _device,
   VkPipelineCache                             pipelineCache,
   uint32_t                                    count,
   const VkComputePipelineCreateInfo*          pCreateInfos,
   const VkAllocationCallbacks*                pAllocator,
   VkPipeline*                                 pPipelines)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   VkResult result;

   result = radv_CreateComputePipelines(_device, pipelineCache, count,
                                        pCreateInfos, pAllocator,
                                        pPipelines);
   if (result != VK_SUCCESS)
      return result;

   if (radv_sqtt_dump_pipeline()) {
      for (unsigned i = 0; i < count; i++) {
         RADV_FROM_HANDLE(radv_pipeline, pipeline, pPipelines[i]);

         if (!pipeline)
            continue;

         result = radv_register_pipeline(device, pipeline);
         if (result != VK_SUCCESS)
            goto fail;
      }
   }

   return VK_SUCCESS;

fail:
   for (unsigned i = 0; i < count; i++) {
      sqtt_DestroyPipeline(_device, pPipelines[i], pAllocator);
      pPipelines[i] = VK_NULL_HANDLE;
   }
   return result;
}

void sqtt_DestroyPipeline(
   VkDevice                                    _device,
   VkPipeline                                  _pipeline,
   const VkAllocationCallbacks*                pAllocator)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);

   if (!_pipeline)
      return;

   if (radv_sqtt_dump_pipeline())
      radv_unregister_pipeline(device, pipeline);

   radv_DestroyPipeline(_device, _pipeline, pAllocator);
}

#undef API_MARKER
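
Together these entrypoints form the pipeline part of the SQTT
interception layer: each wrapper forwards to the real radv_* entrypoint
and, when RADV_THREAD_TRACE_PIPELINE is set, registers or unregisters
the RGP records so a capture can correlate work on the GPU with the
shader code that produced it.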


@@ -432,6 +432,8 @@ radv_thread_trace_init_bo(struct radv_device *device)
bool
radv_thread_trace_init(struct radv_device *device)
{
   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;

   /* Default buffer size set to 1MB per SE. */
   device->thread_trace.buffer_size =
      radv_get_int_debug_option("RADV_THREAD_TRACE_BUFFER_SIZE", 1024 * 1024);
@@ -444,12 +446,22 @@ radv_thread_trace_init(struct radv_device *device)
   if (!radv_thread_trace_init_bo(device))
      return false;

   list_inithead(&thread_trace_data->rgp_pso_correlation.record);
   simple_mtx_init(&thread_trace_data->rgp_pso_correlation.lock, mtx_plain);

   list_inithead(&thread_trace_data->rgp_loader_events.record);
   simple_mtx_init(&thread_trace_data->rgp_loader_events.lock, mtx_plain);

   list_inithead(&thread_trace_data->rgp_code_object.record);
   simple_mtx_init(&thread_trace_data->rgp_code_object.lock, mtx_plain);

   return true;
}
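
Initializing the record lists and their locks at device creation means
pipeline registration can safely run from any thread for the device's
whole lifetime.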

void
radv_thread_trace_finish(struct radv_device *device)
{
   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
   struct radeon_winsys *ws = device->ws;

   if (unlikely(device->thread_trace.bo))
@@ -461,6 +473,15 @@ radv_thread_trace_finish(struct radv_device *device)
      if (device->thread_trace.stop_cs[i])
         ws->cs_destroy(device->thread_trace.stop_cs[i]);
   }

   assert(thread_trace_data->rgp_pso_correlation.record_count == 0);
   simple_mtx_destroy(&thread_trace_data->rgp_pso_correlation.lock);

   assert(thread_trace_data->rgp_loader_events.record_count == 0);
   simple_mtx_destroy(&thread_trace_data->rgp_loader_events.lock);

   assert(thread_trace_data->rgp_code_object.record_count == 0);
   simple_mtx_destroy(&thread_trace_data->rgp_code_object.lock);
}
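
The asserts verify that every registered pipeline was unregistered again
before the device is destroyed; a non-zero count at this point would
indicate leaked pipeline records.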
static bool