radv: Submit internal compute cmdbuf.

Use scheduled dependencies to create two submissions:
first we submit ACE, then GFX.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16531>
Timur Kristóf 2022-05-12 01:27:03 +02:00 committed by Marge Bot
parent eeb8366bbd
commit 5a1cbafd9d
2 changed files with 154 additions and 0 deletions
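To make the shape of the change easier to follow before reading the diff: below is a minimal sketch of the scheme from the commit message, distilled from the radv_queue_submit_with_ace() function added further down. It is not the exact code in the diff; preamble selection, IB chunking, wait/signal handling and error paths are omitted, and the helper name and the ace_cs_array/ace_cs_count parameters are placeholders for illustration.

/* Sketch only: submit the internal ACE (compute) cmdbufs and the GFX cmdbufs
 * together, ACE first, so the winsys can create a scheduled dependency and
 * GFX is only scheduled after ACE. */
static VkResult
sketch_submit_ace_then_gfx(struct radv_queue *queue,
                           struct radeon_cmdbuf **ace_cs_array, uint32_t ace_cs_count,
                           struct radeon_cmdbuf **cs_array, uint32_t cs_count)
{
   struct radv_winsys_submit_info submit[2] = {
      {.ip_type = AMD_IP_COMPUTE, .cs_array = ace_cs_array, .cs_count = ace_cs_count},
      {.ip_type = radv_queue_ring(queue),
       .queue_index = queue->vk.index_in_family,
       .cs_array = cs_array,
       .cs_count = cs_count},
   };

   /* When nothing in the batch needs ACE, skip entry 0 and submit GFX alone. */
   const uint32_t submit_count = 1 + !!submit[0].cs_count;
   const struct radv_winsys_submit_info *submit_ptr = submit + !submit[0].cs_count;

   return queue->device->ws->cs_submit(queue->hw_ctx, submit_count, submit_ptr,
                                       0, NULL, 0, NULL, false);
}

The submit_ptr/submit_count arithmetic is what lets the same path handle batches in which none of the command buffers actually need the internal ACE cmdbuf.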


@@ -2874,10 +2874,33 @@ radv_queue_state_finish(struct radv_queue_state *queue, struct radeon_winsys *ws
static void
radv_queue_finish(struct radv_queue *queue)
{
   if (queue->ace_internal_state) {
      /* Prevent double free */
      queue->ace_internal_state->task_rings_bo = NULL;

      /* Clean up the internal ACE queue state. */
      radv_queue_state_finish(queue->ace_internal_state, queue->device->ws);
      free(queue->ace_internal_state);
   }

   radv_queue_state_finish(&queue->state, queue->device->ws);
   vk_queue_finish(&queue->vk);
}

static bool
radv_queue_init_ace_internal_state(struct radv_queue *queue)
{
   if (queue->ace_internal_state)
      return true;

   queue->ace_internal_state = calloc(1, sizeof(struct radv_queue_state));
   if (!queue->ace_internal_state)
      return false;

   queue->ace_internal_state->qf = RADV_QUEUE_COMPUTE;
   return true;
}

static VkResult
radv_device_init_border_color(struct radv_device *device)
{
@@ -4893,6 +4916,37 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
   return radv_update_preamble_cs(queue, device, &needs);
}

static VkResult
radv_update_ace_preambles(struct radv_queue *queue)
{
   if (!radv_queue_init_ace_internal_state(queue))
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   /* Copy task rings state.
    * Task shaders that are submitted on the ACE queue need to share
    * their ring buffers with the mesh shaders on the GFX queue.
    */
   queue->ace_internal_state->ring_info.task_rings = queue->state.ring_info.task_rings;
   queue->ace_internal_state->task_rings_bo = queue->state.task_rings_bo;

   /* Copy some needed states from the parent queue state.
    * These can only increase so it's okay to copy them as-is without checking.
    * Note, task shaders use the scratch size from their graphics pipeline.
    */
   struct radv_queue_ring_info needs = queue->ace_internal_state->ring_info;
   needs.compute_scratch_size_per_wave = queue->state.ring_info.scratch_size_per_wave;
   needs.compute_scratch_waves = queue->state.ring_info.scratch_waves;
   needs.task_rings = queue->state.ring_info.task_rings;

   return radv_update_preamble_cs(queue->ace_internal_state, queue->device, &needs);
}

static bool
radv_cmd_buffer_needs_ace(const struct radv_cmd_buffer *cmd_buffer)
{
   return cmd_buffer->ace_internal.cs && cmd_buffer->task_rings_needed;
}

struct radv_deferred_queue_submission {
   struct radv_queue *queue;
   VkCommandBuffer *cmd_buffers;
@@ -4964,12 +5018,104 @@ radv_queue_submit_empty(struct radv_queue *queue, struct vk_queue_submit *submis
                                       submission->signal_count, submission->signals, false);
}

static VkResult
radv_queue_submit_with_ace(struct radv_queue *queue, struct vk_queue_submit *submission,
                           struct radeon_cmdbuf **cs_array, unsigned cs_count, unsigned cs_offset,
                           bool can_patch)
{
   /* Submits command buffers that may have an internal ACE cmdbuf
    * using scheduled dependencies. This guarantees that the GFX cmdbuf
    * is only scheduled after ACE.
    *
    * TODO: Unfortunately this is prone to a deadlock, so it is considered a
    * temporary solution until gang submit is merged in the upstream kernel.
    */
   struct radeon_winsys_ctx *ctx = queue->hw_ctx;
   const uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT;
   const bool need_wait = submission->wait_count > 0;
   VkResult result = VK_SUCCESS;

   struct radeon_cmdbuf **ace_cs_array = calloc(max_cs_submission, sizeof(struct radeon_cmdbuf *));
   if (!ace_cs_array) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto finish;
   }

   result = radv_update_ace_preambles(queue);
   if (result != VK_SUCCESS)
      goto finish;

   struct radv_winsys_submit_info submit[2] = {
      {
         .ip_type = AMD_IP_COMPUTE,
         .cs_array = ace_cs_array,
         .cs_count = 0,
         .initial_preamble_cs = need_wait
                                   ? queue->ace_internal_state->initial_full_flush_preamble_cs
                                   : queue->ace_internal_state->initial_preamble_cs,
      },
      {
         .ip_type = radv_queue_ring(queue),
         .queue_index = queue->vk.index_in_family,
         .cs_array = cs_array,
         .cs_count = 0,
         .initial_preamble_cs = need_wait ? queue->state.initial_full_flush_preamble_cs
                                          : queue->state.initial_preamble_cs,
      }};
   for (uint32_t advance, j = 0; j < cs_count; j += advance) {
      advance = MIN2(max_cs_submission, cs_count - j);
      bool last_submit = j + advance == cs_count;

      if (queue->device->trace_bo)
         *queue->device->trace_id_ptr = 0;

      for (unsigned c = 0; c < advance; ++c) {
         const struct radv_cmd_buffer *cmd_buffer =
            (struct radv_cmd_buffer *)submission->command_buffers[j + c + cs_offset];
         if (!radv_cmd_buffer_needs_ace(cmd_buffer))
            continue;

         submit[0].cs_array[submit[0].cs_count++] = cmd_buffer->ace_internal.cs;
      }

      const uint32_t submit_count = 1 + !!submit[0].cs_count;
      const struct radv_winsys_submit_info *submit_ptr = submit + !submit[0].cs_count;
      submit[1].cs_count = advance;

      result = queue->device->ws->cs_submit(
         ctx, submit_count, submit_ptr, j == 0 ? submission->wait_count : 0, submission->waits,
         last_submit ? submission->signal_count : 0, submission->signals, can_patch);

      if (result != VK_SUCCESS)
         goto finish;

      if (queue->device->trace_bo) {
         radv_check_gpu_hangs(queue, cs_array[j]);
      }

      if (queue->device->tma_bo) {
         radv_check_trap_handler(queue);
      }

      submit[1].cs_array += submit[1].cs_count;
      submit[1].initial_preamble_cs = queue->state.initial_preamble_cs;
      submit[0].cs_count = 0;
      submit[0].initial_preamble_cs = queue->ace_internal_state->initial_preamble_cs;
   }

finish:
   free(ace_cs_array);
   return result;
}

static VkResult
radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submission)
{
   struct radeon_winsys_ctx *ctx = queue->hw_ctx;
   uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT;
   bool can_patch = true;
   bool use_ace = false;
   uint32_t advance;
   VkResult result;
   bool uses_perf_counters = false;
@@ -4999,6 +5145,7 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
         can_patch = false;

      cmd_buffer->status = RADV_CMD_BUFFER_STATUS_PENDING;
      use_ace |= radv_cmd_buffer_needs_ace(cmd_buffer);
   }

   if (uses_perf_counters) {
@@ -5013,6 +5160,12 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
      }
   }

   if (use_ace) {
      result = radv_queue_submit_with_ace(queue, submission, cs_array, cmd_buffer_count, cs_offset,
                                          can_patch);
      goto fail;
   }

   /* For fences on the same queue/vm amdgpu doesn't wait till all processing is finished
    * before starting the next cmdbuffer, so we need to do it here. */
   bool need_wait = submission->wait_count > 0;


@@ -765,6 +765,7 @@ struct radv_queue {
   struct radeon_winsys_ctx *hw_ctx;
   enum radeon_ctx_priority priority;
   struct radv_queue_state state;
   struct radv_queue_state *ace_internal_state;
};

#define RADV_BORDER_COLOR_COUNT 4096