radv: Submit internal compute cmdbuf.
Use scheduled dependencies to create two submissions: first we submit ACE, then GFX.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16531>
This commit is contained in:
parent
eeb8366bbd
commit
5a1cbafd9d
|
@ -2874,10 +2874,33 @@ radv_queue_state_finish(struct radv_queue_state *queue, struct radeon_winsys *ws
|
|||
/* Tear down a queue: destroy the internal ACE (async compute) queue state
 * first, then the queue's own state and the common vk_queue object.
 */
static void
radv_queue_finish(struct radv_queue *queue)
{
   if (queue->ace_internal_state) {
      /* Prevent double free.
       * The task rings BO is shared with (and owned by) the parent queue
       * state, so clear the ACE state's reference before finishing it.
       */
      queue->ace_internal_state->task_rings_bo = NULL;

      /* Clean up the internal ACE queue state. */
      radv_queue_state_finish(queue->ace_internal_state, queue->device->ws);
      free(queue->ace_internal_state);
   }

   radv_queue_state_finish(&queue->state, queue->device->ws);
   vk_queue_finish(&queue->vk);
}
|
||||
|
||||
static bool
|
||||
radv_queue_init_ace_internal_state(struct radv_queue *queue)
|
||||
{
|
||||
if (queue->ace_internal_state)
|
||||
return true;
|
||||
|
||||
queue->ace_internal_state = calloc(1, sizeof(struct radv_queue_state));
|
||||
if (!queue->ace_internal_state)
|
||||
return false;
|
||||
|
||||
queue->ace_internal_state->qf = RADV_QUEUE_COMPUTE;
|
||||
return true;
|
||||
}
|
||||
|
||||
static VkResult
|
||||
radv_device_init_border_color(struct radv_device *device)
|
||||
{
|
||||
|
@ -4893,6 +4916,37 @@ radv_update_preambles(struct radv_queue_state *queue, struct radv_device *device
|
|||
return radv_update_preamble_cs(queue, device, &needs);
|
||||
}
|
||||
|
||||
/* Ensure the internal ACE queue state exists and (re)build its preamble
 * command streams so they match the requirements of the parent GFX queue.
 * Returns VK_ERROR_OUT_OF_HOST_MEMORY if the ACE state can't be allocated,
 * otherwise whatever radv_update_preamble_cs reports.
 */
static VkResult
radv_update_ace_preambles(struct radv_queue *queue)
{
   if (!radv_queue_init_ace_internal_state(queue))
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   /* Copy task rings state.
    * Task shaders that are submitted on the ACE queue need to share
    * their ring buffers with the mesh shaders on the GFX queue.
    */
   queue->ace_internal_state->ring_info.task_rings = queue->state.ring_info.task_rings;
   queue->ace_internal_state->task_rings_bo = queue->state.task_rings_bo;

   /* Copy some needed states from the parent queue state.
    * These can only increase so it's okay to copy them as-is without checking.
    * Note, task shaders use the scratch size from their graphics pipeline.
    */
   struct radv_queue_ring_info needs = queue->ace_internal_state->ring_info;
   needs.compute_scratch_size_per_wave = queue->state.ring_info.scratch_size_per_wave;
   needs.compute_scratch_waves = queue->state.ring_info.scratch_waves;
   needs.task_rings = queue->state.ring_info.task_rings;

   return radv_update_preamble_cs(queue->ace_internal_state, queue->device, &needs);
}
|
||||
|
||||
static bool
|
||||
radv_cmd_buffer_needs_ace(const struct radv_cmd_buffer *cmd_buffer)
|
||||
{
|
||||
return cmd_buffer->ace_internal.cs && cmd_buffer->task_rings_needed;
|
||||
}
|
||||
|
||||
struct radv_deferred_queue_submission {
|
||||
struct radv_queue *queue;
|
||||
VkCommandBuffer *cmd_buffers;
|
||||
|
@ -4964,12 +5018,104 @@ radv_queue_submit_empty(struct radv_queue *queue, struct vk_queue_submit *submis
|
|||
submission->signal_count, submission->signals, false);
|
||||
}
|
||||
|
||||
/* Submit a range of command buffers, pairing each chunk's GFX submission
 * with an ACE (async compute) submission holding the internal compute
 * cmdbufs of the command buffers in that chunk.
 *
 * cs_array/cs_count/cs_offset describe the GFX command streams and their
 * position within submission->command_buffers (the offset presumably
 * accounts for extra CSes prepended by the caller — confirm against
 * radv_queue_submit_normal).
 */
static VkResult
radv_queue_submit_with_ace(struct radv_queue *queue, struct vk_queue_submit *submission,
                           struct radeon_cmdbuf **cs_array, unsigned cs_count, unsigned cs_offset,
                           bool can_patch)
{
   /* Submits command buffers that may have an internal ACE cmdbuf
    * using scheduled dependencies. This guarantees that the GFX cmdbuf
    * is only scheduled after ACE.
    *
    * TODO: Unfortunately this is prone to a deadlock, so is considered a
    * temporary solution until gang submit is merged in the upstream kernel.
    */
   struct radeon_winsys_ctx *ctx = queue->hw_ctx;
   /* With a trace BO, submit one IB at a time so hangs can be attributed. */
   const uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT;
   const bool need_wait = submission->wait_count > 0;
   VkResult result = VK_SUCCESS;

   /* Scratch array collecting the ACE cmdbufs of the current chunk;
    * at most one per command buffer, so max_cs_submission entries suffice.
    */
   struct radeon_cmdbuf **ace_cs_array = calloc(max_cs_submission, sizeof(struct radeon_cmdbuf *));
   if (!ace_cs_array) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto finish;
   }

   result = radv_update_ace_preambles(queue);
   if (result != VK_SUCCESS)
      goto finish;

   /* submit[0] = ACE, submit[1] = GFX. The first chunk uses the full-flush
    * preamble when there are semaphore waits; later chunks switch to the
    * regular preamble (see end of loop).
    */
   struct radv_winsys_submit_info submit[2] = {
      {
         .ip_type = AMD_IP_COMPUTE,
         .cs_array = ace_cs_array,
         .cs_count = 0,
         .initial_preamble_cs = need_wait
                                   ? queue->ace_internal_state->initial_full_flush_preamble_cs
                                   : queue->ace_internal_state->initial_preamble_cs,
      },
      {
         .ip_type = radv_queue_ring(queue),
         .queue_index = queue->vk.index_in_family,
         .cs_array = cs_array,
         .cs_count = 0,
         .initial_preamble_cs = need_wait ? queue->state.initial_full_flush_preamble_cs
                                          : queue->state.initial_preamble_cs,
      }};

   /* Walk the command streams in chunks of at most max_cs_submission. */
   for (uint32_t advance, j = 0; j < cs_count; j += advance) {
      advance = MIN2(max_cs_submission, cs_count - j);
      bool last_submit = j + advance == cs_count;

      if (queue->device->trace_bo)
         *queue->device->trace_id_ptr = 0;

      /* Gather the internal ACE cmdbufs of this chunk's command buffers. */
      for (unsigned c = 0; c < advance; ++c) {
         const struct radv_cmd_buffer *cmd_buffer =
            (struct radv_cmd_buffer *)submission->command_buffers[j + c + cs_offset];
         if (!radv_cmd_buffer_needs_ace(cmd_buffer))
            continue;

         submit[0].cs_array[submit[0].cs_count++] = cmd_buffer->ace_internal.cs;
      }

      /* If no ACE work was gathered, skip submit[0] and only submit GFX. */
      const uint32_t submit_count = 1 + !!submit[0].cs_count;
      const struct radv_winsys_submit_info *submit_ptr = submit + !submit[0].cs_count;
      submit[1].cs_count = advance;

      /* Waits apply only to the first chunk, signals only to the last. */
      result = queue->device->ws->cs_submit(
         ctx, submit_count, submit_ptr, j == 0 ? submission->wait_count : 0, submission->waits,
         last_submit ? submission->signal_count : 0, submission->signals, can_patch);

      if (result != VK_SUCCESS)
         goto finish;

      if (queue->device->trace_bo) {
         radv_check_gpu_hangs(queue, cs_array[j]);
      }

      if (queue->device->tma_bo) {
         radv_check_trap_handler(queue);
      }

      /* Advance the GFX array past the chunk just submitted; reuse the ACE
       * scratch array from the start. After the first chunk, fall back to
       * the regular (non-flushing) preambles.
       */
      submit[1].cs_array += submit[1].cs_count;
      submit[1].initial_preamble_cs = queue->state.initial_preamble_cs;
      submit[0].cs_count = 0;
      submit[0].initial_preamble_cs = queue->ace_internal_state->initial_preamble_cs;
   }

finish:
   free(ace_cs_array);
   return result;
}
|
||||
|
||||
static VkResult
|
||||
radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submission)
|
||||
{
|
||||
struct radeon_winsys_ctx *ctx = queue->hw_ctx;
|
||||
uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT;
|
||||
bool can_patch = true;
|
||||
bool use_ace = false;
|
||||
uint32_t advance;
|
||||
VkResult result;
|
||||
bool uses_perf_counters = false;
|
||||
|
@ -4999,6 +5145,7 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
|
|||
can_patch = false;
|
||||
|
||||
cmd_buffer->status = RADV_CMD_BUFFER_STATUS_PENDING;
|
||||
use_ace |= radv_cmd_buffer_needs_ace(cmd_buffer);
|
||||
}
|
||||
|
||||
if (uses_perf_counters) {
|
||||
|
@ -5013,6 +5160,12 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi
|
|||
}
|
||||
}
|
||||
|
||||
if (use_ace) {
|
||||
result = radv_queue_submit_with_ace(queue, submission, cs_array, cmd_buffer_count, cs_offset,
|
||||
can_patch);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
/* For fences on the same queue/vm amdgpu doesn't wait till all processing is finished
|
||||
* before starting the next cmdbuffer, so we need to do it here. */
|
||||
bool need_wait = submission->wait_count > 0;
|
||||
|
|
|
@ -765,6 +765,7 @@ struct radv_queue {
|
|||
struct radeon_winsys_ctx *hw_ctx;
|
||||
enum radeon_ctx_priority priority;
|
||||
struct radv_queue_state state;
|
||||
struct radv_queue_state *ace_internal_state;
|
||||
};
|
||||
|
||||
#define RADV_BORDER_COLOR_COUNT 4096
|
||||
|
|
Loading…
Reference in New Issue