radv: Synchronization for task shaders.

Add a separate flush_bits field for tracking cache
flushes in the ACE internal cmdbuf.
In barriers and image transitions we add these flush bits to ACE.

Create a semaphore in the upload BO which makes it possible
for ACE to wait for GFX for the purpose of synchronization.
This is necessary when a barrier needs to block task shaders.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16531>
This commit is contained in:
Timur Kristóf 2022-02-08 00:28:44 +01:00 committed by Marge Bot
parent 2479b62869
commit 4c6f83006d
3 changed files with 184 additions and 3 deletions

View File

@ -521,6 +521,9 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
cmd_buffer->gds_needed = false;
cmd_buffer->gds_oa_needed = false;
cmd_buffer->sample_positions_needed = false;
cmd_buffer->ace_internal.sem.gfx2ace_value = 0;
cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = 0;
cmd_buffer->ace_internal.sem.va = 0;
if (cmd_buffer->upload.upload_bo)
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
@ -690,6 +693,105 @@ radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
}
/* Record a barrier on the internal ACE (task shader) cmdbuf that matches a
 * barrier on the main GFX cmdbuf.
 *
 * Compute-relevant cache-flush bits from the GFX cmdbuf state are mirrored
 * into ace_internal.flush_bits, and when the barrier must make task shaders
 * wait for prior GFX work, the GFX->ACE semaphore value is bumped so that a
 * wait can be emitted on ACE later (see radv_flush_gfx2ace_semaphore /
 * radv_wait_gfx2ace_semaphore).
 */
static void
radv_ace_internal_barrier(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask,
VkPipelineStageFlags2 dst_stage_mask)
{
/* Update flush bits from the main cmdbuf, except the stage flush. */
cmd_buffer->ace_internal.flush_bits |=
cmd_buffer->state.flush_bits & RADV_CMD_FLUSH_ALL_COMPUTE & ~RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
/* Add stage flush only when necessary: the source stages must be able to
 * include work that actually ran on ACE (task shaders / transfers) or be a
 * catch-all (bottom-of-pipe / all-commands).
 */
if (src_stage_mask &
(VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV | VK_PIPELINE_STAGE_2_TRANSFER_BIT |
VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
cmd_buffer->ace_internal.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
/* Block task shaders when we have to wait for CP DMA on the GFX cmdbuf.
 * Only relevant while a CP DMA transfer is actually in flight (dma_is_busy).
 */
if (src_stage_mask &
(VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT |
VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
dst_stage_mask |= cmd_buffer->state.dma_is_busy ? VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV : 0;
/* Increment the GFX/ACE semaphore when task shaders are blocked.
 * Note: dst_stage_mask may have gained the task-shader bit just above.
 */
if (dst_stage_mask &
(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV))
cmd_buffer->ace_internal.sem.gfx2ace_value++;
}
/* Emit all cache-flush bits accumulated for the internal ACE cmdbuf,
 * then clear the accumulator.
 */
static void
radv_ace_internal_cache_flush(struct radv_cmd_buffer *cmd_buffer)
{
   enum rgp_flush_bits sqtt_flush_bits = 0;

   si_cs_emit_cache_flush(cmd_buffer->ace_internal.cs,
                          cmd_buffer->device->physical_device->rad_info.gfx_level, NULL, 0,
                          true, cmd_buffer->ace_internal.flush_bits, &sqtt_flush_bits, 0);

   /* Everything pending has now been emitted on ACE. */
   cmd_buffer->ace_internal.flush_bits = 0;
}
/* Allocate the GFX<->ACE semaphore pair in the cmd buffer's upload BO.
 *
 * DWORD 0: GFX->ACE semaphore (GFX blocks ACE, ie. ACE waits for GFX)
 * DWORD 1: ACE->GFX semaphore
 *
 * Returns the semaphore VA, or 0 on allocation failure (in which case
 * record_result is set to VK_ERROR_OUT_OF_HOST_MEMORY).
 */
static uint64_t
radv_ace_internal_sem_create(struct radv_cmd_buffer *cmd_buffer)
{
   uint64_t zero = 0;
   uint32_t offset = 0;

   /* Both semaphore DWORDs start out cleared. */
   if (!radv_cmd_buffer_upload_data(cmd_buffer, sizeof(zero), &zero, &offset)) {
      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
      return 0;
   }

   const uint64_t bo_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
   return bo_va + offset;
}
static bool
radv_ace_internal_sem_dirty(const struct radv_cmd_buffer *cmd_buffer)
{
return cmd_buffer->ace_internal.sem.gfx2ace_value !=
cmd_buffer->ace_internal.sem.emitted_gfx2ace_value;
}
/* Flush the GFX->ACE semaphore: make GFX write the current semaphore value
 * at bottom-of-pipe so that ACE can later wait for it.
 *
 * Lazily creates the semaphore in the upload BO on first use.
 * Returns true if a write was emitted (i.e. ACE still needs to wait),
 * false if the semaphore is already up to date or could not be created.
 */
ALWAYS_INLINE static bool
radv_flush_gfx2ace_semaphore(struct radv_cmd_buffer *cmd_buffer)
{
if (!radv_ace_internal_sem_dirty(cmd_buffer))
return false;
if (!cmd_buffer->ace_internal.sem.va) {
cmd_buffer->ace_internal.sem.va = radv_ace_internal_sem_create(cmd_buffer);
if (!cmd_buffer->ace_internal.sem.va)
return false;
}
/* GFX writes a value to the semaphore which ACE can wait for. */
si_cs_emit_write_event_eop(
cmd_buffer->cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
EOP_DATA_SEL_VALUE_32BIT, cmd_buffer->ace_internal.sem.va,
cmd_buffer->ace_internal.sem.gfx2ace_value, cmd_buffer->gfx9_eop_bug_va);
/* Remember what we emitted so the semaphore isn't flushed again needlessly. */
cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = cmd_buffer->ace_internal.sem.gfx2ace_value;
return true;
}
/* Emit a wait on the internal ACE cmdbuf until GFX has written the current
 * GFX->ACE semaphore value, blocking task shaders behind prior GFX work.
 * The semaphore must already exist (see radv_flush_gfx2ace_semaphore).
 */
ALWAYS_INLINE static void
radv_wait_gfx2ace_semaphore(struct radv_cmd_buffer *cmd_buffer)
{
   struct radeon_cmdbuf *cs = cmd_buffer->ace_internal.cs;

   assert(cmd_buffer->ace_internal.sem.va);
   radeon_check_space(cmd_buffer->device->ws, cs, 7);

   /* ACE waits until the semaphore reaches the value GFX wrote. */
   radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL, cmd_buffer->ace_internal.sem.va,
                    cmd_buffer->ace_internal.sem.gfx2ace_value, 0xffffffff);
}
static struct radeon_cmdbuf *
radv_ace_internal_create(struct radv_cmd_buffer *cmd_buffer)
{
@ -711,6 +813,33 @@ radv_ace_internal_finalize(struct radv_cmd_buffer *cmd_buffer)
struct radv_device *device = cmd_buffer->device;
struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
/* Emit pending cache flush. */
radv_ace_internal_cache_flush(cmd_buffer);
/* Clear the ACE semaphore if it exists.
* This is necessary in case the same cmd buffer is submitted again in the future.
*/
if (cmd_buffer->ace_internal.sem.va) {
struct radeon_cmdbuf *main_cs = cmd_buffer->cs;
uint64_t gfx2ace_va = cmd_buffer->ace_internal.sem.va;
uint64_t ace2gfx_va = cmd_buffer->ace_internal.sem.va + 4;
/* ACE: write 1 to the ACE->GFX semaphore. */
si_cs_emit_write_event_eop(ace_cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
true, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
EOP_DATA_SEL_VALUE_32BIT, ace2gfx_va, 1,
cmd_buffer->gfx9_eop_bug_va);
/* Wait for ACE to finish, otherwise we may risk writing 0 to the semaphore
* when ACE is still waiting for it. This may not happen in practice, but
* better safe than sorry.
*/
radv_cp_wait_mem(main_cs, WAIT_REG_MEM_GREATER_OR_EQUAL, ace2gfx_va, 1, 0xffffffff);
/* GFX: clear GFX->ACE and ACE->GFX semaphores. */
radv_emit_clear_data(cmd_buffer, V_370_ME, gfx2ace_va, 8);
}
return device->ws->cs_finalize(ace_cs);
}
@ -734,6 +863,14 @@ radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flu
&cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va,
radv_cmd_buffer_uses_mec(cmd_buffer), flags, &sqtt_flush_bits,
cmd_buffer->gfx9_eop_bug_va);
if (cmd_buffer->state.graphics_pipeline && (flags & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) &&
radv_pipeline_has_stage(cmd_buffer->state.graphics_pipeline, MESA_SHADER_TASK)) {
/* Force wait for compute engines to be idle on the internal cmdbuf. */
si_cs_emit_cache_flush(cmd_buffer->ace_internal.cs,
cmd_buffer->device->physical_device->rad_info.gfx_level, NULL, 0,
true, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, &sqtt_flush_bits, 0);
}
}
if (unlikely(cmd_buffer->device->trace_bo))
@ -4092,6 +4229,12 @@ radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_d
static void
radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask)
{
/* For simplicity, if the barrier wants to wait for the task shader,
* just make it wait for the mesh shader too.
*/
if (src_stage_mask & VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV)
src_stage_mask |= VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV;
if (src_stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT |
VK_PIPELINE_STAGE_2_RESOLVE_BIT |
VK_PIPELINE_STAGE_2_BLIT_BIT |
@ -4384,6 +4527,8 @@ radv_emit_subpass_barrier(struct radv_cmd_buffer *cmd_buffer,
cmd_buffer->state.flush_bits |=
radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, iview->image);
}
radv_ace_internal_barrier(cmd_buffer, barrier->src_stage_mask, barrier->dst_stage_mask);
}
uint32_t
@ -6200,6 +6345,7 @@ radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpa
radv_handle_subpass_image_transition(cmd_buffer, subpass->attachments[i], true);
}
radv_ace_internal_barrier(cmd_buffer, 0, 0);
radv_describe_barrier_end(cmd_buffer);
radv_cmd_buffer_clear_subpass(cmd_buffer);
@ -6318,6 +6464,7 @@ radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
radv_handle_subpass_image_transition(cmd_buffer, att, false);
}
radv_ace_internal_barrier(cmd_buffer, 0, 0);
radv_describe_barrier_end(cmd_buffer);
}
@ -7500,6 +7647,7 @@ radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_
if (!info->count || !gfx_result)
return false;
const bool need_task_semaphore = radv_flush_gfx2ace_semaphore(cmd_buffer);
struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
struct radeon_winsys *ws = cmd_buffer->device->ws;
@ -7508,11 +7656,16 @@ radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_
ASSERTED const unsigned ace_cdw_max =
radeon_check_space(ws, ace_cs, 4096 + 128 * (drawCount - 1));
if (need_task_semaphore)
radv_wait_gfx2ace_semaphore(cmd_buffer);
if (pipeline_is_dirty) {
radv_pipeline_emit_hw_cs(pdevice, ace_cs, task_shader);
radv_pipeline_emit_compute_state(pdevice, ace_cs, task_shader);
}
radv_ace_internal_cache_flush(cmd_buffer);
/* Restore dirty state of descriptors
* They were marked non-dirty in radv_before_draw,
* but they need to be re-emitted now to the ACE cmdbuf.
@ -9384,6 +9537,8 @@ radv_barrier(struct radv_cmd_buffer *cmd_buffer, const VkDependencyInfo *dep_inf
radv_stage_flush(cmd_buffer, src_stage_mask);
cmd_buffer->state.flush_bits |= src_flush_bits;
radv_ace_internal_barrier(cmd_buffer, src_stage_mask, 0);
for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);
@ -9410,6 +9565,7 @@ radv_barrier(struct radv_cmd_buffer *cmd_buffer, const VkDependencyInfo *dep_inf
&dep_info->pImageMemoryBarriers[i].subresourceRange, sample_locs_info ? &sample_locations : NULL);
}
radv_ace_internal_barrier(cmd_buffer, 0, dst_stage_mask);
radv_cp_dma_wait_for_stages(cmd_buffer, src_stage_mask);
cmd_buffer->state.flush_bits |= dst_flush_bits;

View File

@ -60,13 +60,17 @@ radv_render_pass_add_subpass_dep(struct radv_render_pass *pass, const VkSubpassD
VkAccessFlags2 dst_access_mask = barrier ? barrier->dstAccessMask : dep->dstAccessMask;
if (dst == VK_SUBPASS_EXTERNAL) {
if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)
if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT) {
pass->end_barrier.src_stage_mask |= src_stage_mask;
pass->end_barrier.dst_stage_mask |= dst_stage_mask;
}
pass->end_barrier.src_access_mask |= src_access_mask;
pass->end_barrier.dst_access_mask |= dst_access_mask;
} else {
if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)
if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT) {
pass->subpasses[dst].start_barrier.src_stage_mask |= src_stage_mask;
pass->subpasses[dst].start_barrier.dst_stage_mask |= dst_stage_mask;
}
pass->subpasses[dst].start_barrier.src_access_mask |= src_access_mask;
pass->subpasses[dst].start_barrier.dst_access_mask |= dst_access_mask;
}

View File

@ -1186,7 +1186,11 @@ enum radv_cmd_flush_bits {
RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER =
(RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |
RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META)
RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META),
RADV_CMD_FLUSH_ALL_COMPUTE =
(RADV_CMD_FLAG_INV_ICACHE | RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE |
RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_WB_L2 | RADV_CMD_FLAG_CS_PARTIAL_FLUSH),
};
enum radv_nggc_settings {
@ -1644,6 +1648,22 @@ struct radv_cmd_buffer {
* also requires a submission to the compute queue.
*/
struct radeon_cmdbuf *cs;
/** Flush bits for the internal cmdbuf. */
enum radv_cmd_flush_bits flush_bits;
/**
* For synchronization between the ACE and GFX cmdbuf.
* The value of this semaphore is incremented whenever we
* encounter a barrier that affects ACE. At sync points,
* GFX writes the value to its address, and ACE waits until
* it detects that the value has been written.
*/
struct {
uint64_t va; /* Virtual address of the semaphore. */
uint32_t gfx2ace_value; /* Current value on GFX. */
uint32_t emitted_gfx2ace_value; /* Emitted value on GFX. */
} sem;
} ace_internal;
/**
@ -2782,6 +2802,7 @@ struct radv_sampler {
/* Execution + memory dependency accumulated for a subpass boundary. */
struct radv_subpass_barrier {
VkPipelineStageFlags2 src_stage_mask; /* Stages that must complete before the barrier. */
VkPipelineStageFlags2 dst_stage_mask; /* Stages that must wait at the barrier. */
VkAccessFlags2 src_access_mask; /* Writes to make available. */
VkAccessFlags2 dst_access_mask; /* Accesses to make visible. */
};