radv: Synchronization for task shaders.
Add a separate flush_bits field for tracking cache flushes in the ACE internal cmdbuf. In barriers and image transitions, we add these flush bits to ACE. Create a semaphore in the upload BO that makes it possible for ACE to wait for GFX, for the purpose of synchronization. This is necessary when a barrier needs to block task shaders. Signed-off-by: Timur Kristóf <timur.kristof@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16531>
This commit is contained in:
parent
2479b62869
commit
4c6f83006d
|
@ -521,6 +521,9 @@ radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
|
|||
cmd_buffer->gds_needed = false;
|
||||
cmd_buffer->gds_oa_needed = false;
|
||||
cmd_buffer->sample_positions_needed = false;
|
||||
cmd_buffer->ace_internal.sem.gfx2ace_value = 0;
|
||||
cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = 0;
|
||||
cmd_buffer->ace_internal.sem.va = 0;
|
||||
|
||||
if (cmd_buffer->upload.upload_bo)
|
||||
radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
|
||||
|
@ -690,6 +693,105 @@ radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
|
|||
radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
|
||||
}
|
||||
|
||||
/* Propagate a barrier's effects to the internal ACE (async compute) cmdbuf.
 *
 * Called from barriers and subpass transitions on the GFX cmdbuf. It mirrors
 * the relevant compute cache flushes into ace_internal.flush_bits (the flushes
 * are emitted later, by radv_ace_internal_cache_flush) and bumps the GFX->ACE
 * semaphore value when the barrier must block task shaders running on ACE.
 *
 * src_stage_mask / dst_stage_mask: the barrier's source and destination
 * pipeline stages; dst_stage_mask is taken by value and only modified locally.
 */
static void
radv_ace_internal_barrier(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask,
                          VkPipelineStageFlags2 dst_stage_mask)
{
   /* Update flush bits from the main cmdbuf, except the stage flush.
    * Only compute-relevant flushes apply to ACE; CS_PARTIAL_FLUSH is masked
    * out here because a stage flush is only needed for some source stages
    * (handled just below).
    */
   cmd_buffer->ace_internal.flush_bits |=
      cmd_buffer->state.flush_bits & RADV_CMD_FLUSH_ALL_COMPUTE & ~RADV_CMD_FLAG_CS_PARTIAL_FLUSH;

   /* Add stage flush only when necessary: when the barrier waits on stages
    * that include work which could be executing on the ACE queue.
    */
   if (src_stage_mask &
       (VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV | VK_PIPELINE_STAGE_2_TRANSFER_BIT |
        VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
      cmd_buffer->ace_internal.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;

   /* Block task shaders when we have to wait for CP DMA on the GFX cmdbuf.
    * CP DMA is only tracked on GFX (state.dma_is_busy), so if a transfer-like
    * source stage is still busy there, make the barrier also block the task
    * shader stage so ACE waits for GFX.
    */
   if (src_stage_mask &
       (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT |
        VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
      dst_stage_mask |= cmd_buffer->state.dma_is_busy ? VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV : 0;

   /* Increment the GFX/ACE semaphore when task shaders are blocked.
    * The new value is written by GFX and waited on by ACE before the next
    * task draw (see radv_flush_gfx2ace_semaphore / radv_wait_gfx2ace_semaphore).
    */
   if (dst_stage_mask &
       (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
        VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV))
      cmd_buffer->ace_internal.sem.gfx2ace_value++;
}
|
||||
|
||||
static void
|
||||
radv_ace_internal_cache_flush(struct radv_cmd_buffer *cmd_buffer)
|
||||
{
|
||||
struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
|
||||
const uint32_t flush_bits = cmd_buffer->ace_internal.flush_bits;
|
||||
enum rgp_flush_bits sqtt_flush_bits = 0;
|
||||
|
||||
si_cs_emit_cache_flush(ace_cs, cmd_buffer->device->physical_device->rad_info.gfx_level, NULL, 0,
|
||||
true, flush_bits, &sqtt_flush_bits, 0);
|
||||
|
||||
cmd_buffer->ace_internal.flush_bits = 0;
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
radv_ace_internal_sem_create(struct radv_cmd_buffer *cmd_buffer)
|
||||
{
|
||||
/* DWORD 0: GFX->ACE semaphore (GFX blocks ACE, ie. ACE waits for GFX)
|
||||
* DWORD 1: ACE->GFX semaphore
|
||||
*/
|
||||
uint64_t sem_init = 0;
|
||||
uint32_t va_off = 0;
|
||||
if (!radv_cmd_buffer_upload_data(cmd_buffer, sizeof(uint64_t), &sem_init, &va_off)) {
|
||||
cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
|
||||
return 0;
|
||||
}
|
||||
|
||||
return radv_buffer_get_va(cmd_buffer->upload.upload_bo) + va_off;
|
||||
}
|
||||
|
||||
static bool
|
||||
radv_ace_internal_sem_dirty(const struct radv_cmd_buffer *cmd_buffer)
|
||||
{
|
||||
return cmd_buffer->ace_internal.sem.gfx2ace_value !=
|
||||
cmd_buffer->ace_internal.sem.emitted_gfx2ace_value;
|
||||
}
|
||||
|
||||
/* If the GFX->ACE semaphore is dirty, emit a GFX-side write of its current
 * value so that ACE can wait on it.
 *
 * Lazily creates the semaphore in the upload BO on first use. Returns true
 * when a semaphore write was emitted (the caller should then make ACE wait,
 * see radv_wait_gfx2ace_semaphore); false when nothing needed to be done or
 * the semaphore could not be created.
 */
ALWAYS_INLINE static bool
radv_flush_gfx2ace_semaphore(struct radv_cmd_buffer *cmd_buffer)
{
   if (!radv_ace_internal_sem_dirty(cmd_buffer))
      return false;

   /* Create the semaphore on first use; a failure here already set
    * record_result, so just report that no wait is necessary.
    */
   if (!cmd_buffer->ace_internal.sem.va) {
      cmd_buffer->ace_internal.sem.va = radv_ace_internal_sem_create(cmd_buffer);
      if (!cmd_buffer->ace_internal.sem.va)
         return false;
   }

   /* GFX writes a value to the semaphore which ACE can wait for.
    * Bottom-of-pipe EOP event: the write lands only after all prior GFX work
    * has finished.
    */
   si_cs_emit_write_event_eop(
      cmd_buffer->cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
      radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
      EOP_DATA_SEL_VALUE_32BIT, cmd_buffer->ace_internal.sem.va,
      cmd_buffer->ace_internal.sem.gfx2ace_value, cmd_buffer->gfx9_eop_bug_va);

   /* Remember what was emitted so the semaphore is no longer dirty. */
   cmd_buffer->ace_internal.sem.emitted_gfx2ace_value = cmd_buffer->ace_internal.sem.gfx2ace_value;
   return true;
}
|
||||
|
||||
/* Make the internal ACE cmdbuf wait until GFX has written the current
 * GFX->ACE semaphore value (emitted by radv_flush_gfx2ace_semaphore).
 * The semaphore must already exist.
 */
ALWAYS_INLINE static void
radv_wait_gfx2ace_semaphore(struct radv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->ace_internal.sem.va);
   struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
   /* Reserve space for the WAIT_REG_MEM packet (7 dwords). */
   radeon_check_space(cmd_buffer->device->ws, ace_cs, 7);

   /* ACE waits for the semaphore which GFX wrote.
    * GREATER_OR_EQUAL so a wait also succeeds if GFX has advanced past the
    * expected value in the meantime.
    */
   radv_cp_wait_mem(ace_cs, WAIT_REG_MEM_GREATER_OR_EQUAL, cmd_buffer->ace_internal.sem.va,
                    cmd_buffer->ace_internal.sem.gfx2ace_value, 0xffffffff);
}
|
||||
|
||||
static struct radeon_cmdbuf *
|
||||
radv_ace_internal_create(struct radv_cmd_buffer *cmd_buffer)
|
||||
{
|
||||
|
@ -711,6 +813,33 @@ radv_ace_internal_finalize(struct radv_cmd_buffer *cmd_buffer)
|
|||
struct radv_device *device = cmd_buffer->device;
|
||||
struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
|
||||
|
||||
/* Emit pending cache flush. */
|
||||
radv_ace_internal_cache_flush(cmd_buffer);
|
||||
|
||||
/* Clear the ACE semaphore if it exists.
|
||||
* This is necessary in case the same cmd buffer is submitted again in the future.
|
||||
*/
|
||||
if (cmd_buffer->ace_internal.sem.va) {
|
||||
struct radeon_cmdbuf *main_cs = cmd_buffer->cs;
|
||||
uint64_t gfx2ace_va = cmd_buffer->ace_internal.sem.va;
|
||||
uint64_t ace2gfx_va = cmd_buffer->ace_internal.sem.va + 4;
|
||||
|
||||
/* ACE: write 1 to the ACE->GFX semaphore. */
|
||||
si_cs_emit_write_event_eop(ace_cs, cmd_buffer->device->physical_device->rad_info.gfx_level,
|
||||
true, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
|
||||
EOP_DATA_SEL_VALUE_32BIT, ace2gfx_va, 1,
|
||||
cmd_buffer->gfx9_eop_bug_va);
|
||||
|
||||
/* Wait for ACE to finish, otherwise we may risk writing 0 to the semaphore
|
||||
* when ACE is still waiting for it. This may not happen in practice, but
|
||||
* better safe than sorry.
|
||||
*/
|
||||
radv_cp_wait_mem(main_cs, WAIT_REG_MEM_GREATER_OR_EQUAL, ace2gfx_va, 1, 0xffffffff);
|
||||
|
||||
/* GFX: clear GFX->ACE and ACE->GFX semaphores. */
|
||||
radv_emit_clear_data(cmd_buffer, V_370_ME, gfx2ace_va, 8);
|
||||
}
|
||||
|
||||
return device->ws->cs_finalize(ace_cs);
|
||||
}
|
||||
|
||||
|
@ -734,6 +863,14 @@ radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flu
|
|||
&cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va,
|
||||
radv_cmd_buffer_uses_mec(cmd_buffer), flags, &sqtt_flush_bits,
|
||||
cmd_buffer->gfx9_eop_bug_va);
|
||||
|
||||
if (cmd_buffer->state.graphics_pipeline && (flags & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) &&
|
||||
radv_pipeline_has_stage(cmd_buffer->state.graphics_pipeline, MESA_SHADER_TASK)) {
|
||||
/* Force wait for compute engines to be idle on the internal cmdbuf. */
|
||||
si_cs_emit_cache_flush(cmd_buffer->ace_internal.cs,
|
||||
cmd_buffer->device->physical_device->rad_info.gfx_level, NULL, 0,
|
||||
true, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, &sqtt_flush_bits, 0);
|
||||
}
|
||||
}
|
||||
|
||||
if (unlikely(cmd_buffer->device->trace_bo))
|
||||
|
@ -4092,6 +4229,12 @@ radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_d
|
|||
static void
|
||||
radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask)
|
||||
{
|
||||
/* For simplicity, if the barrier wants to wait for the task shader,
|
||||
* just make it wait for the mesh shader too.
|
||||
*/
|
||||
if (src_stage_mask & VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_NV)
|
||||
src_stage_mask |= VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_NV;
|
||||
|
||||
if (src_stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT |
|
||||
VK_PIPELINE_STAGE_2_RESOLVE_BIT |
|
||||
VK_PIPELINE_STAGE_2_BLIT_BIT |
|
||||
|
@ -4384,6 +4527,8 @@ radv_emit_subpass_barrier(struct radv_cmd_buffer *cmd_buffer,
|
|||
cmd_buffer->state.flush_bits |=
|
||||
radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, iview->image);
|
||||
}
|
||||
|
||||
radv_ace_internal_barrier(cmd_buffer, barrier->src_stage_mask, barrier->dst_stage_mask);
|
||||
}
|
||||
|
||||
uint32_t
|
||||
|
@ -6200,6 +6345,7 @@ radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpa
|
|||
radv_handle_subpass_image_transition(cmd_buffer, subpass->attachments[i], true);
|
||||
}
|
||||
|
||||
radv_ace_internal_barrier(cmd_buffer, 0, 0);
|
||||
radv_describe_barrier_end(cmd_buffer);
|
||||
|
||||
radv_cmd_buffer_clear_subpass(cmd_buffer);
|
||||
|
@ -6318,6 +6464,7 @@ radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
|
|||
radv_handle_subpass_image_transition(cmd_buffer, att, false);
|
||||
}
|
||||
|
||||
radv_ace_internal_barrier(cmd_buffer, 0, 0);
|
||||
radv_describe_barrier_end(cmd_buffer);
|
||||
}
|
||||
|
||||
|
@ -7500,6 +7647,7 @@ radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_
|
|||
if (!info->count || !gfx_result)
|
||||
return false;
|
||||
|
||||
const bool need_task_semaphore = radv_flush_gfx2ace_semaphore(cmd_buffer);
|
||||
struct radv_physical_device *pdevice = cmd_buffer->device->physical_device;
|
||||
struct radeon_cmdbuf *ace_cs = cmd_buffer->ace_internal.cs;
|
||||
struct radeon_winsys *ws = cmd_buffer->device->ws;
|
||||
|
@ -7508,11 +7656,16 @@ radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_
|
|||
ASSERTED const unsigned ace_cdw_max =
|
||||
radeon_check_space(ws, ace_cs, 4096 + 128 * (drawCount - 1));
|
||||
|
||||
if (need_task_semaphore)
|
||||
radv_wait_gfx2ace_semaphore(cmd_buffer);
|
||||
|
||||
if (pipeline_is_dirty) {
|
||||
radv_pipeline_emit_hw_cs(pdevice, ace_cs, task_shader);
|
||||
radv_pipeline_emit_compute_state(pdevice, ace_cs, task_shader);
|
||||
}
|
||||
|
||||
radv_ace_internal_cache_flush(cmd_buffer);
|
||||
|
||||
/* Restore dirty state of descriptors
|
||||
* They were marked non-dirty in radv_before_draw,
|
||||
* but they need to be re-emitted now to the ACE cmdbuf.
|
||||
|
@ -9384,6 +9537,8 @@ radv_barrier(struct radv_cmd_buffer *cmd_buffer, const VkDependencyInfo *dep_inf
|
|||
radv_stage_flush(cmd_buffer, src_stage_mask);
|
||||
cmd_buffer->state.flush_bits |= src_flush_bits;
|
||||
|
||||
radv_ace_internal_barrier(cmd_buffer, src_stage_mask, 0);
|
||||
|
||||
for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
|
||||
RADV_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);
|
||||
|
||||
|
@ -9410,6 +9565,7 @@ radv_barrier(struct radv_cmd_buffer *cmd_buffer, const VkDependencyInfo *dep_inf
|
|||
&dep_info->pImageMemoryBarriers[i].subresourceRange, sample_locs_info ? &sample_locations : NULL);
|
||||
}
|
||||
|
||||
radv_ace_internal_barrier(cmd_buffer, 0, dst_stage_mask);
|
||||
radv_cp_dma_wait_for_stages(cmd_buffer, src_stage_mask);
|
||||
|
||||
cmd_buffer->state.flush_bits |= dst_flush_bits;
|
||||
|
|
|
@ -60,13 +60,17 @@ radv_render_pass_add_subpass_dep(struct radv_render_pass *pass, const VkSubpassD
|
|||
VkAccessFlags2 dst_access_mask = barrier ? barrier->dstAccessMask : dep->dstAccessMask;
|
||||
|
||||
if (dst == VK_SUBPASS_EXTERNAL) {
|
||||
if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)
|
||||
if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT) {
|
||||
pass->end_barrier.src_stage_mask |= src_stage_mask;
|
||||
pass->end_barrier.dst_stage_mask |= dst_stage_mask;
|
||||
}
|
||||
pass->end_barrier.src_access_mask |= src_access_mask;
|
||||
pass->end_barrier.dst_access_mask |= dst_access_mask;
|
||||
} else {
|
||||
if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)
|
||||
if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT) {
|
||||
pass->subpasses[dst].start_barrier.src_stage_mask |= src_stage_mask;
|
||||
pass->subpasses[dst].start_barrier.dst_stage_mask |= dst_stage_mask;
|
||||
}
|
||||
pass->subpasses[dst].start_barrier.src_access_mask |= src_access_mask;
|
||||
pass->subpasses[dst].start_barrier.dst_access_mask |= dst_access_mask;
|
||||
}
|
||||
|
|
|
@ -1186,7 +1186,11 @@ enum radv_cmd_flush_bits {
|
|||
|
||||
RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER =
|
||||
(RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |
|
||||
RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META)
|
||||
RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META),
|
||||
|
||||
RADV_CMD_FLUSH_ALL_COMPUTE =
|
||||
(RADV_CMD_FLAG_INV_ICACHE | RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE |
|
||||
RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_WB_L2 | RADV_CMD_FLAG_CS_PARTIAL_FLUSH),
|
||||
};
|
||||
|
||||
enum radv_nggc_settings {
|
||||
|
@ -1644,6 +1648,22 @@ struct radv_cmd_buffer {
|
|||
* also requires a submission to the compute queue.
|
||||
*/
|
||||
struct radeon_cmdbuf *cs;
|
||||
|
||||
/** Flush bits for the internal cmdbuf. */
|
||||
enum radv_cmd_flush_bits flush_bits;
|
||||
|
||||
/**
|
||||
* For synchronization between the ACE and GFX cmdbuf.
|
||||
* The value of this semaphore is incremented whenever we
|
||||
* encounter a barrier that affects ACE. At sync points,
|
||||
* GFX writes the value to its address, and ACE waits until
|
||||
* it detects that the value has been written.
|
||||
*/
|
||||
struct {
|
||||
uint64_t va; /* Virtual address of the semaphore. */
|
||||
uint32_t gfx2ace_value; /* Current value on GFX. */
|
||||
uint32_t emitted_gfx2ace_value; /* Emitted value on GFX. */
|
||||
} sem;
|
||||
} ace_internal;
|
||||
|
||||
/**
|
||||
|
@ -2782,6 +2802,7 @@ struct radv_sampler {
|
|||
|
||||
struct radv_subpass_barrier {
|
||||
VkPipelineStageFlags2 src_stage_mask;
|
||||
VkPipelineStageFlags2 dst_stage_mask;
|
||||
VkAccessFlags2 src_access_mask;
|
||||
VkAccessFlags2 dst_access_mask;
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue