v3dv: improve pipeline barrier handling

So far we have been getting away with finishing the current job in the
presence of a pipeline barrier and relying on the RCL serialization,
but of course this is not always enough.

This patch addresses synchronization across different GPU units
(e.g. draw indirect after compute), as well as cases where we need to
sync before binning.

Fixes CTS failures in:
dEQP-VK.synchronization.op.single_queue.barrier.*

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6766>
This commit is contained in:
Iago Toral Quiroga 2020-07-08 09:56:44 +02:00 committed by Marge Bot
parent adbce7723e
commit 0db95de577
3 changed files with 99 additions and 11 deletions

View File

@ -727,6 +727,46 @@ v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
}
}
/* Returns true when the job's type is one of the GPU job types (CL,
 * secondary CL, TFU or CSD) and false for any other job type.
 */
static bool
job_type_is_gpu(struct v3dv_job *job)
{
   return job->type == V3DV_JOB_TYPE_GPU_CL ||
          job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY ||
          job->type == V3DV_JOB_TYPE_GPU_TFU ||
          job->type == V3DV_JOB_TYPE_GPU_CSD;
}
/* Consumes a pending pipeline barrier recorded in the command buffer state
 * by flagging it on the given job.
 *
 * If the command buffer has no pending barrier, or the job is not a GPU job,
 * nothing is changed. Otherwise the job is marked for serialization, CL jobs
 * additionally get flagged for a sync before binning when a BCL barrier is
 * pending, and both pending-barrier flags in the command buffer state are
 * cleared.
 */
static void
cmd_buffer_serialize_job_if_needed(struct v3dv_cmd_buffer *cmd_buffer,
                                   struct v3dv_job *job)
{
   assert(cmd_buffer && job);

   /* Serialization only affects GPU jobs, CPU jobs are always automatically
    * serialized, so there is nothing to do for them. Likewise, there is
    * nothing to do when no barrier is pending.
    */
   if (!cmd_buffer->state.has_barrier || !job_type_is_gpu(job))
      return;

   job->serialize = true;

   /* Only CL jobs (primary or secondary) can be flagged for a BCL sync */
   const bool is_cl_job = job->type == V3DV_JOB_TYPE_GPU_CL ||
                          job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY;
   if (is_cl_job && cmd_buffer->state.has_bcl_barrier)
      job->needs_bcl_sync = true;

   /* The barrier has now been processed: reset the pending state */
   cmd_buffer->state.has_bcl_barrier = false;
   cmd_buffer->state.has_barrier = false;
}
void
v3dv_job_init(struct v3dv_job *job,
enum v3dv_job_type type,
@ -736,6 +776,9 @@ v3dv_job_init(struct v3dv_job *job,
{
assert(job);
/* Make sure we haven't made this new job current before calling here */
assert(!cmd_buffer || cmd_buffer->state.job != job);
job->type = type;
job->device = device;
@ -777,6 +820,8 @@ v3dv_job_init(struct v3dv_job *job,
*/
if (cmd_buffer->state.pass)
job->first_subpass = subpass_idx;
cmd_buffer_serialize_job_if_needed(cmd_buffer, job);
}
}
@ -804,8 +849,6 @@ v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
sizeof(struct v3dv_job), 8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
cmd_buffer->state.job = job;
if (!job) {
fprintf(stderr, "Error: failed to allocate CPU memory for job\n");
v3dv_flag_oom(cmd_buffer, NULL);
@ -813,6 +856,7 @@ v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
}
v3dv_job_init(job, type, cmd_buffer->device, cmd_buffer, subpass_idx);
cmd_buffer->state.job = job;
return job;
}
@ -2350,6 +2394,16 @@ cmd_buffer_execute_inside_pass(struct v3dv_cmd_buffer *primary,
cl_emit(&primary_job->bcl, BRANCH_TO_SUB_LIST, branch) {
branch.address = v3dv_cl_address(secondary_job->bcl.bo, 0);
}
/* If this secondary has barriers, we need to flag them in the
* primary job.
*
* FIXME: This might be moving the sync point too early though,
* maybe we would need to split the primary in this case to ensure
* that barriers execute right before the secondary.
*/
primary_job->serialize |= secondary_job->serialize;
primary_job->needs_bcl_sync |= secondary_job->needs_bcl_sync;
} else if (secondary_job->type == V3DV_JOB_TYPE_CPU_CLEAR_ATTACHMENTS) {
const struct v3dv_clear_attachments_cpu_job_info *info =
&secondary_job->cpu.clear_attachments;
@ -4038,18 +4092,27 @@ v3dv_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
VkDependencyFlags dependencyFlags,
uint32_t memoryBarrierCount,
const VkMemoryBarrier *pMemoryBarriers,
uint32_t bufferMemoryBarrierCount,
const VkBufferMemoryBarrier *pBufferMemoryBarriers,
uint32_t imageMemoryBarrierCount,
const VkImageMemoryBarrier *pImageMemoryBarriers)
uint32_t bufferBarrierCount,
const VkBufferMemoryBarrier *pBufferBarriers,
uint32_t imageBarrierCount,
const VkImageMemoryBarrier *pImageBarriers)
{
V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
/* If we have a recording job, finish it here */
struct v3dv_job *job = cmd_buffer->state.job;
if (!job)
return;
if (job)
v3dv_cmd_buffer_finish_job(cmd_buffer);
v3dv_cmd_buffer_finish_job(cmd_buffer);
cmd_buffer->state.has_barrier = true;
if (dstStageMask & (VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT)) {
cmd_buffer->state.has_bcl_barrier = true;
}
}
void

View File

@ -805,6 +805,12 @@ struct v3dv_job {
*/
bool always_flush;
/* Whether we need to serialize this job in our command stream */
bool serialize;
/* If this is a CL job, whether we should sync before binning */
bool needs_bcl_sync;
/* Job specs for CPU jobs */
union {
struct v3dv_reset_query_cpu_job_info query_reset;
@ -903,6 +909,12 @@ struct v3dv_cmd_buffer_state {
/* Used to flag OOM conditions during command buffer recording */
bool oom;
/* Whether we have recorded a pipeline barrier that we still need to
* process.
*/
bool has_barrier;
bool has_bcl_barrier;
/* Command buffer state saved during a meta operation */
struct {
uint32_t subpass_idx;

View File

@ -530,6 +530,12 @@ handle_cl_job(struct v3dv_queue *queue,
struct drm_v3d_submit_cl submit;
/* Sanity check: we should only flag a bcl sync on a job that needs to be
* serialized.
*/
assert(job->serialize || !job->needs_bcl_sync);
do_wait |= job->serialize;
/* We expect to have just one RCL per job which should fit in just one BO.
* Our BCL, could chain multiple BOS together though.
*/
@ -575,9 +581,12 @@ handle_cl_job(struct v3dv_queue *queue,
* we would have to extend our kernel interface to support the case where
* we have more than one semaphore to wait on.
*/
const bool needs_bcl_sync = do_wait && job->needs_bcl_sync;
const bool needs_rcl_sync = do_wait && !needs_bcl_sync;
mtx_lock(&queue->device->mutex);
submit.in_sync_bcl = 0;
submit.in_sync_rcl = do_wait ? device->last_job_sync : 0;
submit.in_sync_bcl = needs_bcl_sync ? device->last_job_sync : 0;
submit.in_sync_rcl = needs_rcl_sync ? device->last_job_sync : 0;
submit.out_sync = device->last_job_sync;
v3dv_clif_dump(device, job, &submit);
int ret = v3dv_ioctl(device->render_fd, DRM_IOCTL_V3D_SUBMIT_CL, &submit);
@ -605,6 +614,8 @@ handle_tfu_job(struct v3dv_queue *queue,
{
struct v3dv_device *device = queue->device;
do_wait |= job->serialize;
mtx_lock(&device->mutex);
job->tfu.in_sync = do_wait ? device->last_job_sync : 0;
job->tfu.out_sync = device->last_job_sync;
@ -628,6 +639,8 @@ handle_csd_job(struct v3dv_queue *queue,
struct drm_v3d_submit_csd *submit = &job->csd.submit;
do_wait |= job->serialize;
submit->bo_handle_count = job->bo_count;
uint32_t *bo_handles =
(uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));