diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index d1e641b9316..eee71610ce5 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -5272,11 +5272,36 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job(
  * lane occupancy. We can pack up to 16 workgroups into a supergroup.
  */
 static uint32_t
-choose_workgroups_per_supergroup(uint32_t num_wgs, uint32_t wg_size)
+choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
+                                 struct v3dv_shader_variant *cs,
+                                 uint32_t num_wgs,
+                                 uint32_t wg_size)
 {
+   /* Compute maximum number of batches in a supergroup for this workgroup size.
+    * Each batch is 16 elements, and we can have up to 16 work groups in a
+    * supergroup:
+    *
+    * max_batches_per_sg = (wg_size * max_wgs_per_sg) / elements_per_batch
+    * since max_wgs_per_sg = 16 and elements_per_batch = 16, we get:
+    * max_batches_per_sg = wg_size
+    */
+   uint32_t max_batches_per_sg = wg_size;
+
+   /* QPU threads will stall at TSY barriers until the entire supergroup
+    * reaches the barrier. Limit the supergroup size to half the QPU threads
+    * available, so we can have at least 2 supergroups executing in parallel
+    * and we don't stall all our QPU threads when a supergroup hits a barrier.
+    */
+   if (cs->prog_data.cs->base.has_control_barrier) {
+      uint32_t max_qpu_threads =
+         devinfo->qpu_count * cs->prog_data.cs->base.threads;
+      max_batches_per_sg = MIN2(max_batches_per_sg, max_qpu_threads / 2);
+   }
+   uint32_t max_wgs_per_sg = max_batches_per_sg * 16 / wg_size;
+
    uint32_t best_wgs_per_sg = 1;
    uint32_t best_unused_lanes = 16;
-   for (uint32_t wgs_per_sg = 1; wgs_per_sg <= 16; wgs_per_sg++) {
+   for (uint32_t wgs_per_sg = 1; wgs_per_sg <= max_wgs_per_sg; wgs_per_sg++) {
       /* Don't try to pack more workgroups per supergroup than the total amount
        * of workgroups dispatched.
        */
@@ -5341,7 +5366,9 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
                       cpd->local_size[1] *
                       cpd->local_size[2];
 
-   uint32_t wgs_per_sg = choose_workgroups_per_supergroup(num_wgs, wg_size);
+   uint32_t wgs_per_sg =
+      choose_workgroups_per_supergroup(&cmd_buffer->device->devinfo,
+                                       cs_variant, num_wgs, wg_size);
    uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16);
    uint32_t whole_sgs = num_wgs / wgs_per_sg;
    uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg;
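
Editor's note: a minimal standalone sketch of the cap computation introduced above, for illustration only. It is not the driver code: MIN2 is redefined locally, the best-fit packing loop is omitted, and the device values (8 QPUs, 4 threads per QPU) are assumed figures picked to show the arithmetic, not read from any devinfo. It only models how the barrier case shrinks the upper bound that the selection loop searches up to.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

/* Upper bound on workgroups per supergroup for a workgroup of wg_size
 * invocations. Batches are 16 invocations wide and a supergroup holds at
 * most 16 workgroups, so without barriers the batch limit equals wg_size.
 * With a control barrier the supergroup is capped to half the available
 * QPU threads so at least two supergroups can execute in parallel.
 */
static uint32_t
max_wgs_per_supergroup(uint32_t wg_size, bool has_barrier,
                       uint32_t qpu_count, uint32_t threads_per_qpu)
{
   uint32_t max_batches_per_sg = wg_size;
   if (has_barrier) {
      uint32_t max_qpu_threads = qpu_count * threads_per_qpu;
      max_batches_per_sg = MIN2(max_batches_per_sg, max_qpu_threads / 2);
   }
   return max_batches_per_sg * 16 / wg_size;
}

int main(void)
{
   /* Assumed, illustrative device values: 8 QPUs, 4 threads per QPU. */
   const uint32_t qpu_count = 8, threads = 4;
   const uint32_t wg_size = 64;

   /* No barrier: bound is 64 * 16 / 64 = 16 workgroups per supergroup. */
   printf("no barrier: up to %u wgs/sg\n",
          max_wgs_per_supergroup(wg_size, false, qpu_count, threads));
   /* Barrier: max_qpu_threads = 32, so at most 16 batches, giving
    * 16 * 16 / 64 = 4 workgroups per supergroup. */
   printf("barrier:    up to %u wgs/sg\n",
          max_wgs_per_supergroup(wg_size, true, qpu_count, threads));
   return 0;
}

With these assumed figures, a 64-invocation workgroup that uses barriers is limited to 4 workgroups per supergroup instead of 16; the existing best-fit loop then picks the value within that bound that wastes the fewest lanes.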