From afc33a7430dfc459697ba2eac45e4ad63da542d9 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 28 Apr 2021 14:48:13 +0200
Subject: [PATCH] v3dv: limit supergroup size in presence of TSY barriers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a TSY barrier is hit, the entire supergroup will be synchronized.
If the supergroup is large and uses all available QPU threads it would
mean that we would synchronize and stall all running threads until all
of them reach the barrier, which may be inefficient.

This patch makes it so that if the compute shader has any such barriers
we limit the supergroup size so each supergroup only takes half of the
QPU threads available at most, so that if one supergroup hits a
barrier we have at least one other supergroup we can run, reducing
idle QPU time.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10541>
---
 src/broadcom/vulkan/v3dv_cmd_buffer.c | 33 ++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
index d1e641b9316..eee71610ce5 100644
--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
+++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
@@ -5272,11 +5272,36 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job(
  * lane occupancy. We can pack up to 16 workgroups into a supergroup.
  */
 static uint32_t
-choose_workgroups_per_supergroup(uint32_t num_wgs, uint32_t wg_size)
+choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
+                                 struct v3dv_shader_variant *cs,
+                                 uint32_t num_wgs,
+                                 uint32_t wg_size)
 {
+   /* Compute maximum number of batches in a supergroup for this workgroup size.
+    * Each batch is 16 elements, and we can have up to 16 work groups in a
+    * supergroup:
+    *
+    * max_batches_per_sg = (wg_size * max_wgs_per_sg) / elements_per_batch
+    * since max_wgs_per_sg = 16 and elements_per_batch = 16, we get:
+    * max_batches_per_sg = wg_size
+    */
+   uint32_t max_batches_per_sg = wg_size;
+
+   /* QPU threads will stall at TSY barriers until the entire supergroup
+    * reaches the barrier. Limit the supergroup size to half the QPU threads
+    * available, so we can have at least 2 supergroups executing in parallel
+    * and we don't stall all our QPU threads when a supergroup hits a barrier.
+    */
+   if (cs->prog_data.cs->base.has_control_barrier) {
+      uint32_t max_qpu_threads =
+         devinfo->qpu_count * cs->prog_data.cs->base.threads;
+      max_batches_per_sg = MIN2(max_batches_per_sg, max_qpu_threads / 2);
+   }
+   uint32_t max_wgs_per_sg = max_batches_per_sg * 16 / wg_size;
+
    uint32_t best_wgs_per_sg = 1;
    uint32_t best_unused_lanes = 16;
-   for (uint32_t wgs_per_sg = 1; wgs_per_sg <= 16; wgs_per_sg++) {
+   for (uint32_t wgs_per_sg = 1; wgs_per_sg <= max_wgs_per_sg; wgs_per_sg++) {
       /* Don't try to pack more workgroups per supergroup than the total amount
        * of workgroups dispatched.
        */
@@ -5341,7 +5366,9 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
                             cpd->local_size[1] *
                             cpd->local_size[2];
 
-   uint32_t wgs_per_sg = choose_workgroups_per_supergroup(num_wgs, wg_size);
+   uint32_t wgs_per_sg =
+      choose_workgroups_per_supergroup(&cmd_buffer->device->devinfo,
+                                       cs_variant, num_wgs, wg_size);
    uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16);
    uint32_t whole_sgs = num_wgs / wgs_per_sg;
    uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg;