softpipe: Dispatch 4 CS invocations per tgsi_exec thread.

We were executing 1 non-helper invocation and 3 helpers per CS tgsi_exec machine, which was a total waste of the CPU when we could trivially have all 4 invocations do real work (at least in the common case of gl_WorkGroupSize.x >= 4).

This didn't have the effect on dEQP that I was hoping for, as it turns out that its shaders are almost all 1x1x1 workgroups. However, it does reduce the runtime of piglit arb_compute_shader-local-id from 2:10 to 47 seconds on my system.

Part of #4097

Reviewed-by: Dave Airlie <airlied@redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14728>
2022-01-25 21:31:06 -08:00 · 2022-01-25 21:31:06 -08:00 · 53426d26c3
parent 62dc4be470
commit 53426d26c3
1 changed file with 6 additions and 5 deletions
--- a/src/gallium/drivers/softpipe/sp_compute.c
+++ b/src/gallium/drivers/softpipe/sp_compute.c
@@ -55,7 +55,7 @@ cs_prepare(const struct sp_compute_shader *cs,
   if (machine->SysSemanticToIndex[TGSI_SEMANTIC_THREAD_ID] != -1) {
      unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_THREAD_ID];
      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-         machine->SystemValue[i].xyzw[0].i[j] = local_x;
+         machine->SystemValue[i].xyzw[0].i[j] = local_x + j;
         machine->SystemValue[i].xyzw[1].i[j] = local_y;
         machine->SystemValue[i].xyzw[2].i[j] = local_z;
      }
@@ -180,7 +180,7 @@ softpipe_launch_grid(struct pipe_context *context,
   bwidth = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH];
   bheight = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT];
   bdepth = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
-   num_threads_in_group = bwidth * bheight * bdepth;
+   num_threads_in_group = DIV_ROUND_UP(bwidth, TGSI_QUAD_SIZE) * bheight * bdepth;

   fill_grid_size(context, info, grid_size);

@@ -195,15 +195,15 @@ softpipe_launch_grid(struct pipe_context *context,
   }

   /* initialise machines + GRID_SIZE + THREAD_ID  + BLOCK_SIZE */
+   int idx = 0;
   for (local_z = 0; local_z < bdepth; local_z++) {
      for (local_y = 0; local_y < bheight; local_y++) {
-         for (local_x = 0; local_x < bwidth; local_x++) {
-            int idx = local_x + (local_y * bwidth) + (local_z * bheight * bwidth);
+         for (local_x = 0; local_x < bwidth; local_x += TGSI_QUAD_SIZE) {
            machines[idx] = tgsi_exec_machine_create(PIPE_SHADER_COMPUTE);

            machines[idx]->LocalMem = local_mem;
            machines[idx]->LocalMemSize = cs->shader.req_local_mem;
-            machines[idx]->NonHelperMask = 0x1;
+            machines[idx]->NonHelperMask = (1 << (MIN2(TGSI_QUAD_SIZE, bwidth - local_x))) - 1;
            cs_prepare(cs, machines[idx],
                       local_x, local_y, local_z,
                       grid_size[0], grid_size[1], grid_size[2],
@@ -214,6 +214,7 @@ softpipe_launch_grid(struct pipe_context *context,
            tgsi_exec_set_constant_buffers(machines[idx], PIPE_MAX_CONSTANT_BUFFERS,
                                           softpipe->mapped_constants[PIPE_SHADER_COMPUTE],
                                           softpipe->const_buffer_size[PIPE_SHADER_COMPUTE]);
+            idx++;
         }
      }
   }