tu, ir3: Plumb through support for CS subgroup size/id
The way that the blob obtains the subgroup id on compute shaders is by just and'ing gl_LocalInvocationIndex with 63, since it advertises a subgroupSize of 64. In order to support VK_EXT_subgroup_size_control and expose a subgroupSize of 128, we'll have to do something a little more flexible. Sometimes we have to fall back to a subgroup size of 64 due to various constraints, and in that case we have to fake a subgroup size of 128 while actually using 64 under the hood, by just pretending that the upper 64 invocations are all disabled. However, when computing the subgroup id we need to use the "real" subgroup size. For this purpose we plumb through a driver param which exposes the real subgroup size. If the user forces a particular subgroup size then we lower load_subgroup_size in nir_lower_subgroups; otherwise we let it through, and we assume when translating to ir3 that load_subgroup_size means "give me the *actual* subgroup size that you decided in RA" and give you the driver param. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6752>
This commit is contained in:
parent
cc514bfa0e
commit
68b8b9e9e1
|
@ -1018,6 +1018,9 @@ system_value("tess_factor_base_ir3", 2)
|
|||
system_value("tess_param_base_ir3", 2)
|
||||
system_value("tcs_header_ir3", 1)
|
||||
|
||||
# System values for freedreno compute shaders.
|
||||
system_value("subgroup_id_shift_ir3", 1)
|
||||
|
||||
# IR3-specific intrinsics for tessellation control shaders. cond_end_ir3 end
|
||||
# the shader when src0 is false and is used to narrow down the TCS shader to
|
||||
# just thread 0 before writing out tessellation levels.
|
||||
|
|
|
@ -2005,6 +2005,12 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
|
|||
dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
|
||||
}
|
||||
break;
|
||||
case nir_intrinsic_load_subgroup_size:
|
||||
dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_SIZE);
|
||||
break;
|
||||
case nir_intrinsic_load_subgroup_id_shift_ir3:
|
||||
dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT);
|
||||
break;
|
||||
case nir_intrinsic_discard_if:
|
||||
case nir_intrinsic_discard:
|
||||
case nir_intrinsic_demote:
|
||||
|
|
|
@ -409,6 +409,58 @@ ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s)
|
|||
nir_sweep(s);
|
||||
}
|
||||
|
||||
static bool
|
||||
lower_subgroup_id_filter(const nir_instr *instr, const void *unused)
|
||||
{
|
||||
(void)unused;
|
||||
|
||||
if (instr->type != nir_instr_type_intrinsic)
|
||||
return false;
|
||||
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
return intr->intrinsic == nir_intrinsic_load_subgroup_invocation ||
|
||||
intr->intrinsic == nir_intrinsic_load_subgroup_id ||
|
||||
intr->intrinsic == nir_intrinsic_load_num_subgroups;
|
||||
}
|
||||
|
||||
static nir_ssa_def *
|
||||
lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused)
|
||||
{
|
||||
(void)instr;
|
||||
(void)unused;
|
||||
|
||||
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
||||
if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) {
|
||||
return nir_iand(b, nir_load_local_invocation_index(b),
|
||||
nir_isub(b, nir_load_subgroup_size(b), nir_imm_int(b, 1)));
|
||||
} else if (intr->intrinsic == nir_intrinsic_load_subgroup_id) {
|
||||
return nir_ishr(b, nir_load_local_invocation_index(b),
|
||||
nir_load_subgroup_id_shift_ir3(b));
|
||||
} else {
|
||||
assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
|
||||
/* If the workgroup size is constant,
|
||||
* nir_lower_compute_system_values() will replace local_size with a
|
||||
* constant so this can mostly be constant folded away.
|
||||
*/
|
||||
nir_ssa_def *local_size = nir_load_workgroup_size(b);
|
||||
nir_ssa_def *size =
|
||||
nir_imul24(b, nir_channel(b, local_size, 0),
|
||||
nir_imul24(b, nir_channel(b, local_size, 1),
|
||||
nir_channel(b, local_size, 2)));
|
||||
nir_ssa_def *one = nir_imm_int(b, 1);
|
||||
return nir_iadd(b, one,
|
||||
nir_ishr(b, nir_isub(b, size, one),
|
||||
nir_load_subgroup_id_shift_ir3(b)));
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
ir3_nir_lower_subgroup_id_cs(nir_shader *shader)
|
||||
{
|
||||
return nir_shader_lower_instructions(shader, lower_subgroup_id_filter,
|
||||
lower_subgroup_id, NULL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Late passes that need to be done after pscreen->finalize_nir()
|
||||
*/
|
||||
|
@ -706,6 +758,14 @@ ir3_nir_scan_driver_consts(nir_shader *shader,
|
|||
layout->num_driver_params =
|
||||
MAX2(layout->num_driver_params, IR3_DP_BASE_GROUP_Z + 1);
|
||||
break;
|
||||
case nir_intrinsic_load_subgroup_size:
|
||||
layout->num_driver_params =
|
||||
MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_SIZE + 1);
|
||||
break;
|
||||
case nir_intrinsic_load_subgroup_id_shift_ir3:
|
||||
layout->num_driver_params =
|
||||
MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_ID_SHIFT + 1);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -48,9 +48,11 @@ enum ir3_driver_param {
|
|||
IR3_DP_BASE_GROUP_X = 4,
|
||||
IR3_DP_BASE_GROUP_Y = 5,
|
||||
IR3_DP_BASE_GROUP_Z = 6,
|
||||
IR3_DP_SUBGROUP_SIZE = 7,
|
||||
IR3_DP_LOCAL_GROUP_SIZE_X = 8,
|
||||
IR3_DP_LOCAL_GROUP_SIZE_Y = 9,
|
||||
IR3_DP_LOCAL_GROUP_SIZE_Z = 10,
|
||||
IR3_DP_SUBGROUP_ID_SHIFT = 11,
|
||||
/* NOTE: gl_NumWorkGroups should be vec4 aligned because
|
||||
* glDispatchComputeIndirect() needs to load these from
|
||||
* the info->indirect buffer. Keep that in mind when/if
|
||||
|
|
|
@ -4171,6 +4171,8 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
|
|||
&pipeline->program.link[type];
|
||||
const struct ir3_const_state *const_state = &link->const_state;
|
||||
uint32_t offset = const_state->offsets.driver_param;
|
||||
unsigned subgroup_size = pipeline->compute.subgroup_size;
|
||||
unsigned subgroup_shift = util_logbase2(subgroup_size);
|
||||
|
||||
if (link->constlen <= offset)
|
||||
return;
|
||||
|
@ -4179,13 +4181,15 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
|
|||
(link->constlen - offset) * 4);
|
||||
|
||||
if (!info->indirect) {
|
||||
uint32_t driver_params[8] = {
|
||||
uint32_t driver_params[12] = {
|
||||
[IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
|
||||
[IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
|
||||
[IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
|
||||
[IR3_DP_BASE_GROUP_X] = info->offsets[0],
|
||||
[IR3_DP_BASE_GROUP_Y] = info->offsets[1],
|
||||
[IR3_DP_BASE_GROUP_Z] = info->offsets[2],
|
||||
[IR3_DP_SUBGROUP_SIZE] = subgroup_size,
|
||||
[IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,
|
||||
};
|
||||
|
||||
assert(num_consts <= ARRAY_SIZE(driver_params));
|
||||
|
@ -4236,19 +4240,28 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
|
|||
tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[0]));
|
||||
}
|
||||
|
||||
/* Zeroing of IR3_DP_BASE_GROUP_X/Y/Z for indirect dispatch */
|
||||
/* Fill out IR3_DP_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for indirect
|
||||
* dispatch.
|
||||
*/
|
||||
if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {
|
||||
assert(num_consts == align(IR3_DP_BASE_GROUP_Z, 4));
|
||||
|
||||
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7);
|
||||
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_BASE_GROUP_X / 4)) |
|
||||
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
|
||||
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
|
||||
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
|
||||
CP_LOAD_STATE6_0_NUM_UNIT(1));
|
||||
CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_BASE_GROUP_X) / 4));
|
||||
tu_cs_emit_qw(cs, 0);
|
||||
for (uint32_t i = 0; i < 4; i++)
|
||||
tu_cs_emit(cs, 0);
|
||||
tu_cs_emit(cs, 0); /* BASE_GROUP_X */
|
||||
tu_cs_emit(cs, 0); /* BASE_GROUP_Y */
|
||||
tu_cs_emit(cs, 0); /* BASE_GROUP_Z */
|
||||
tu_cs_emit(cs, subgroup_size);
|
||||
if (num_consts > IR3_DP_LOCAL_GROUP_SIZE_X) {
|
||||
assert(num_consts == align(IR3_DP_SUBGROUP_ID_SHIFT, 4));
|
||||
tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */
|
||||
tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */
|
||||
tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */
|
||||
tu_cs_emit(cs, subgroup_shift);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -3143,6 +3143,8 @@ tu_compute_pipeline_create(VkDevice device,
|
|||
for (int i = 0; i < 3; i++)
|
||||
pipeline->compute.local_size[i] = v->local_size[i];
|
||||
|
||||
pipeline->compute.subgroup_size = v->info.double_threadsize ? 128 : 64;
|
||||
|
||||
struct tu_cs prog_cs;
|
||||
tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
|
||||
tu6_emit_cs_config(&prog_cs, shader, v, &pvtmem, shader_iova);
|
||||
|
|
|
@ -1189,6 +1189,7 @@ struct tu_pipeline
|
|||
struct
|
||||
{
|
||||
uint32_t local_size[3];
|
||||
uint32_t subgroup_size;
|
||||
} compute;
|
||||
|
||||
bool provoking_vertex_last;
|
||||
|
|
Loading…
Reference in New Issue