tu, ir3: Plumb through support for CS subgroup size/id

The way that the blob obtains the subgroup id on compute shaders is by
ANDing gl_LocalInvocationIndex with 63, since it advertises a
subgroupSize of 64. In order to support VK_EXT_subgroup_size_control and
expose a subgroupSize of 128, we'll have to do something a little more
flexible. Sometimes we have to fall back to a subgroup size of 64 due to
various constraints, and in that case we have to fake a subgroup size of
128 while actually using 64 under the hood, by pretending that the upper
64 invocations are all disabled. However, when computing the subgroup id
we need to use the "real" subgroup size. For this purpose we plumb
through a driver param which exposes the real subgroup size. If the user
forces a particular subgroup size, then we lower load_subgroup_size in
nir_lower_subgroups; otherwise we let it through, and when translating
to ir3 we assume that load_subgroup_size means "give me the *actual*
subgroup size that was decided in RA" and return the driver param.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6752>
Connor Abbott, 2021-05-31 14:21:04 +02:00 (committed by Marge Bot)
parent cc514bfa0e
commit 68b8b9e9e1
7 changed files with 94 additions and 7 deletions
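
To make the scheme above concrete, here is a minimal standalone C sketch
(the helper names are made up for illustration; none of this code is from
the commit) of how the subgroup id and invocation fall out of
gl_LocalInvocationIndex:

#include <assert.h>
#include <stdint.h>

/* The shift is log2 of the *real* subgroup size picked by the compiler
 * (64 or 128); this is what the IR3_DP_SUBGROUP_ID_SHIFT driver param
 * below carries. */
static uint32_t
subgroup_id(uint32_t local_invocation_index, uint32_t subgroup_id_shift)
{
   return local_invocation_index >> subgroup_id_shift;
}

/* Generalization of the blob's "index & 63": mask with (size - 1), valid
 * for any power-of-two subgroup size. */
static uint32_t
subgroup_invocation(uint32_t local_invocation_index, uint32_t subgroup_size)
{
   return local_invocation_index & (subgroup_size - 1);
}

int
main(void)
{
   /* Real wave size 64, advertised as 64: invocation 100 is lane 36 of
    * subgroup 1. */
   assert(subgroup_id(100, 6) == 1);
   assert(subgroup_invocation(100, 64) == 36);

   /* Faked subgroupSize of 128 on 64-wide waves: the id still uses the
    * real size (shift 6), while the lane is masked with the advertised
    * size, so subgroup 1 shows up as lanes 64..127 of a half-populated
    * 128-wide subgroup. */
   assert(subgroup_id(100, 6) == 1);
   assert(subgroup_invocation(100, 128) == 100);
   return 0;
}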

@@ -1018,6 +1018,9 @@ system_value("tess_factor_base_ir3", 2)
 system_value("tess_param_base_ir3", 2)
 system_value("tcs_header_ir3", 1)
 
+# System values for freedreno compute shaders.
+system_value("subgroup_id_shift_ir3", 1)
+
 # IR3-specific intrinsics for tessellation control shaders. cond_end_ir3 end
 # the shader when src0 is false and is used to narrow down the TCS shader to
 # just thread 0 before writing out tessellation levels.

@@ -2005,6 +2005,12 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
          dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
       }
       break;
+   case nir_intrinsic_load_subgroup_size:
+      dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_SIZE);
+      break;
+   case nir_intrinsic_load_subgroup_id_shift_ir3:
+      dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT);
+      break;
    case nir_intrinsic_discard_if:
    case nir_intrinsic_discard:
    case nir_intrinsic_demote:

@@ -409,6 +409,58 @@ ir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s)
    nir_sweep(s);
 }
 
+static bool
+lower_subgroup_id_filter(const nir_instr *instr, const void *unused)
+{
+   (void)unused;
+
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   return intr->intrinsic == nir_intrinsic_load_subgroup_invocation ||
+          intr->intrinsic == nir_intrinsic_load_subgroup_id ||
+          intr->intrinsic == nir_intrinsic_load_num_subgroups;
+}
+
+static nir_ssa_def *
+lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused)
+{
+   (void)unused;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+   if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) {
+      return nir_iand(b, nir_load_local_invocation_index(b),
+                      nir_isub(b, nir_load_subgroup_size(b),
+                               nir_imm_int(b, 1)));
+   } else if (intr->intrinsic == nir_intrinsic_load_subgroup_id) {
+      return nir_ishr(b, nir_load_local_invocation_index(b),
+                      nir_load_subgroup_id_shift_ir3(b));
+   } else {
+      assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
+      /* If the workgroup size is constant,
+       * nir_lower_compute_system_values() will replace local_size with a
+       * constant so this can mostly be constant folded away.
+       */
+      nir_ssa_def *local_size = nir_load_workgroup_size(b);
+      nir_ssa_def *size =
+         nir_imul24(b, nir_channel(b, local_size, 0),
+                    nir_imul24(b, nir_channel(b, local_size, 1),
+                               nir_channel(b, local_size, 2)));
+      nir_ssa_def *one = nir_imm_int(b, 1);
+      return nir_iadd(b, one,
+                      nir_ishr(b, nir_isub(b, size, one),
+                               nir_load_subgroup_id_shift_ir3(b)));
+   }
+}
+
+static bool
+ir3_nir_lower_subgroup_id_cs(nir_shader *shader)
+{
+   return nir_shader_lower_instructions(shader, lower_subgroup_id_filter,
+                                        lower_subgroup_id, NULL);
+}
+
 /**
  * Late passes that need to be done after pscreen->finalize_nir()
  */
@@ -706,6 +758,14 @@ ir3_nir_scan_driver_consts(nir_shader *shader,
          layout->num_driver_params =
             MAX2(layout->num_driver_params, IR3_DP_BASE_GROUP_Z + 1);
          break;
+      case nir_intrinsic_load_subgroup_size:
+         layout->num_driver_params =
+            MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_SIZE + 1);
+         break;
+      case nir_intrinsic_load_subgroup_id_shift_ir3:
+         layout->num_driver_params =
+            MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_ID_SHIFT + 1);
+         break;
       default:
          break;
       }
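
The num_subgroups lowering above computes 1 + ((size - 1) >> shift), which
is a ceiling division of the workgroup size by the subgroup size. A quick
standalone check of that identity (illustrative C, not from the commit):

#include <assert.h>
#include <stdint.h>

/* Mirrors the NIR sequence built by lower_subgroup_id(). */
static uint32_t
num_subgroups(uint32_t workgroup_size, uint32_t subgroup_id_shift)
{
   return 1 + ((workgroup_size - 1) >> subgroup_id_shift);
}

int
main(void)
{
   /* 96 invocations on 64-wide subgroups (shift 6): two subgroups, the
    * second only partially filled. */
   assert(num_subgroups(96, 6) == 2);
   /* Exact multiple: 128 invocations need exactly two subgroups. */
   assert(num_subgroups(128, 6) == 2);
   /* A single invocation still occupies one subgroup. */
   assert(num_subgroups(1, 6) == 1);
   return 0;
}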

@@ -48,9 +48,11 @@ enum ir3_driver_param {
    IR3_DP_BASE_GROUP_X = 4,
    IR3_DP_BASE_GROUP_Y = 5,
    IR3_DP_BASE_GROUP_Z = 6,
-   IR3_DP_LOCAL_GROUP_SIZE_X = 7,
-   IR3_DP_LOCAL_GROUP_SIZE_Y = 8,
-   IR3_DP_LOCAL_GROUP_SIZE_Z = 9,
+   IR3_DP_SUBGROUP_SIZE = 7,
+   IR3_DP_LOCAL_GROUP_SIZE_X = 8,
+   IR3_DP_LOCAL_GROUP_SIZE_Y = 9,
+   IR3_DP_LOCAL_GROUP_SIZE_Z = 10,
+   IR3_DP_SUBGROUP_ID_SHIFT = 11,
    /* NOTE: gl_NumWorkGroups should be vec4 aligned because
     * glDispatchComputeIndirect() needs to load these from
     * the info->indirect buffer. Keep that in mind when/if
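
As the NOTE above hints, driver params are uploaded in whole vec4s, which
is why the turnip code below asserts num_consts against align(..., 4). A
small sketch of that rounding (align_u32 is a hypothetical stand-in for
the align() helper in Mesa's util headers):

#include <assert.h>
#include <stdint.h>

/* Round value up to a power-of-two multiple. */
static uint32_t
align_u32(uint32_t value, uint32_t alignment)
{
   return (value + alignment - 1) & ~(alignment - 1);
}

int
main(void)
{
   /* Params through IR3_DP_BASE_GROUP_Z (index 6) round up to two vec4s
    * (8 dwords); slot 7 shares the BASE_GROUP vec4, which is exactly
    * where IR3_DP_SUBGROUP_SIZE lands. */
   assert(align_u32(6, 4) == 8);
   /* Params through IR3_DP_SUBGROUP_ID_SHIFT (index 11) round up to
    * three vec4s, matching the driver_params[12] array in the direct
    * dispatch path below. */
   assert(align_u32(11, 4) == 12);
   return 0;
}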

@@ -4171,6 +4171,8 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
       &pipeline->program.link[type];
    const struct ir3_const_state *const_state = &link->const_state;
    uint32_t offset = const_state->offsets.driver_param;
+   unsigned subgroup_size = pipeline->compute.subgroup_size;
+   unsigned subgroup_shift = util_logbase2(subgroup_size);
 
    if (link->constlen <= offset)
       return;
@@ -4179,13 +4181,15 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
                         (link->constlen - offset) * 4);
 
    if (!info->indirect) {
-      uint32_t driver_params[8] = {
+      uint32_t driver_params[12] = {
          [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
         [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
         [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
         [IR3_DP_BASE_GROUP_X] = info->offsets[0],
         [IR3_DP_BASE_GROUP_Y] = info->offsets[1],
         [IR3_DP_BASE_GROUP_Z] = info->offsets[2],
+        [IR3_DP_SUBGROUP_SIZE] = subgroup_size,
+        [IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,
      };
 
      assert(num_consts <= ARRAY_SIZE(driver_params));
@@ -4236,19 +4240,28 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
       tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[0]));
    }
 
-   /* Zeroing of IR3_DP_BASE_GROUP_X/Y/Z for indirect dispatch */
+   /* Fill out IR3_DP_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for indirect
+    * dispatch.
+    */
    if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {
-      assert(num_consts == align(IR3_DP_BASE_GROUP_Z, 4));
       tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7);
       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_BASE_GROUP_X / 4)) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
-                 CP_LOAD_STATE6_0_NUM_UNIT(1));
+                 CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_BASE_GROUP_X) / 4));
       tu_cs_emit_qw(cs, 0);
-      for (uint32_t i = 0; i < 4; i++)
-         tu_cs_emit(cs, 0);
+      tu_cs_emit(cs, 0); /* BASE_GROUP_X */
+      tu_cs_emit(cs, 0); /* BASE_GROUP_Y */
+      tu_cs_emit(cs, 0); /* BASE_GROUP_Z */
+      tu_cs_emit(cs, subgroup_size);
+
+      if (num_consts > IR3_DP_LOCAL_GROUP_SIZE_X) {
+         assert(num_consts == align(IR3_DP_SUBGROUP_ID_SHIFT, 4));
+         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */
+         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */
+         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */
+         tu_cs_emit(cs, subgroup_shift);
+      }
    }
 }
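
For reference, a sketch of the arithmetic the emit path above relies on,
assuming util_logbase2() is Mesa's floor-log2 helper (illustrative C, not
from the commit):

#include <assert.h>
#include <stdint.h>

/* Stand-in for util_logbase2(): floor(log2(n)), exact for the
 * power-of-two subgroup sizes used here. */
static unsigned
log2_u32(uint32_t n)
{
   return 31 - __builtin_clz(n); /* GCC/Clang builtin */
}

int
main(void)
{
   /* Single threadsize: wave64, shift 6; double threadsize: wave128,
    * shift 7 (see tu_compute_pipeline_create below). */
   assert(log2_u32(64) == 6);
   assert(log2_u32(128) == 7);

   /* Indirect path: one CP_LOAD_STATE6 starting at IR3_DP_BASE_GROUP_X
    * (dword 4, a vec4 boundary) covers (12 - 4) / 4 = 2 vec4s when the
    * shader uses params up to IR3_DP_SUBGROUP_ID_SHIFT. */
   assert((12 - 4) / 4 == 2);
   return 0;
}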

@@ -3143,6 +3143,8 @@ tu_compute_pipeline_create(VkDevice device,
    for (int i = 0; i < 3; i++)
       pipeline->compute.local_size[i] = v->local_size[i];
 
+   pipeline->compute.subgroup_size = v->info.double_threadsize ? 128 : 64;
+
    struct tu_cs prog_cs;
    tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
    tu6_emit_cs_config(&prog_cs, shader, v, &pvtmem, shader_iova);

@@ -1189,6 +1189,7 @@ struct tu_pipeline
    struct
    {
       uint32_t local_size[3];
+      uint32_t subgroup_size;
    } compute;
 
    bool provoking_vertex_last;