broadcom/compiler: implement nir_intrinsic_load_subgroup_id correctly

For some reason, this was implemented with the bulk of the compute
shader enablement, but this intrinsic is specific to subgroups and
thus was not really used. Also, its implementation was not correct,
since it was returning the element index within the subgroup, not
the subgroup index itself, which is the index of the batch in the
dispatch.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11620>
This commit is contained in:
Iago Toral Quiroga 2021-06-22 10:55:04 +02:00
parent 71b7c7b0dc
commit 30dec8b414
1 changed files with 23 additions and 8 deletions

View File

@ -2773,6 +2773,13 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
}
}
static inline struct qreg
emit_load_local_invocation_index(struct v3d_compile *c)
{
return vir_SHR(c, c->cs_payload[1],
vir_uniform_ui(c, 32 - c->local_invocation_index_bits));
}
static void
ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
@ -3034,12 +3041,6 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
}
break;
case nir_intrinsic_load_local_invocation_index:
ntq_store_dest(c, &instr->dest, 0,
vir_SHR(c, c->cs_payload[1],
vir_uniform_ui(c, 32 - c->local_invocation_index_bits)));
break;
case nir_intrinsic_load_workgroup_id: {
struct qreg x = vir_AND(c, c->cs_payload[0],
vir_uniform_ui(c, 0xffff));
@ -3066,10 +3067,24 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
break;
}
case nir_intrinsic_load_subgroup_id:
ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
case nir_intrinsic_load_local_invocation_index:
ntq_store_dest(c, &instr->dest, 0,
emit_load_local_invocation_index(c));
break;
case nir_intrinsic_load_subgroup_id: {
/* This is basically the batch index, which is the Local
* Invocation Index divided by the SIMD width.
*/
STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS));
const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1;
struct qreg lii = emit_load_local_invocation_index(c);
ntq_store_dest(c, &instr->dest, 0,
vir_SHR(c, lii,
vir_uniform_ui(c, divide_shift)));
break;
}
case nir_intrinsic_load_per_vertex_input: {
/* The vertex shader writes all its used outputs into
* consecutive VPM offsets, so if any output component is