broadcom/compiler: implement nir_intrinsic_load_subgroup_id correctly

For some reason, this was implemented along with the bulk of the compute shader enablement, but this intrinsic is specific to subgroups and thus was not really used. Its implementation was also incorrect: it returned the element index within the subgroup rather than the subgroup index itself, which is the index of the batch in the dispatch.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11620>
2021-06-22 10:55:04 +02:00 · 2021-06-22 10:55:04 +02:00 · 30dec8b414
parent 71b7c7b0dc
commit 30dec8b414
1 changed files with 23 additions and 8 deletions
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@ -2773,6 +2773,13 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
        }
 }

+/* Emits the VIR for the compute shader local invocation index: the value in
+ * c->cs_payload[1] shifted right by (32 - c->local_invocation_index_bits).
+ * Returns the register holding the shifted result.
+ */
+static inline struct qreg
+emit_load_local_invocation_index(struct v3d_compile *c)
+{
+        /* The shift amount is passed to the shift instruction as a uniform. */
+        struct qreg shift =
+                vir_uniform_ui(c, 32 - c->local_invocation_index_bits);
+        return vir_SHR(c, c->cs_payload[1], shift);
+}
+
 static void
 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
 {
@ -3034,12 +3041,6 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                }
                break;

-        case nir_intrinsic_load_local_invocation_index:
-                ntq_store_dest(c, &instr->dest, 0,
-                               vir_SHR(c, c->cs_payload[1],
-                                       vir_uniform_ui(c, 32 - c->local_invocation_index_bits)));
-                break;
-
        case nir_intrinsic_load_workgroup_id: {
                struct qreg x = vir_AND(c, c->cs_payload[0],
                                         vir_uniform_ui(c, 0xffff));
@ -3066,10 +3067,24 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                break;
        }

-        case nir_intrinsic_load_subgroup_id:
-                ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
+        case nir_intrinsic_load_local_invocation_index:
+                ntq_store_dest(c, &instr->dest, 0,
+                               emit_load_local_invocation_index(c));
                break;

+        case nir_intrinsic_load_subgroup_id: {
+                /* This is basically the batch index, which is the Local
+                 * Invocation Index divided by the SIMD width.
+                 */
+                STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS));
+                const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1;
+                struct qreg lii = emit_load_local_invocation_index(c);
+                ntq_store_dest(c, &instr->dest, 0,
+                               vir_SHR(c, lii,
+                                       vir_uniform_ui(c, divide_shift)));
+                break;
+        }
+
        case nir_intrinsic_load_per_vertex_input: {
                /* The vertex shader writes all its used outputs into
                 * consecutive VPM offsets, so if any output component is