broadcom/compiler: implement 2x32 global intrinsics

Notice we ignore the high 32-bit component of the address because
we know it must be 0.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17275>
Author: Iago Toral Quiroga
Date:   2022-06-27 14:09:28 +02:00
Parent: b18cecbfb6
Commit: fa03d9c8be

2 changed files with 71 additions and 13 deletions
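
For context: at the NIR level a 2x32 global address is a two-component 32-bit vector carrying the low and high words of the address. A minimal sketch with nir_builder of how such an address is formed (illustrative only; b and addr_lo are assumed names, not taken from the patch):

        /* Sketch: build a 2x32 global address from a 32-bit value.  On V3D
         * the high word is always 0, which is what lets the backend ignore
         * the Y component. */
        nir_ssa_def *addr = nir_vec2(b, addr_lo, nir_imm_int(b, 0));
        nir_ssa_def *low  = nir_channel(b, addr, 0); /* the only part used */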


@@ -195,39 +195,51 @@ v3d_general_tmu_op(nir_intrinsic_instr *instr)
         case nir_intrinsic_load_uniform:
         case nir_intrinsic_load_shared:
         case nir_intrinsic_load_scratch:
+        case nir_intrinsic_load_global_2x32:
         case nir_intrinsic_store_ssbo:
         case nir_intrinsic_store_shared:
         case nir_intrinsic_store_scratch:
+        case nir_intrinsic_store_global_2x32:
                 return V3D_TMU_OP_REGULAR;
         case nir_intrinsic_ssbo_atomic_add:
                 return v3d_get_op_for_atomic_add(instr, 2);
         case nir_intrinsic_shared_atomic_add:
+        case nir_intrinsic_global_atomic_add_2x32:
                 return v3d_get_op_for_atomic_add(instr, 1);
         case nir_intrinsic_ssbo_atomic_imin:
+        case nir_intrinsic_global_atomic_imin_2x32:
         case nir_intrinsic_shared_atomic_imin:
                 return V3D_TMU_OP_WRITE_SMIN;
         case nir_intrinsic_ssbo_atomic_umin:
+        case nir_intrinsic_global_atomic_umin_2x32:
         case nir_intrinsic_shared_atomic_umin:
                 return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR;
         case nir_intrinsic_ssbo_atomic_imax:
+        case nir_intrinsic_global_atomic_imax_2x32:
         case nir_intrinsic_shared_atomic_imax:
                 return V3D_TMU_OP_WRITE_SMAX;
         case nir_intrinsic_ssbo_atomic_umax:
+        case nir_intrinsic_global_atomic_umax_2x32:
         case nir_intrinsic_shared_atomic_umax:
                 return V3D_TMU_OP_WRITE_UMAX;
         case nir_intrinsic_ssbo_atomic_and:
+        case nir_intrinsic_global_atomic_and_2x32:
         case nir_intrinsic_shared_atomic_and:
                 return V3D_TMU_OP_WRITE_AND_READ_INC;
         case nir_intrinsic_ssbo_atomic_or:
+        case nir_intrinsic_global_atomic_or_2x32:
         case nir_intrinsic_shared_atomic_or:
                 return V3D_TMU_OP_WRITE_OR_READ_DEC;
         case nir_intrinsic_ssbo_atomic_xor:
+        case nir_intrinsic_global_atomic_xor_2x32:
         case nir_intrinsic_shared_atomic_xor:
                 return V3D_TMU_OP_WRITE_XOR_READ_NOT;
         case nir_intrinsic_ssbo_atomic_exchange:
+        case nir_intrinsic_global_atomic_exchange_2x32:
         case nir_intrinsic_shared_atomic_exchange:
                 return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH;
         case nir_intrinsic_ssbo_atomic_comp_swap:
+        case nir_intrinsic_global_atomic_comp_swap_2x32:
         case nir_intrinsic_shared_atomic_comp_swap:
                 return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH;
         default:
@@ -489,7 +501,7 @@ emit_tmu_general_address_write(struct v3d_compile *c,
  */
 static void
 ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
-                     bool is_shared_or_scratch)
+                     bool is_shared_or_scratch, bool is_global)
 {
         uint32_t tmu_op = v3d_general_tmu_op(instr);
@@ -499,24 +511,27 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
          */
         bool atomic_add_replaced =
                 ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add ||
-                  instr->intrinsic == nir_intrinsic_shared_atomic_add) &&
+                  instr->intrinsic == nir_intrinsic_shared_atomic_add ||
+                  instr->intrinsic == nir_intrinsic_global_atomic_add_2x32) &&
                  (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC ||
                   tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC));

         bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
                          instr->intrinsic == nir_intrinsic_store_scratch ||
-                         instr->intrinsic == nir_intrinsic_store_shared);
+                         instr->intrinsic == nir_intrinsic_store_shared ||
+                         instr->intrinsic == nir_intrinsic_store_global_2x32);

         bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform ||
                         instr->intrinsic == nir_intrinsic_load_ubo ||
                         instr->intrinsic == nir_intrinsic_load_ssbo ||
                         instr->intrinsic == nir_intrinsic_load_scratch ||
-                        instr->intrinsic == nir_intrinsic_load_shared);
+                        instr->intrinsic == nir_intrinsic_load_shared ||
+                        instr->intrinsic == nir_intrinsic_load_global_2x32);

         if (!is_load)
                 c->tmu_dirty_rcl = true;

-        bool has_index = !is_shared_or_scratch;
+        bool has_index = !is_shared_or_scratch && !is_global;

         int offset_src;
         if (instr->intrinsic == nir_intrinsic_load_uniform) {
@@ -525,6 +540,7 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                    instr->intrinsic == nir_intrinsic_load_ubo ||
                    instr->intrinsic == nir_intrinsic_load_scratch ||
                    instr->intrinsic == nir_intrinsic_load_shared ||
+                   instr->intrinsic == nir_intrinsic_load_global_2x32 ||
                    atomic_add_replaced) {
                 offset_src = 0 + has_index;
         } else if (is_store) {
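
To make the has_index/offset_src bookkeeping concrete, this is the load-side source layout the function works with (a summary of the code above, not new behavior; stores carry the value in src[0] first):

        /* Load sources seen by ntq_emit_tmu_general():
         *   SSBO/UBO:              src[0] = buffer index, src[1] = offset
         *   shared/scratch/global: src[0] = offset; for global 2x32 this
         *                          is the address itself (vec2 of low and
         *                          high words)
         * has_index is false for shared, scratch and global, so for them
         * the offset source collapses back to index 0. */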
@@ -568,6 +584,12 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                         base_offset = c->cs_shared_offset;
                         const_offset += nir_intrinsic_base(instr);
                 }
+        } else if (is_global) {
+                /* Global load/store intrinsics use global addresses, so
+                 * the offset is the target address and we don't need to
+                 * add it to a base offset.
+                 */
+                base_offset = vir_uniform_ui(c, 0);
         } else {
                 base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
                                           nir_src_as_uint(instr->src[is_store ?
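
Condensed, this is how that zero base feeds the final address (a sketch using the names visible in this hunk, not the literal emit path; offset stands for the instruction's offset/address source):

        /* Sketch: the TMU target address is base_offset + offset, so with
         * is_global the base is the zero uniform set above and the vec2's
         * low word is used as the address unchanged. */
        struct qreg addr = vir_ADD(c, base_offset, offset);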
@@ -2713,7 +2735,7 @@ ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr)
         }

         if (!ntq_emit_load_unifa(c, instr)) {
-                ntq_emit_tmu_general(c, instr, false);
+                ntq_emit_tmu_general(c, instr, false, false);
                 c->has_general_tmu_load = true;
         }
 }
@@ -3291,13 +3313,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                 ntq_emit_load_uniform(c, instr);
                 break;

+        case nir_intrinsic_load_global_2x32:
+                ntq_emit_tmu_general(c, instr, false, true);
+                c->has_general_tmu_load = true;
+                break;
+
         case nir_intrinsic_load_ubo:
                 if (ntq_emit_inline_ubo_load(c, instr))
                         break;
                 FALLTHROUGH;
         case nir_intrinsic_load_ssbo:
                 if (!ntq_emit_load_unifa(c, instr)) {
-                        ntq_emit_tmu_general(c, instr, false);
+                        ntq_emit_tmu_general(c, instr, false, false);
                         c->has_general_tmu_load = true;
                 }
                 break;
@@ -3313,7 +3340,21 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_ssbo_atomic_exchange:
         case nir_intrinsic_ssbo_atomic_comp_swap:
         case nir_intrinsic_store_ssbo:
-                ntq_emit_tmu_general(c, instr, false);
+                ntq_emit_tmu_general(c, instr, false, false);
                 break;

+        case nir_intrinsic_global_atomic_add_2x32:
+        case nir_intrinsic_global_atomic_imin_2x32:
+        case nir_intrinsic_global_atomic_umin_2x32:
+        case nir_intrinsic_global_atomic_imax_2x32:
+        case nir_intrinsic_global_atomic_umax_2x32:
+        case nir_intrinsic_global_atomic_and_2x32:
+        case nir_intrinsic_global_atomic_or_2x32:
+        case nir_intrinsic_global_atomic_xor_2x32:
+        case nir_intrinsic_global_atomic_exchange_2x32:
+        case nir_intrinsic_global_atomic_comp_swap_2x32:
+        case nir_intrinsic_store_global_2x32:
+                ntq_emit_tmu_general(c, instr, false, true);
+                break;
+
         case nir_intrinsic_shared_atomic_add:
@@ -3328,12 +3369,12 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_shared_atomic_comp_swap:
         case nir_intrinsic_store_shared:
         case nir_intrinsic_store_scratch:
-                ntq_emit_tmu_general(c, instr, true);
+                ntq_emit_tmu_general(c, instr, true, false);
                 break;

         case nir_intrinsic_load_scratch:
         case nir_intrinsic_load_shared:
-                ntq_emit_tmu_general(c, instr, true);
+                ntq_emit_tmu_general(c, instr, true, false);
                 c->has_general_tmu_load = true;
                 break;

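The second changed file is the pass that splits wide loads and stores into per-channel 32-bit operations (lower_load_bitsize/lower_store_bitsize below); it now has to rebuild the vec2 address whenever it rewrites a global 2x32 intrinsic. As background, these *_2x32 intrinsics come from lowering explicit I/O with NIR's 2x32-bit global address format; a hedged sketch of how a driver opts in (s is an assumed nir_shader pointer):

        /* Assumption: requesting 2x32 global addresses when lowering
         * explicit I/O is what produces the *_global_2x32 intrinsics. */
        NIR_PASS_V(s, nir_lower_explicit_io, nir_var_mem_global,
                   nir_address_format_2x32bit_global);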

@@ -39,6 +39,7 @@ value_src(nir_intrinsic_op intrinsic)
         switch (intrinsic) {
         case nir_intrinsic_store_ssbo:
         case nir_intrinsic_store_scratch:
+        case nir_intrinsic_store_global_2x32:
                 return 0;
         default:
                 unreachable("Unsupported intrinsic");
@@ -52,10 +53,12 @@ offset_src(nir_intrinsic_op intrinsic)
         case nir_intrinsic_load_uniform:
         case nir_intrinsic_load_shared:
         case nir_intrinsic_load_scratch:
+        case nir_intrinsic_load_global_2x32:
                 return 0;
         case nir_intrinsic_load_ubo:
         case nir_intrinsic_load_ssbo:
         case nir_intrinsic_store_scratch:
+        case nir_intrinsic_store_global_2x32:
                 return 1;
         case nir_intrinsic_store_ssbo:
                 return 2;
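
Spelled out, the operand layout these two helpers encode for the new intrinsics (a restatement of the switches above):

        /* Operand slots per value_src()/offset_src():
         *   load_global_2x32:  src[0] = address (vec2 of low/high words)
         *   store_global_2x32: src[0] = value, src[1] = address (vec2)
         *   store_ssbo:        src[0] = value, src[2] = offset
         */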
@@ -125,6 +128,7 @@ lower_load_bitsize(struct v3d_compile *c,
         b->cursor = nir_before_instr(&intr->instr);

+        /* For global 2x32 we ignore the Y component because it must be zero */
         unsigned offset_idx = offset_src(intr->intrinsic);
         nir_ssa_def *offset = nir_ssa_for_src(b, intr->src[offset_idx], 1);
@@ -139,7 +143,12 @@ lower_load_bitsize(struct v3d_compile *c,
         for (unsigned i = 0; i < info->num_srcs; i++) {
                 if (i == offset_idx) {
-                        new_intr->src[i] = nir_src_for_ssa(scalar_offset);
+                        nir_ssa_def *final_offset;
+                        final_offset = intr->intrinsic != nir_intrinsic_load_global_2x32 ?
+                                scalar_offset :
+                                nir_vec2(b, scalar_offset,
+                                         nir_imm_int(b, 0));
+                        new_intr->src[i] = nir_src_for_ssa(final_offset);
                 } else {
                         new_intr->src[i] = intr->src[i];
                 }
@@ -178,6 +187,7 @@ lower_store_bitsize(struct v3d_compile *c,
         b->cursor = nir_before_instr(&intr->instr);

+        /* For global 2x32 we ignore the Y component because it must be zero */
         unsigned offset_idx = offset_src(intr->intrinsic);
         nir_ssa_def *offset = nir_ssa_for_src(b, intr->src[offset_idx], 1);
@@ -200,7 +210,12 @@ lower_store_bitsize(struct v3d_compile *c,
                                 nir_channels(b, value, 1 << component);
                         new_intr->src[i] = nir_src_for_ssa(scalar_value);
                 } else if (i == offset_idx) {
-                        new_intr->src[i] = nir_src_for_ssa(scalar_offset);
+                        nir_ssa_def *final_offset;
+                        final_offset = intr->intrinsic != nir_intrinsic_store_global_2x32 ?
+                                scalar_offset :
+                                nir_vec2(b, scalar_offset,
+                                         nir_imm_int(b, 0));
+                        new_intr->src[i] = nir_src_for_ssa(final_offset);
                 } else {
                         new_intr->src[i] = intr->src[i];
                 }
@@ -229,10 +244,12 @@ lower_load_store_bitsize(nir_builder *b, nir_instr *instr, void *data)
         case nir_intrinsic_load_ubo:
         case nir_intrinsic_load_uniform:
         case nir_intrinsic_load_scratch:
-                return lower_load_bitsize(c, b, intr);
+        case nir_intrinsic_load_global_2x32:
+                return lower_load_bitsize(c, b, intr);
         case nir_intrinsic_store_ssbo:
         case nir_intrinsic_store_scratch:
+        case nir_intrinsic_store_global_2x32:
                 return lower_store_bitsize(c, b, intr);
         default: