broadcom/compiler: use nir_lower_wrmasks to simplify TMU general stores

This pass splits writemasks with non-consecutive bits into multiple
store operations, ensuring that each store only has consecutive
writemask bits set.
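
As a rough illustration of the idea (a hand-written sketch, not the pass's
actual implementation), a writemask such as 0b1101 (components x, z and w)
is decomposed into runs of consecutive bits, one store per run:

    #include <stdio.h>
    #include <strings.h> /* ffs() */

    int main(void)
    {
            unsigned mask = 0xd; /* 0b1101: non-consecutive components x, z, w */

            while (mask) {
                    unsigned first = ffs(mask) - 1;             /* first bit of the run */
                    unsigned count = ffs(~(mask >> first)) - 1; /* length of the run    */

                    /* One store would be emitted for components [first, first + count). */
                    printf("store components [%u, %u)\n", first, first + count);

                    mask &= ~(((1u << count) - 1u) << first);   /* consume this run */
            }
            return 0;
    }

For 0b1101 this prints one store for component 0 and one for components 2..3.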

We can use this to simplify writemask handling in our backend, removing
a loop that existed solely to handle this case.
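
With that guarantee, the backend can describe every store by a single
first-component/count pair instead of looping over runs of bits. Roughly
(a sketch with a made-up helper name, not the actual
emit_tmu_general_store_writes code):

    #include <assert.h>
    #include <strings.h> /* ffs() */

    /* Sketch only: with consecutive writemask bits, the write is fully
     * described by its first component and component count, so no
     * splitting loop is needed in the backend.
     */
    static void wrmask_to_range(unsigned writemask, unsigned *first, unsigned *count)
    {
            *first = ffs(writemask) - 1;
            *count = __builtin_popcount(writemask);
            assert(writemask == (((1u << *count) - 1u) << *first));
    }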

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9619>
Iago Toral Quiroga 2021-03-16 09:51:45 +01:00 committed by Marge Bot
parent 51a263530f
commit aefac60741
2 changed files with 94 additions and 76 deletions

@@ -549,87 +549,88 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
         const uint32_t dest_components = nir_intrinsic_dest_components(instr);
         uint32_t base_const_offset = const_offset;
         uint32_t writemask = is_store ? nir_intrinsic_write_mask(instr) : 0;
-        do {
-                uint32_t tmu_writes = 0;
-                for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
-                        assert(mode == MODE_COUNT || tmu_writes > 0);
-
-                        if (is_store) {
-                                emit_tmu_general_store_writes(c, mode, instr,
-                                                              base_const_offset,
-                                                              &writemask,
-                                                              &const_offset,
-                                                              &tmu_writes);
-                        } else if (!is_load && !atomic_add_replaced) {
-                                emit_tmu_general_atomic_writes(c, mode, instr,
-                                                               tmu_op,
-                                                               has_index,
-                                                               &tmu_writes);
-                        }
-
-                        /* The spec says that for atomics, the TYPE field is
-                         * ignored, but that doesn't seem to be the case for
-                         * CMPXCHG. Just use the number of tmud writes we did
-                         * to decide the type (or choose "32bit" for atomic
-                         * reads, which has been fine).
-                         */
-                        uint32_t config = 0;
-                        if (mode == MODE_EMIT) {
-                                uint32_t num_components;
-                                if (is_load || atomic_add_replaced)
-                                        num_components = instr->num_components;
-                                else {
-                                        assert(tmu_writes > 0);
-                                        num_components = tmu_writes - 1;
-                                }
-
-                                uint32_t perquad =
-                                        is_load && !vir_in_nonuniform_control_flow(c)
-                                        ? GENERAL_TMU_LOOKUP_PER_QUAD
-                                        : GENERAL_TMU_LOOKUP_PER_PIXEL;
-                                config = 0xffffff00 | tmu_op << 3 | perquad;
-
-                                if (num_components == 1) {
-                                        config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
-                                } else {
-                                        config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
-                                                  num_components - 2;
-                                }
-                        }
-
-                        emit_tmu_general_address_write(c, mode, instr, config,
-                                                       dynamic_src,
-                                                       offset_src,
-                                                       base_offset,
-                                                       const_offset,
-                                                       &tmu_writes);
-
-                        assert(tmu_writes > 0);
-                        if (mode == MODE_COUNT) {
-                                /* Make sure we won't exceed the 16-entry TMU
-                                 * fifo if each thread is storing at the same
-                                 * time.
-                                 */
-                                while (tmu_writes > 16 / c->threads)
-                                        c->threads /= 2;
-
-                                /* If pipelining this TMU operation would
-                                 * overflow TMU fifos, we need to flush.
-                                 */
-                                if (ntq_tmu_fifo_overflow(c, dest_components))
-                                        ntq_flush_tmu(c);
-                        } else {
-                                /* Delay emission of the thread switch and
-                                 * LDTMU/TMUWT until we really need to do it to
-                                 * improve pipelining.
-                                 */
-                                const uint32_t component_mask =
-                                        (1 << dest_components) - 1;
-                                ntq_add_pending_tmu_flush(c, &instr->dest,
-                                                          component_mask);
-                        }
-                }
-        } while (is_store && writemask != 0);
+        uint32_t tmu_writes = 0;
+        for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) {
+                assert(mode == MODE_COUNT || tmu_writes > 0);
+
+                if (is_store) {
+                        emit_tmu_general_store_writes(c, mode, instr,
+                                                      base_const_offset,
+                                                      &writemask,
+                                                      &const_offset,
+                                                      &tmu_writes);
+                } else if (!is_load && !atomic_add_replaced) {
+                        emit_tmu_general_atomic_writes(c, mode, instr,
+                                                       tmu_op, has_index,
+                                                       &tmu_writes);
+                }
+
+                /* The spec says that for atomics, the TYPE field is
+                 * ignored, but that doesn't seem to be the case for
+                 * CMPXCHG. Just use the number of tmud writes we did
+                 * to decide the type (or choose "32bit" for atomic
+                 * reads, which has been fine).
+                 */
+                uint32_t config = 0;
+                if (mode == MODE_EMIT) {
+                        uint32_t num_components;
+                        if (is_load || atomic_add_replaced) {
+                                num_components = instr->num_components;
+                        } else {
+                                assert(tmu_writes > 0);
+                                num_components = tmu_writes - 1;
+                        }
+
+                        uint32_t perquad =
+                                is_load && !vir_in_nonuniform_control_flow(c)
+                                ? GENERAL_TMU_LOOKUP_PER_QUAD
+                                : GENERAL_TMU_LOOKUP_PER_PIXEL;
+                        config = 0xffffff00 | tmu_op << 3 | perquad;
+
+                        if (num_components == 1) {
+                                config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
+                        } else {
+                                config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 +
+                                          num_components - 2;
+                        }
+                }
+
+                emit_tmu_general_address_write(c, mode, instr, config,
+                                               dynamic_src, offset_src,
+                                               base_offset, const_offset,
+                                               &tmu_writes);
+
+                assert(tmu_writes > 0);
+                if (mode == MODE_COUNT) {
+                        /* Make sure we won't exceed the 16-entry TMU
+                         * fifo if each thread is storing at the same
+                         * time.
+                         */
+                        while (tmu_writes > 16 / c->threads)
+                                c->threads /= 2;
+
+                        /* If pipelining this TMU operation would
+                         * overflow TMU fifos, we need to flush.
+                         */
+                        if (ntq_tmu_fifo_overflow(c, dest_components))
+                                ntq_flush_tmu(c);
+                } else {
+                        /* Delay emission of the thread switch and
+                         * LDTMU/TMUWT until we really need to do it to
+                         * improve pipelining.
+                         */
+                        const uint32_t component_mask =
+                                (1 << dest_components) - 1;
+                        ntq_add_pending_tmu_flush(c, &instr->dest,
+                                                  component_mask);
+                }
+        }
+
+        /* nir_lower_wrmasks should've ensured that any writemask on a store
+         * operation only has consecutive bits set, in which case we should've
+         * processed the full writemask above.
+         */
+        assert(writemask == 0);
 }
 
 static struct qreg *

@@ -1072,6 +1072,21 @@ v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
         return false;
 }
 
+static bool
+should_split_wrmask(const nir_instr *instr, const void *data)
+{
+        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+        switch (intr->intrinsic) {
+        case nir_intrinsic_store_ssbo:
+        case nir_intrinsic_store_shared:
+        case nir_intrinsic_store_global:
+        case nir_intrinsic_store_scratch:
+                return true;
+        default:
+                return false;
+        }
+}
+
 static void
 v3d_attempt_compile(struct v3d_compile *c)
 {
@@ -1137,6 +1152,8 @@ v3d_attempt_compile(struct v3d_compile *c)
                 NIR_PASS_V(c->s, v3d_nir_lower_robust_buffer_access, c);
         }
 
+        NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
+
         v3d_optimize_nir(c->s);
 
         /* Do late algebraic optimization to turn add(a, neg(b)) back into