broadcom/compiler: be more aggressive skipping unifa writes

We had an optimization in place to skip a unifa write if the address
happens to be right after the last ldunifa read address, but we can
take this further and update the unifa address by emitting ldunifa
instructions if needed to skip a unifa write that is close enough.
This is worthwhile because a unifa write costs 4 cycles: 1 for the
write itself and 3 delay slots before we can emit the first ldunifa,
so up to 3 filler ldunifa instructions still cost less than a new write.

So if we have code like this:

unifa addr + 0
ldunifa.r0
unifa addr + 12
ldunifa.r1

In practice we end up with QPU code like this:

unifa addr + 0
nop
nop
nop
ldunifa.r0
unifa addr + 12
nop
nop
nop
ldunifa.r1

And with this patch we get:

unifa addr + 0
nop
nop
nop
ldunifa.r0  <--- reads offset 0
ldunifa.-   <--- reads offset 4
ldunifa.-   <--- reads offset 8
ldunifa.r1  <--- reads offset 12

Of course, QPU scheduling might find ways to fill the NOPs to some
extent and remove some of the gains, but generally speaking, this is
still usually a win.

Going by shader-db results, allowing the next unifa address to be up
to 12 bytes after the address resulting from the last ldunifa read
shows the best results:

total instructions in shared programs: 13817048 -> 13812202 (-0.04%)
instructions in affected programs: 602701 -> 597855 (-0.80%)
helped: 1750
HURT: 760
Instructions are helped.

total uniforms in shared programs: 3795485 -> 3793200 (-0.06%)
uniforms in affected programs: 43930 -> 41645 (-5.20%)
helped: 898
HURT: 0
Uniforms are helped.

total max-temps in shared programs: 2326612 -> 2326621 (<.01%)
max-temps in affected programs: 651 -> 660 (1.38%)
helped: 10
HURT: 21
Inconclusive result (value mean confidence interval includes 0).

total sfu-stalls in shared programs: 30942 -> 30906 (-0.12%)
sfu-stalls in affected programs: 627 -> 591 (-5.74%)
helped: 186
HURT: 158
Inconclusive result (value mean confidence interval includes 0).

total inst-and-stalls in shared programs: 13847990 -> 13843108 (-0.04%)
inst-and-stalls in affected programs: 601404 -> 596522 (-0.81%)
helped: 1747
HURT: 757
Inst-and-stalls are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9384>
This commit is contained in:
Iago Toral Quiroga 2021-03-02 13:05:09 +01:00
parent 2897a83ff8
commit c3732ac0d0
1 changed files with 22 additions and 6 deletions

View File

@ -2662,6 +2662,19 @@ ntq_emit_load_interpolated_input(struct v3d_compile *c,
return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C);
}
/* Emits one NOP instruction carrying the ldunifa signal.
 *
 * If 'result' is non-NULL the instruction is emitted as a definition and
 * the resulting register is written to *result. If it is NULL the
 * instruction is emitted without a destination, which the caller uses to
 * step past data it does not need.
 *
 * In both cases c->last_unifa_offset is advanced by 4 so that it keeps
 * tracking the offset following the last ldunifa emitted.
 */
static void
emit_ldunifa(struct v3d_compile *c, struct qreg *result)
{
        struct qinst *inst =
                vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef);

        inst->qpu.sig.ldunifa = true;

        if (!result) {
                /* Nobody wants the value: emit without a destination. */
                vir_emit_nondef(c, inst);
        } else {
                *result = vir_emit_def(c, inst);
        }

        c->last_unifa_offset += 4;
}
static void
ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
@ -2680,12 +2693,15 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
* constant offset loads.
*/
bool skip_unifa = false;
uint32_t ldunifa_skips = 0;
if (dynamic_src) {
c->last_unifa_block = NULL;
} else if (c->cur_block == c->last_unifa_block &&
c->last_unifa_index == index &&
c->last_unifa_offset == const_offset) {
c->last_unifa_offset <= const_offset &&
c->last_unifa_offset + 12 >= const_offset) {
skip_unifa = true;
ldunifa_skips = (const_offset - c->last_unifa_offset) / 4;
} else {
c->last_unifa_block = c->cur_block;
c->last_unifa_index = index;
@ -2704,15 +2720,15 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
vir_ADD_dest(c, unifa, base_offset,
ntq_get_src(c, instr->src[1], 0));
}
} else {
for (int i = 0; i < ldunifa_skips; i++)
emit_ldunifa(c, NULL);
}
for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) {
struct qinst *ldunifa =
vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef);
ldunifa->qpu.sig.ldunifa = true;
struct qreg data = vir_emit_def(c, ldunifa);
struct qreg data;
emit_ldunifa(c, &data);
ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data));
c->last_unifa_offset += 4;
}
}