broadcom/compiler: be more aggressive skipping unifa writes
We had an optimization in place to skip a unifa write if the address happens to be right after the last ldunifa read address, but we can take this further and update the unifa address by emitting ldunifa instructions if needed to skip a unifa write that is close enough.

This is because a unifa write involves 4 cycles: 1 for the write and 3 delay slots before we can emit the first ldunifa. So if we have code like this:

    unifa addr + 0
    ldunifa.r0
    unifa addr + 12
    ldunifa.r1

In practice we end up with QPU code like this:

    unifa addr + 0
    nop
    nop
    nop
    ldunifa.r0
    unifa addr + 12
    nop
    nop
    nop
    ldunifa.r1

And with this patch we get:

    unifa addr + 0
    nop
    nop
    nop
    ldunifa.r0    <--- reads offset 0
    ldunifa.-     <--- reads offset 4
    ldunifa.-     <--- reads offset 8
    ldunifa.r1    <--- reads offset 12

Of course, QPU scheduling might find ways to fill the NOPs to some extent and remove some of the gains, but generally speaking, this is still usually a win.

Going by shader-db results, allowing the next unifa address to be up to 12 bytes after the address resulting from the last ldunifa read shows the best results:

    total instructions in shared programs: 13817048 -> 13812202 (-0.04%)
    instructions in affected programs: 602701 -> 597855 (-0.80%)
    helped: 1750
    HURT: 760
    Instructions are helped.

    total uniforms in shared programs: 3795485 -> 3793200 (-0.06%)
    uniforms in affected programs: 43930 -> 41645 (-5.20%)
    helped: 898
    HURT: 0
    Uniforms are helped.

    total max-temps in shared programs: 2326612 -> 2326621 (<.01%)
    max-temps in affected programs: 651 -> 660 (1.38%)
    helped: 10
    HURT: 21
    Inconclusive result (value mean confidence interval includes 0).

    total sfu-stalls in shared programs: 30942 -> 30906 (-0.12%)
    sfu-stalls in affected programs: 627 -> 591 (-5.74%)
    helped: 186
    HURT: 158
    Inconclusive result (value mean confidence interval includes 0).

    total inst-and-stalls in shared programs: 13847990 -> 13843108 (-0.04%)
    inst-and-stalls in affected programs: 601404 -> 596522 (-0.81%)
    helped: 1747
    HURT: 757
    Inst-and-stalls are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9384>
This commit is contained in:
parent
2897a83ff8
commit
c3732ac0d0
|
@ -2662,6 +2662,19 @@ ntq_emit_load_interpolated_input(struct v3d_compile *c,
|
|||
return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C);
|
||||
}
|
||||
|
||||
static void
|
||||
emit_ldunifa(struct v3d_compile *c, struct qreg *result)
|
||||
{
|
||||
struct qinst *ldunifa =
|
||||
vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef);
|
||||
ldunifa->qpu.sig.ldunifa = true;
|
||||
if (result)
|
||||
*result = vir_emit_def(c, ldunifa);
|
||||
else
|
||||
vir_emit_nondef(c, ldunifa);
|
||||
c->last_unifa_offset += 4;
|
||||
}
|
||||
|
||||
static void
|
||||
ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
|
||||
{
|
||||
|
@ -2680,12 +2693,15 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
|
|||
* constant offset loads.
|
||||
*/
|
||||
bool skip_unifa = false;
|
||||
uint32_t ldunifa_skips = 0;
|
||||
if (dynamic_src) {
|
||||
c->last_unifa_block = NULL;
|
||||
} else if (c->cur_block == c->last_unifa_block &&
|
||||
c->last_unifa_index == index &&
|
||||
c->last_unifa_offset == const_offset) {
|
||||
c->last_unifa_offset <= const_offset &&
|
||||
c->last_unifa_offset + 12 >= const_offset) {
|
||||
skip_unifa = true;
|
||||
ldunifa_skips = (const_offset - c->last_unifa_offset) / 4;
|
||||
} else {
|
||||
c->last_unifa_block = c->cur_block;
|
||||
c->last_unifa_index = index;
|
||||
|
@ -2704,15 +2720,15 @@ ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr)
|
|||
vir_ADD_dest(c, unifa, base_offset,
|
||||
ntq_get_src(c, instr->src[1], 0));
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < ldunifa_skips; i++)
|
||||
emit_ldunifa(c, NULL);
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) {
|
||||
struct qinst *ldunifa =
|
||||
vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef);
|
||||
ldunifa->qpu.sig.ldunifa = true;
|
||||
struct qreg data = vir_emit_def(c, ldunifa);
|
||||
struct qreg data;
|
||||
emit_ldunifa(c, &data);
|
||||
ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data));
|
||||
c->last_unifa_offset += 4;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue