v3d: Delay emitting ldvpm on V3D 4.x until it's actually used.

For V3D 3.x, we emitted the ldvpms all at the top so that we didn't need
to do VPM setup when the load_inputs are out of order.  For V3D 4.x, we
can reduce register pressure by delaying our loads until they're actually
needed.  This also avoids a bunch of silly MOVs in the pre-opt VIR dump.

total instructions in shared programs: 6421415 -> 6419933 (-0.02%)
total uniforms in shared programs: 2393139 -> 2393140 (<.01%)
total threads in shared programs: 153864 -> 153906 (0.03%)
This commit is contained in:
Eric Anholt 2019-02-14 21:11:20 -08:00
parent 5a84d46896
commit 1a775d43c9
1 changed file with 43 additions and 6 deletions

View File

@ -1537,6 +1537,12 @@ ntq_setup_vpm_inputs(struct v3d_compile *c)
&num_components, ~0);
}
/* The actual loads will happen directly in nir_intrinsic_load_input
* on newer versions.
*/
if (c->devinfo->ver >= 40)
return;
for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {
resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
(loc + 1) * 4);
@ -1868,12 +1874,43 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
break;
case nir_intrinsic_load_input:
for (int i = 0; i < instr->num_components; i++) {
offset = (nir_intrinsic_base(instr) +
nir_src_as_uint(instr->src[0]));
int comp = nir_intrinsic_component(instr) + i;
ntq_store_dest(c, &instr->dest, i,
vir_MOV(c, c->inputs[offset * 4 + comp]));
offset = (nir_intrinsic_base(instr) +
nir_src_as_uint(instr->src[0]));
if (c->s->info.stage != MESA_SHADER_FRAGMENT &&
c->devinfo->ver >= 40) {
/* Emit the LDVPM directly now, rather than at the top
* of the shader like we did for V3D 3.x (which needs
* vpmsetup when not just taking the next offset).
*
* Note that delaying like this may introduce stalls,
* as LDVPMV takes a minimum of 1 instruction but may
* be slower if the VPM unit is busy with another QPU.
*/
int index = 0;
if (c->s->info.system_values_read &
(1ull << SYSTEM_VALUE_INSTANCE_ID)) {
index++;
}
if (c->s->info.system_values_read &
(1ull << SYSTEM_VALUE_VERTEX_ID)) {
index++;
}
for (int i = 0; i < offset; i++)
index += c->vattr_sizes[i];
index += nir_intrinsic_component(instr);
for (int i = 0; i < instr->num_components; i++) {
struct qreg vpm_offset =
vir_uniform_ui(c, index++);
ntq_store_dest(c, &instr->dest, i,
vir_LDVPMV_IN(c, vpm_offset));
}
} else {
for (int i = 0; i < instr->num_components; i++) {
int comp = nir_intrinsic_component(instr) + i;
ntq_store_dest(c, &instr->dest, i,
vir_MOV(c, c->inputs[offset * 4 +
comp]));
}
}
break;