broadcom/compiler: use nir_opt_load_store_vectorize
This will make it so we pack consecutive scalar operations into a vector
operation, reducing the amount of load/store operations in the NIR program.
Our backend can handle vector load/stores, and doing so may be more efficient
since we don't need to set up individual load/stores all the time.

A pathological case is:
dEQP-VK.spirv_assembly.instruction.compute.opcopymemory.array
which goes from 862 instructions to only 573 by converting all scalar SSBO
load/store operations to vec4 operations.

total instructions in shared programs: 13752607 -> 13733627 (-0.14%)
instructions in affected programs: 367117 -> 348137 (-5.17%)
helped: 1168
HURT: 371
Instructions are helped.

total threads in shared programs: 412230 -> 412272 (0.01%)
threads in affected programs: 54 -> 96 (77.78%)
helped: 23
HURT: 2
Threads are helped.

total uniforms in shared programs: 3790248 -> 3784601 (-0.15%)
uniforms in affected programs: 57417 -> 51770 (-9.84%)
helped: 1420
HURT: 19
Uniforms are helped.

total max-temps in shared programs: 2322170 -> 2322714 (0.02%)
max-temps in affected programs: 14353 -> 14897 (3.79%)
helped: 185
HURT: 306
Max-temps are HURT.

total spills in shared programs: 5940 -> 6010 (1.18%)
spills in affected programs: 65 -> 135 (107.69%)
helped: 0
HURT: 11

total fills in shared programs: 13372 -> 13494 (0.91%)
fills in affected programs: 75 -> 197 (162.67%)
helped: 0
HURT: 11

total sfu-stalls in shared programs: 31505 -> 31521 (0.05%)
sfu-stalls in affected programs: 751 -> 767 (2.13%)
helped: 210
HURT: 246
Inconclusive result (value mean confidence interval includes 0).

total inst-and-stalls in shared programs: 13784112 -> 13765148 (-0.14%)
inst-and-stalls in affected programs: 360283 -> 341319 (-5.26%)
helped: 1125
HURT: 366
Inst-and-stalls are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9619>
This commit is contained in:
parent
3db322f305
commit
51a263530f
|
@ -1745,6 +1745,33 @@ emit_geom_end(struct v3d_compile *c)
|
||||||
vir_VPMWT(c);
|
vir_VPMWT(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
|
||||||
|
unsigned bit_size,
|
||||||
|
unsigned num_components,
|
||||||
|
nir_intrinsic_instr *low,
|
||||||
|
nir_intrinsic_instr *high,
|
||||||
|
void *data)
|
||||||
|
{
|
||||||
|
/* Our backend is 32-bit only at present */
|
||||||
|
if (bit_size != 32)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (align_mul % 4 != 0 || align_offset % 4 != 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* Vector accesses wrap at 16-byte boundaries so we can't vectorize
|
||||||
|
* if the resulting vector crosses a 16-byte boundary.
|
||||||
|
*/
|
||||||
|
assert(util_is_power_of_two_nonzero(align_mul));
|
||||||
|
align_mul = MIN2(align_mul, 16);
|
||||||
|
align_offset &= 0xf;
|
||||||
|
if (16 - align_mul + align_offset + num_components * 4 > 16)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
v3d_optimize_nir(struct nir_shader *s)
|
v3d_optimize_nir(struct nir_shader *s)
|
||||||
{
|
{
|
||||||
|
@ -1769,6 +1796,15 @@ v3d_optimize_nir(struct nir_shader *s)
|
||||||
NIR_PASS(progress, s, nir_opt_algebraic);
|
NIR_PASS(progress, s, nir_opt_algebraic);
|
||||||
NIR_PASS(progress, s, nir_opt_constant_folding);
|
NIR_PASS(progress, s, nir_opt_constant_folding);
|
||||||
|
|
||||||
|
nir_load_store_vectorize_options vectorize_opts = {
|
||||||
|
.modes = nir_var_mem_ssbo | nir_var_mem_ubo |
|
||||||
|
nir_var_mem_push_const | nir_var_mem_shared |
|
||||||
|
nir_var_mem_global,
|
||||||
|
.callback = mem_vectorize_callback,
|
||||||
|
.robust_modes = 0,
|
||||||
|
};
|
||||||
|
NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
|
||||||
|
|
||||||
if (lower_flrp != 0) {
|
if (lower_flrp != 0) {
|
||||||
bool lower_flrp_progress = false;
|
bool lower_flrp_progress = false;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue