radeonsi: unroll loops of up to 128 iterations

It's not exactly 128 because longer loop bodies scale the number down.

This improves perf for VP13/Creo and Piano. Most other tests either didn't
show any difference or are CPU-bound.

v2:
- The lowering passes had to be moved to the optimization loop because unrolling creates lowerable variables.
- Piano has some pattern that looks like corruption and the pattern changed with loop unrolling.
  The pattern is present on other drivers as well.

v3:
- I removed the Piano test from CI traces because the image is random. The output was wrong even before
  this MR, and now it's randomly wrong.

|   PERCENTAGE DELTAS    |  Shaders |    SGPRs |    VGPRs |SpillSGPR |SpillVGPR | PrivVGPR |  Scratch | CodeSize | MaxWaves |
|------------------------|----------|----------|----------|----------|----------|----------|----------|----------|----------|
| alien_isolation        |      2936|    .     |    0.02 %|    .     |    .     |    .     |    .     |    0.83 %|    .     |
| deadcore               |        76|   18.47 %|    .     |    .     |    .     |    .     |    .     |  167.69 %|    .     |
| deus_ex_mankind_div..  |      1410|    0.10 %|    0.15 %|    .     |    .     |    .     |    .     |    1.70 %|    .     |
| f1-2015                |       775|    0.37 %|    0.16 %|    .     |    .     |    .     |    .     |    3.25 %|   -0.07 %|
| hitman                 |      1413|    0.10 %|   -0.03 %|    6.45 %|    .     |    .     |    .     |    0.61 %|    0.03 %|
| metro_2033_redux       |      2670|    .     |    .     |    .     |    .     |    .     |    .     |    0.13 %|    0.01 %|
| pixmark-piano-0.7.0    |         2|    .     |   14.29 %| -100.00 %|    .     |    .     |    .     |   78.07 %|   -4.76 %|
| reflections_subway     |        98|   -0.53 %|    .     |    .     |    .     |    .     |    .     |    7.64 %|    .     |
| thea                   |       172|    0.12 %|   -0.81 %|    .     |    .     |    .     |    .     |    0.65 %|    0.15 %|
| ubershaders            |        54|    .     |    .     |    .     |    .     |    .     |    .     |   61.13 %|    .     |
| ue4_effects_cave       |       290|    0.05 %|    .     |    .     |    .     |    .     |    .     |    2.62 %|    .     |
| vp13-creo              |        26|   -3.38 %|   -4.20 %|    .     |    .     |    .     |    .     |   88.56 %|    2.62 %|
| vp13-sw                |       100|   -0.36 %|   -9.14 %|    .     | -100.00 %|    .     | -100.00 %|  -17.97 %|    0.39 %|
| vp20-creo              |        22|   -0.82 %|   -3.33 %|    .     |    .     |    .     |    .     |   81.59 %|    1.51 %|
| vp20-sw                |       296|   -4.51 %|   -0.63 %|    .     |    .     |    .     |    .     |   58.93 %|    0.20 %|
|------------------------|----------|----------|----------|----------|----------|----------|----------|----------|----------|
| All affected           |       189|    3.05 %|   -2.87 %|  500.00 %| -100.00 %|    .     | -100.00 %|  135.61 %|    1.32 %|
|------------------------|----------|----------|----------|----------|----------|----------|----------|----------|----------|
| Total                  |     57794|    0.01 %|   -0.02 %|    0.27 %|   -3.13 %|    .     |   -2.89 %|    1.73 %|    .     |

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> (v1)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13966>
This commit is contained in:
Marek Olšák 2021-11-28 04:55:47 -05:00 committed by Marge Bot
parent af9ec3c45d
commit 9ff086052a
3 changed files with 5 additions and 9 deletions

View File

@ -34,10 +34,6 @@ traces:
expectations:
- device: gl-radeonsi-stoney
checksum: 84c499203944cdc59e70450c324bb8df
- path: gputest/pixmark-piano.trace
expectations:
- device: gl-radeonsi-stoney
checksum: a7317d54d452d19ce630c7f554f2279b
- path: gputest/triangle.trace
expectations:
- device: gl-radeonsi-stoney

View File

@ -1054,7 +1054,7 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
.has_dot_4x8 = sscreen->info.has_accelerated_dot_product,
.has_dot_2x16 = sscreen->info.has_accelerated_dot_product,
.optimize_sample_mask_in = true,
.max_unroll_iterations = 32,
.max_unroll_iterations = 128,
.max_unroll_iterations_aggressive = 128,
.use_interpolated_input_intrinsics = true,
.lower_uniforms_to_ubo = true,

View File

@ -597,15 +597,15 @@ void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
{
bool progress;
NIR_PASS_V(nir, nir_lower_vars_to_ssa);
NIR_PASS_V(nir, nir_lower_alu_to_scalar, si_alu_to_scalar_filter, sscreen);
NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
do {
progress = false;
bool lower_alu_to_scalar = false;
bool lower_phis_to_scalar = false;
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
NIR_PASS(progress, nir, nir_lower_alu_to_scalar, si_alu_to_scalar_filter, sscreen);
NIR_PASS(progress, nir, nir_lower_phis_to_scalar, false);
if (first) {
NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp);
NIR_PASS(lower_alu_to_scalar, nir, nir_shrink_vec_array_vars, nir_var_function_temp);