broadcom/compiler: refactor compile strategies

Until now, if we couldn't compile at 4 threads we would lower the thread
count with optimizations disabled. However, lowering the thread count
doubles the number of registers available per thread, which alone is
already a big relief for register pressure, so it makes sense to keep
optimizations enabled when we do that and only disable them progressively
afterwards, enabling spilling as a last resort.
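
The idea can be sketched with a small stand-alone toy model (everything
below is made up for illustration: the type names, register counts and
pressure numbers are not the driver's; the real strategy table and
fallback loop are in the v3d_compile() changes in the diff below):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy stand-in for the real strategy table: ordered from "all
 * optimizations on at 4 threads" down to "spill as a last resort". */
struct toy_strategy {
   const char *name;
   unsigned min_threads_for_reg_alloc;
   bool disable_optimizations;
   bool tmu_spilling_allowed;
};

static const struct toy_strategy toy_strategies[] = {
   { "default",                     4, false, false },
   { "disable optimizations",       4, true,  false },
   { "lower thread count",          1, false, false },
   { "disable optimizations (ltc)", 1, true,  false },
   { "spilling",                    1, true,  true  },
};

/* Pretend compile attempt: succeeds when the shader's register pressure
 * fits the registers the strategy grants, or when spilling is allowed. */
static bool
toy_compile(const struct toy_strategy *s, unsigned pressure)
{
   unsigned regs = 64 / s->min_threads_for_reg_alloc; /* fewer threads, more regs */
   if (s->disable_optimizations)
      pressure -= 8; /* disabling optimizations relieves some pressure */
   return s->tmu_spilling_allowed || pressure <= regs;
}

int main(void)
{
   const unsigned pressure = 40; /* made-up register pressure */
   for (size_t i = 0; i < sizeof(toy_strategies) / sizeof(toy_strategies[0]); i++) {
      if (toy_compile(&toy_strategies[i], pressure)) {
         printf("compiled with strategy '%s'\n", toy_strategies[i].name);
         return 0;
      }
      printf("'%s' failed register allocation, falling back\n",
             toy_strategies[i].name);
   }
   return 1;
}

With these made-up numbers the sketch settles on "lower thread count"
with optimizations still enabled, which is the case the previous fallback
path (optimizations disabled as soon as the thread count dropped) could
not reach.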

This can slightly improve performance for some applications. Sponza,
for example, gets a ~1.5% boost. I see several UE4 shaders that also get
compiled to better code at 2 threads with this, but it is more difficult
to assess how much this improves performance in practice due to the large
variance in frame times that we observe with UE4 demos.

Also, if a compile strategy disables an optimization that did not make
any progress in the previous compile attempt, we would end up re-compiling
the exact same shader code and failing again. This patch keeps track of
which strategies won't make progress and skips them in that case to save
some CPU time during shader compiles.

Care should be taken, though, to ensure that we try to compile with the
default NIR scheduler at the minimum thread count at least once, so a
specific strategy is added for this. Otherwise, a shader that uses no
optimizations and fails with the default strategy at 4 threads would skip
directly to the fallback scheduler.

Similarly, we now also explicitly specify which strategies are allowed to do
TMU spills and make sure we take this into account when deciding to skip
strategies. This prevents the case where no optimizations are used in a
shader and we skip directly to the fallback scheduler after failing
compilation at 2 threads with the default NIR scheduler but without trying
to spill first.
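
Put differently, the skip decision boils down to two questions. The
stand-alone sketch below is purely illustrative (the names are made up and
none of this code is in the driver; the real check is the
skip_compile_strategy() helper added in the diff below):

#include <assert.h>
#include <stdbool.h>

/* A strategy may only be skipped when it keeps the previous attempt's
 * spilling behaviour and the optimization it disables made no progress
 * there; otherwise we would either recompile identical code or reach the
 * fallback scheduler without ever trying to spill. */
static bool
may_skip_strategy(bool prev_spilling_allowed, bool cur_spilling_allowed,
                  bool optimization_made_progress)
{
   if (cur_spilling_allowed != prev_spilling_allowed)
      return false; /* spilling behaviour changed: always try it */
   return !optimization_made_progress;
}

int main(void)
{
   /* No progress, same spilling behaviour: skip the recompile. */
   assert(may_skip_strategy(false, false, false));
   /* No progress, but this strategy enables spilling: still try it. */
   assert(!may_skip_strategy(false, true, false));
   return 0;
}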

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10647>
Author: Iago Toral Quiroga
Date:   2021-05-03 10:30:31 +02:00
parent 296fe4daa6
commit d19ce36ff2
4 changed files with 133 additions and 60 deletions

@ -282,6 +282,8 @@ ntq_add_pending_tmu_flush(struct v3d_compile *c,
if (c->disable_tmu_pipelining)
ntq_flush_tmu(c);
else if (c->tmu.flush_count > 1)
c->pipelined_any_tmu = true;
}
enum emit_mode {
@ -1828,10 +1830,13 @@ v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s)
if (c && !c->disable_loop_unrolling &&
s->options->max_unroll_iterations > 0) {
NIR_PASS(progress, s, nir_opt_loop_unroll,
nir_var_shader_in |
nir_var_shader_out |
nir_var_function_temp);
bool local_progress = false;
NIR_PASS(local_progress, s, nir_opt_loop_unroll,
nir_var_shader_in |
nir_var_shader_out |
nir_var_function_temp);
c->unrolled_any_loops |= local_progress;
progress |= local_progress;
}
} while (progress);

@ -646,12 +646,14 @@ struct v3d_compile {
* TMU spills.
*/
bool disable_tmu_pipelining;
bool pipelined_any_tmu;
/* Disable sorting of UBO loads with constant offset. This may
* increase the chances of being able to compile shaders with high
* register pressure.
*/
bool disable_constant_ubo_load_sorting;
bool sorted_any_ubo_loads;
/* Emits ldunif for each new uniform, even if the uniform was already
* emitted in the same block. Useful to compile shaders with high
@ -662,6 +664,7 @@ struct v3d_compile {
/* Disables loop unrolling to reduce register pressure. */
bool disable_loop_unrolling;
bool unrolled_any_loops;
/* Minimum number of threads we are willing to use to register allocate
* a shader with the current compilation strategy. This only prevents
@ -671,6 +674,13 @@ struct v3d_compile {
*/
uint32_t min_threads_for_reg_alloc;
/* Whether TMU spills are allowed. If this is disabled it may cause
* register allocation to fail. We set this to favor other compilation
* strategies that can reduce register pressure and hopefully reduce or
* eliminate TMU spills in the shader.
*/
bool tmu_spilling_allowed;
/* The UBO index and block used with the last unifa load, as well as the
* current unifa offset *after* emitting that load. This is used to skip
* unifa writes (and their 3 delay slot) when the next UBO load reads

@ -526,6 +526,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
void *debug_output_data,
int program_id, int variant_id,
uint32_t min_threads_for_reg_alloc,
bool tmu_spilling_allowed,
bool disable_loop_unrolling,
bool disable_constant_ubo_load_sorting,
bool disable_tmu_pipelining,
@ -543,6 +544,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
c->debug_output_data = debug_output_data;
c->compilation_result = V3D_COMPILATION_SUCCEEDED;
c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
c->tmu_spilling_allowed = tmu_spilling_allowed;
c->fallback_scheduler = fallback_scheduler;
c->disable_tmu_pipelining = disable_tmu_pipelining;
c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
@ -1333,11 +1335,10 @@ v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
static bool
v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
{
bool progress = false;
nir_foreach_function(function, s) {
if (function->impl) {
nir_foreach_block(block, function->impl) {
progress |=
c->sorted_any_ubo_loads |=
v3d_nir_sort_constant_ubo_loads_block(c, block);
}
nir_metadata_preserve(function->impl,
@ -1345,7 +1346,7 @@ v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
nir_metadata_dominance);
}
}
return progress;
return c->sorted_any_ubo_loads;
}
static void
@ -1508,6 +1509,82 @@ int v3d_shaderdb_dump(struct v3d_compile *c,
c->nop_count);
}
/* This is a list of incremental changes to the compilation strategy
* that will be used to try to compile the shader successfully. The
* default strategy is to enable all optimizations which will have
* the highest register pressure but is expected to produce most
* optimal code. Following strategies incrementally disable specific
* optimizations that are known to contribute to register pressure
* in order to be able to compile the shader successfully while meeting
* thread count requirements.
*
* V3D 4.1+ has a min thread count of 2, but we can use 1 here to also
* cover previous hardware as well (meaning that we are not limiting
* register allocation to any particular thread count). This is fine
* because v3d_nir_to_vir will cap this to the actual minimum.
*/
struct v3d_compiler_strategy {
const char *name;
uint32_t min_threads_for_reg_alloc;
bool disable_loop_unrolling;
bool disable_ubo_load_sorting;
bool disable_tmu_pipelining;
bool tmu_spilling_allowed;
} static const strategies[] = {
/*0*/ { "default", 4, false, false, false, false },
/*1*/ { "disable loop unrolling", 4, true, false, false, false },
/*2*/ { "disable UBO load sorting", 4, true, true, false, false },
/*3*/ { "disable TMU pipelining", 4, true, true, true, false },
/*4*/ { "lower thread count", 1, false, false, false, false },
/*5*/ { "disable loop unrolling (ltc)", 1, true, false, false, false },
/*6*/ { "disable UBO load sorting (ltc)", 1, true, true, false, false },
/*7*/ { "disable TMU pipelining (ltc)", 1, true, true, true, true },
/*8*/ { "fallback scheduler", 1, true, true, true, true }
};
/**
* If a particular optimization didn't make any progress during a compile
* attempt, disabling it alone won't allow us to compile the shader successfully,
* since we'll end up with the same code. Detect these scenarios so we can
* avoid wasting time with useless compiles. We should also consider if the
* strategy changes other aspects of the compilation process though, like
* spilling, and not skip it in that case.
*/
static bool
skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
{
/* We decide if we can skip a strategy based on the optimizations that
* were active in the previous strategy, so we should only be calling this
* for strategies after the first.
*/
assert(idx > 0);
/* Don't skip a strategy that changes spilling behavior */
if (strategies[idx].tmu_spilling_allowed !=
strategies[idx - 1].tmu_spilling_allowed) {
return false;
}
switch (idx) {
/* Loop unrolling: skip if we didn't unroll any loops */
case 1:
case 5:
return !c->unrolled_any_loops;
/* UBO load sorting: skip if we didn't sort any loads */
case 2:
case 6:
return !c->sorted_any_ubo_loads;
/* TMU pipelining: skip if we didn't pipeline any TMU ops */
case 3:
case 7:
return !c->pipelined_any_tmu;
/* Lower thread count: skip if we already tried fewer than 4 threads */
case 4:
return c->threads < 4;
default:
return false;
};
}
uint64_t *v3d_compile(const struct v3d_compiler *compiler,
struct v3d_key *key,
struct v3d_prog_data **out_prog_data,
@ -1518,42 +1595,40 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
int program_id, int variant_id,
uint32_t *final_assembly_size)
{
struct v3d_compile *c;
/* This is a list of incremental changes to the compilation strategy
* that will be used to try to compile the shader successfully. The
* default strategy is to enable all optimizations which will have
* the highest register pressure but is expected to produce most
* optimal code. Following strategies incrementally disable specific
* optimizations that are known to contribute to register pressure
* in order to be able to compile the shader successfully while meeting
* thread count requirements.
*
* V3D 4.1+ has a min thread count of 2, but we can use 1 here to also
* cover previous hardware as well (meaning that we are not limiting
* register allocation to any particular thread count). This is fine
* because v3d_nir_to_vir will cap this to the actual minimum.
*/
struct v3d_compiler_strategy {
const char *name;
uint32_t min_threads_for_reg_alloc;
} static const strategies[] = {
{ "default", 4 },
{ "disable loop unrolling", 4 },
{ "disable UBO load sorting", 1 },
{ "disable TMU pipelining", 1 },
{ "fallback scheduler", 1 }
};
struct v3d_compile *c = NULL;
for (int i = 0; i < ARRAY_SIZE(strategies); i++) {
/* Fallback strategy */
if (i > 0) {
assert(c);
if (skip_compile_strategy(c, i))
continue;
char *debug_msg;
int ret = asprintf(&debug_msg,
"Falling back to strategy '%s' for %s",
strategies[i].name,
vir_get_stage_name(c));
if (ret >= 0) {
if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF))
fprintf(stderr, "%s\n", debug_msg);
c->debug_output(debug_msg, c->debug_output_data);
free(debug_msg);
}
vir_compile_destroy(c);
}
c = vir_compile_init(compiler, key, s,
debug_output, debug_output_data,
program_id, variant_id,
strategies[i].min_threads_for_reg_alloc,
i > 0, /* Disable loop unrolling */
i > 1, /* Disable UBO load sorting */
i > 2, /* Disable TMU pipelining */
i > 3 /* Fallback_scheduler */);
strategies[i].tmu_spilling_allowed,
strategies[i].disable_loop_unrolling,
strategies[i].disable_ubo_load_sorting,
strategies[i].disable_tmu_pipelining,
i == ARRAY_SIZE(strategies) - 1);
v3d_attempt_compile(c);
@ -1562,23 +1637,6 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
V3D_COMPILATION_FAILED_REGISTER_ALLOCATION) {
break;
}
/* Fallback strategy */
char *debug_msg;
int ret = asprintf(&debug_msg,
"Falling back to strategy '%s' for %s",
strategies[i + 1].name,
vir_get_stage_name(c));
if (ret >= 0) {
if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF))
fprintf(stderr, "%s\n", debug_msg);
c->debug_output(debug_msg, c->debug_output_data);
free(debug_msg);
}
vir_compile_destroy(c);
}
if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) &&

@ -506,15 +506,15 @@ get_spill_batch_size(struct v3d_compile *c)
return 20;
}
/* Don't emit spills using the TMU until we've dropped thread count first. Also,
* don't spill if we have enabled any other optimization that can lead to
* higher register pressure, such as TMU pipelining, we rather recompile without
* the optimization in that case.
/* Don't emit spills using the TMU until we've dropped thread count first. We
* may also disable spilling when certain optimizations that are known to
* increase register pressure are active, so we favor recompiling with
* optimizations disabled instead of spilling.
*/
static inline bool
tmu_spilling_allowed(struct v3d_compile *c, int thread_index)
{
return thread_index == 0 && c->disable_tmu_pipelining;
return thread_index == 0 && c->tmu_spilling_allowed;
}
#define CLASS_BIT_PHYS (1 << 0)