broadcom/compiler: specify maximum thread count in compile strategies

Once we have exhausted the compile strategies at 4 threads and start
enabling lower thread counts, there is no point in starting those
compiles with 4 threads: we know they will fail, so let's start at
2 in these cases.

This has another nice implication: if the driver compiled at 4
threads and failed to register allocate, we allowed it to retry
with 2 threads, but this only retried the register allocation
process and did not really recompile the shader with 2 threads. This
is not optimal, because at 2 threads we have more TMU FIFO space for
each thread and can do more TMU pipelining, so we were missing that
opportunity.

This improves performance in Sponza by ~1.5% and also seems to help
UE4 slightly.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10647>
This commit is contained in:
Iago Toral Quiroga 2021-05-05 11:26:13 +02:00
parent d19ce36ff2
commit c11e479852
1 changed files with 15 additions and 12 deletions

View File

@ -525,6 +525,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
void *debug_output_data),
void *debug_output_data,
int program_id, int variant_id,
uint32_t max_threads,
uint32_t min_threads_for_reg_alloc,
bool tmu_spilling_allowed,
bool disable_loop_unrolling,
@ -539,7 +540,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
c->key = key;
c->program_id = program_id;
c->variant_id = variant_id;
c->threads = 4;
c->threads = max_threads;
c->debug_output = debug_output;
c->debug_output_data = debug_output_data;
c->compilation_result = V3D_COMPILATION_SUCCEEDED;
@ -1525,21 +1526,22 @@ int v3d_shaderdb_dump(struct v3d_compile *c,
*/
struct v3d_compiler_strategy {
const char *name;
uint32_t min_threads_for_reg_alloc;
uint32_t max_threads;
uint32_t min_threads;
bool disable_loop_unrolling;
bool disable_ubo_load_sorting;
bool disable_tmu_pipelining;
bool tmu_spilling_allowed;
} static const strategies[] = {
/*0*/ { "default", 4, false, false, false, false },
/*1*/ { "disable loop unrolling", 4, true, false, false, false },
/*2*/ { "disable UBO load sorting", 4, true, true, false, false },
/*3*/ { "disable TMU pipelining", 4, true, true, true, false },
/*4*/ { "lower thread count", 1, false, false, false, false },
/*5*/ { "disable loop unrolling (ltc)", 1, true, false, false, false },
/*6*/ { "disable UBO load sorting (ltc)", 1, true, true, false, false },
/*7*/ { "disable TMU pipelining (ltc)", 1, true, true, true, true },
/*8*/ { "fallback scheduler", 1, true, true, true, true }
/*0*/ { "default", 4, 4, false, false, false, false },
/*1*/ { "disable loop unrolling", 4, 4, true, false, false, false },
/*2*/ { "disable UBO load sorting", 4, 4, true, true, false, false },
/*3*/ { "disable TMU pipelining", 4, 4, true, true, true, false },
/*4*/ { "lower thread count", 2, 1, false, false, false, false },
/*5*/ { "disable loop unrolling (ltc)", 2, 1, true, false, false, false },
/*6*/ { "disable UBO load sorting (ltc)", 2, 1, true, true, false, false },
/*7*/ { "disable TMU pipelining (ltc)", 2, 1, true, true, true, true },
/*8*/ { "fallback scheduler", 2, 1, true, true, true, true }
};
/**
@ -1623,7 +1625,8 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
c = vir_compile_init(compiler, key, s,
debug_output, debug_output_data,
program_id, variant_id,
strategies[i].min_threads_for_reg_alloc,
strategies[i].max_threads,
strategies[i].min_threads,
strategies[i].tmu_spilling_allowed,
strategies[i].disable_loop_unrolling,
strategies[i].disable_ubo_load_sorting,