From 40e091267dd02d729cc6d12d190309f103217111 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga
Date: Fri, 4 Feb 2022 13:40:50 +0100
Subject: [PATCH] broadcom/compiler: define max number of tmu spills for
 compile strategies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of whether they are allowed to spill or not. This is more
flexible.

Also, while we are not currently enabling spilling on any 4-thread
strategies, should we do that in the future, always prefer a 4-thread
compile.

Reviewed-by: Alejandro Piñeiro
Part-of:
---
 src/broadcom/compiler/nir_to_vir.c            |  2 +-
 src/broadcom/compiler/v3d_compiler.h          |  2 +-
 src/broadcom/compiler/vir.c                   | 41 +++++++++++--------
 src/broadcom/compiler/vir_register_allocate.c | 11 ++---
 4 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index abfbecc502f..b645ef5790a 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -4473,7 +4473,7 @@ v3d_nir_to_vir(struct v3d_compile *c)
         while (true) {
                 bool spilled;
                 temp_registers = v3d_register_allocate(c, &spilled);
-                if (spilled)
+                if (spilled && c->spills + c->fills <= c->max_tmu_spills)
                         continue;
 
                 if (temp_registers)
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index e42ea184c45..844a9603606 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -710,7 +710,7 @@ struct v3d_compile {
          * strategies that can reduce register pressure and hopefully reduce or
          * eliminate TMU spills in the shader.
          */
-        bool tmu_spilling_allowed;
+        uint32_t max_tmu_spills;
 
         /* The UBO index and block used with the last unifa load, as well as the
          * current unifa offset *after* emitting that load. This is used to skip
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index a08206fdf30..4753b2e274e 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -541,7 +541,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
                  int program_id, int variant_id,
                  uint32_t max_threads,
                  uint32_t min_threads_for_reg_alloc,
-                 bool tmu_spilling_allowed,
+                 uint32_t max_tmu_spills,
                  bool disable_loop_unrolling,
                  bool disable_constant_ubo_load_sorting,
                  bool disable_tmu_pipelining,
@@ -559,7 +559,7 @@ vir_compile_init(const struct v3d_compiler *compiler,
         c->debug_output_data = debug_output_data;
         c->compilation_result = V3D_COMPILATION_SUCCEEDED;
         c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
-        c->tmu_spilling_allowed = tmu_spilling_allowed;
+        c->max_tmu_spills = max_tmu_spills;
         c->fallback_scheduler = fallback_scheduler;
         c->disable_tmu_pipelining = disable_tmu_pipelining;
         c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
@@ -1624,17 +1624,17 @@ struct v3d_compiler_strategy {
         bool disable_loop_unrolling;
         bool disable_ubo_load_sorting;
         bool disable_tmu_pipelining;
-        bool tmu_spilling_allowed;
+        uint32_t max_tmu_spills;
 } static const strategies[] = {
-        /*0*/ { "default",                        4, 4, false, false, false, false },
-        /*1*/ { "disable loop unrolling",         4, 4, true,  false, false, false },
-        /*2*/ { "disable UBO load sorting",       4, 4, true,  true,  false, false },
-        /*3*/ { "disable TMU pipelining",         4, 4, true,  true,  true,  false },
-        /*4*/ { "lower thread count",             2, 1, false, false, false, true },
-        /*5*/ { "disable loop unrolling (ltc)",   2, 1, true,  false, false, true },
-        /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true,  true,  false, true },
-        /*7*/ { "disable TMU pipelining (ltc)",   2, 1, true,  true,  true,  true },
-        /*8*/ { "fallback scheduler",             2, 1, true,  true,  true,  true }
+        /*0*/ { "default",                        4, 4, false, false, false,  0 },
+        /*1*/ { "disable loop unrolling",         4, 4, true,  false, false,  0 },
+        /*2*/ { "disable UBO load sorting",       4, 4, true,  true,  false,  0 },
+        /*3*/ { "disable TMU pipelining",         4, 4, true,  true,  true,   0 },
+        /*4*/ { "lower thread count",             2, 1, false, false, false, -1 },
+        /*5*/ { "disable loop unrolling (ltc)",   2, 1, true,  false, false, -1 },
+        /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true,  true,  false, -1 },
+        /*7*/ { "disable TMU pipelining (ltc)",   2, 1, true,  true,  true,  -1 },
+        /*8*/ { "fallback scheduler",             2, 1, true,  true,  true,  -1 }
 };
 
 /**
@@ -1655,8 +1655,8 @@ skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
         assert(idx > 0);
 
         /* Don't skip a strategy that changes spilling behavior */
-        if (strategies[idx].tmu_spilling_allowed !=
-            strategies[idx - 1].tmu_spilling_allowed) {
+        if (strategies[idx].max_tmu_spills !=
+            strategies[idx - 1].max_tmu_spills) {
                 return false;
         }
 
@@ -1726,7 +1726,7 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                                          program_id, variant_id,
                                          strategies[strat].max_threads,
                                          strategies[strat].min_threads,
-                                         strategies[strat].tmu_spilling_allowed,
+                                         strategies[strat].max_tmu_spills,
                                          strategies[strat].disable_loop_unrolling,
                                          strategies[strat].disable_ubo_load_sorting,
                                          strategies[strat].disable_tmu_pipelining,
@@ -1738,11 +1738,16 @@ uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                 if (c->compilation_result == V3D_COMPILATION_FAILED)
                         break;
 
-                /* If we compiled without spills, choose this. Otherwise keep
-                 * going and track strategy with less spilling.
+                /* If we compiled without spills, choose this.
+                 * Otherwise if this is a 4-thread compile, choose this (these
+                 * have a very low cap on the allowed TMU spills so we assume
+                 * it will be better than a 2-thread compile without spills).
+                 * Otherwise, keep going while tracking the strategy with the
+                 * lowest spill count.
                  */
                 if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) {
-                        if (c->spills == 0) {
+                        if (c->spills == 0 ||
+                            strategies[strat].min_threads == 4) {
                                 best_c = c;
                                 break;
                         } else if (c->spills + c->fills <
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index e26b790c946..05b71e3369a 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -561,15 +561,10 @@ get_spill_batch_size(struct v3d_compile *c)
         return 20;
 }
 
-/* Don't emit spills using the TMU until we've dropped thread count first. We,
- * may also disable spilling when certain optimizations that are known to
- * increase register pressure are active so we favor recompiling with
- * optimizations disabled instead of spilling.
- */
 static inline bool
-tmu_spilling_allowed(struct v3d_compile *c, int thread_index)
+tmu_spilling_allowed(struct v3d_compile *c)
 {
-        return thread_index == 0 && c->tmu_spilling_allowed;
+        return c->spills + c->fills < c->max_tmu_spills;
 }
 
 #define CLASS_BIT_PHYS (1 << 0)
@@ -818,7 +813,7 @@ v3d_register_allocate(struct v3d_compile *c, bool *spilled)
                 if (i > 0 && !is_uniform)
                         break;
 
-                if (is_uniform || tmu_spilling_allowed(c, thread_index)) {
+                if (is_uniform || tmu_spilling_allowed(c)) {
                         v3d_spill_reg(c, map[node].temp);
 
                         /* Ask the outer loop to call back in. */