diff --git a/src/panfrost/bifrost/bi_pressure_schedule.c b/src/panfrost/bifrost/bi_pressure_schedule.c
new file mode 100644
index 00000000000..f27331cb16a
--- /dev/null
+++ b/src/panfrost/bifrost/bi_pressure_schedule.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (C) 2022 Collabora Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors (Collabora):
+ *      Alyssa Rosenzweig
+ */
+
+/* Bottom-up local scheduler to reduce register pressure */
+
+#include "compiler.h"
+#include "util/dag.h"
+
+struct sched_ctx {
+        /* Dependency graph */
+        struct dag *dag;
+
+        /* Live set */
+        uint8_t *live;
+
+        /* Size of the live set */
+        unsigned max;
+};
+
+struct sched_node {
+        struct dag_node dag;
+
+        /* Instruction this node represents */
+        bi_instr *instr;
+};
+
+static unsigned
+label_index(bi_context *ctx, bi_index idx)
+{
+        if (idx.reg) {
+                assert(idx.value < ctx->reg_alloc);
+                return idx.value + ctx->ssa_alloc;
+        } else {
+                assert(idx.value < ctx->ssa_alloc);
+                return idx.value;
+        }
+}
+
+static void
+add_dep(struct sched_node *a, struct sched_node *b)
+{
+        if (a && b)
+                dag_add_edge(&a->dag, &b->dag, 0);
+}
+
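+/*
+ * Build the dependency DAG for a block. Reads depend on the last write of
+ * each value, and writes depend on the last read and last write. Memory
+ * access, blend/tile/depth-stencil messages, ATEST, and DISCARD are
+ * serialized through the memory_load/memory_store/coverage nodes tracked
+ * below, and register preloads are kept at the top of the block via the
+ * preload node.
+ */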
+static struct dag *
+create_dag(bi_context *ctx, bi_block *block, void *memctx)
+{
+        struct dag *dag = dag_create(memctx);
+
+        unsigned count = ctx->ssa_alloc + ctx->reg_alloc;
+        struct sched_node **last_read =
+                calloc(count, sizeof(struct sched_node *));
+        struct sched_node **last_write =
+                calloc(count, sizeof(struct sched_node *));
+        struct sched_node *coverage = NULL;
+        struct sched_node *preload = NULL;
+
+        /* Last memory load, to serialize stores against */
+        struct sched_node *memory_load = NULL;
+
+        /* Last memory store, to serialize loads and stores against */
+        struct sched_node *memory_store = NULL;
+
+        bi_foreach_instr_in_block(block, I) {
+                /* Leave branches at the end */
+                if (I->op == BI_OPCODE_JUMP || bi_opcode_props[I->op].branch)
+                        break;
+
+                assert(I->branch_target == NULL);
+
+                struct sched_node *node = rzalloc(memctx, struct sched_node);
+                node->instr = I;
+                dag_init_node(dag, &node->dag);
+
+                /* Reads depend on writes */
+                bi_foreach_src(I, s) {
+                        bi_index src = I->src[s];
+
+                        if (src.type == BI_INDEX_NORMAL) {
+                                add_dep(node, last_write[label_index(ctx, src)]);
+
+                                /* Serialize access to nir_register for
+                                 * simplicity. We could do better.
+                                 */
+                                if (src.reg)
+                                        add_dep(node, last_read[label_index(ctx, src)]);
+                        }
+                }
+
+                /* Writes depend on reads and writes */
+                bi_foreach_dest(I, s) {
+                        bi_index dest = I->dest[s];
+
+                        if (dest.type == BI_INDEX_NORMAL) {
+                                add_dep(node, last_read[label_index(ctx, dest)]);
+                                add_dep(node, last_write[label_index(ctx, dest)]);
+
+                                last_write[label_index(ctx, dest)] = node;
+                        }
+                }
+
+                bi_foreach_src(I, s) {
+                        bi_index src = I->src[s];
+
+                        if (src.type == BI_INDEX_NORMAL) {
+                                last_read[label_index(ctx, src)] = node;
+                        }
+                }
+
+                switch (bi_opcode_props[I->op].message) {
+                case BIFROST_MESSAGE_LOAD:
+                        /* Regular memory loads need to be serialized against
+                         * other memory access. However, UBO memory is read-only
+                         * so it can be moved around freely.
+                         */
+                        if (I->seg != BI_SEG_UBO) {
+                                add_dep(node, memory_store);
+                                memory_load = node;
+                        }
+
+                        break;
+
+                case BIFROST_MESSAGE_STORE:
+                        assert(I->seg != BI_SEG_UBO);
+                        add_dep(node, memory_load);
+                        add_dep(node, memory_store);
+                        memory_store = node;
+                        break;
+
+                case BIFROST_MESSAGE_ATOMIC:
+                case BIFROST_MESSAGE_BARRIER:
+                        /* Atomics and barriers act as both reads and writes */
+                        add_dep(node, memory_load);
+                        add_dep(node, memory_store);
+                        memory_load = node;
+                        memory_store = node;
+                        break;
+
+                case BIFROST_MESSAGE_BLEND:
+                case BIFROST_MESSAGE_Z_STENCIL:
+                case BIFROST_MESSAGE_TILE:
+                        add_dep(node, coverage);
+                        coverage = node;
+                        break;
+
+                case BIFROST_MESSAGE_ATEST:
+                        /* ATEST signals the end of shader side effects */
+                        add_dep(node, memory_store);
+                        memory_store = node;
+
+                        /* ATEST also updates coverage */
+                        add_dep(node, coverage);
+                        coverage = node;
+                        break;
+
+                default:
+                        break;
+                }
+
+                /* Everything depends on the last preload, keeping preloads
+                 * first and in their original order */
+                add_dep(node, preload);
+
+                if (I->op == BI_OPCODE_DISCARD_F32) {
+                        /* Serialize against ATEST */
+                        add_dep(node, coverage);
+                        coverage = node;
+
+                        /* Also serialize against memory and barriers */
+                        add_dep(node, memory_load);
+                        add_dep(node, memory_store);
+                        memory_load = node;
+                        memory_store = node;
+                } else if (I->op == BI_OPCODE_MOV_I32 &&
+                           I->src[0].type == BI_INDEX_REGISTER) {
+                        preload = node;
+                }
+        }
+
+        free(last_read);
+        free(last_write);
+
+        return dag;
+}
+
+/*
+ * Calculate the change in register pressure from scheduling a given
+ * instruction. Equivalently, calculate the difference in the number of live
+ * registers before and after the instruction, given the live set after the
+ * instruction. This calculation follows immediately from the dataflow
+ * definition of liveness:
+ *
+ *      live_in = (live_out - KILL) + GEN
+ */
+static signed
+calculate_pressure_delta(bi_instr *I, uint8_t *live, unsigned max)
+{
+        signed delta = 0;
+
+        /* Destinations must be unique */
+        bi_foreach_dest(I, d) {
+                unsigned node = bi_get_node(I->dest[d]);
+
+                if (node < max && live[node])
+                        delta -= bi_count_write_registers(I, d);
+        }
+
+        bi_foreach_src(I, src) {
+                unsigned node = bi_get_node(I->src[src]);
+                if (node >= max)
+                        continue;
+
+                /* Filter duplicates */
+                bool dupe = false;
+
+                for (unsigned i = 0; i < src; ++i) {
+                        if (bi_get_node(I->src[i]) == node) {
+                                dupe = true;
+                                break;
+                        }
+                }
+
+                if (!dupe && !live[node])
+                        delta += bi_count_read_registers(I, src);
+        }
+
+        return delta;
+}
+
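+/*
+ * Worked example (hypothetical IR, scheduling bottom-up): for
+ * "z = FADD.f32 x, y" with z live below the instruction and x, y not yet
+ * live, the destination z is killed (-1 register) and the sources x and y
+ * become live (+2), giving a delta of +1. A repeated source, as in
+ * "z = FADD.f32 x, x", is counted only once thanks to the duplicate filter.
+ */
+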
+/*
+ * Choose the next instruction, bottom-up. For now we use a simple greedy
+ * heuristic: choose the instruction with the smallest pressure delta, i.e.
+ * the one that frees the most (or adds the fewest) registers.
+ */
+static struct sched_node *
+choose_instr(struct sched_ctx *s)
+{
+        int32_t min_delta = INT32_MAX;
+        struct sched_node *best = NULL;
+
+        list_for_each_entry(struct sched_node, n, &s->dag->heads, dag.link) {
+                int32_t delta = calculate_pressure_delta(n->instr, s->live, s->max);
+
+                if (delta < min_delta) {
+                        best = n;
+                        min_delta = delta;
+                }
+        }
+
+        return best;
+}
+
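+/*
+ * Schedule a block for pressure: measure the maximum pressure of the
+ * existing schedule, greedily build a new bottom-up schedule, and apply it
+ * only if it strictly lowers the maximum. Both measurements omit the same
+ * constant (the values live out of the block), so the comparison remains
+ * valid.
+ */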
+static void
+pressure_schedule_block(bi_context *ctx, bi_block *block, struct sched_ctx *s)
+{
+        /* off by a constant, that's ok */
+        signed pressure = 0;
+        signed orig_max_pressure = 0;
+        unsigned nr_ins = 0;
+
+        memcpy(s->live, block->live_out, s->max);
+
+        bi_foreach_instr_in_block_rev(block, I) {
+                pressure += calculate_pressure_delta(I, s->live, s->max);
+                orig_max_pressure = MAX2(pressure, orig_max_pressure);
+                bi_liveness_ins_update(s->live, I, s->max);
+                nr_ins++;
+        }
+
+        memcpy(s->live, block->live_out, s->max);
+
+        /* off by a constant, that's ok */
+        signed max_pressure = 0;
+        pressure = 0;
+
+        struct sched_node **schedule = calloc(nr_ins, sizeof(struct sched_node *));
+        nr_ins = 0;
+
+        while (!list_is_empty(&s->dag->heads)) {
+                struct sched_node *node = choose_instr(s);
+                pressure += calculate_pressure_delta(node->instr, s->live, s->max);
+                max_pressure = MAX2(pressure, max_pressure);
+                dag_prune_head(s->dag, &node->dag);
+
+                schedule[nr_ins++] = node;
+                bi_liveness_ins_update(s->live, node->instr, s->max);
+        }
+
+        /* Bail if it looks like it's worse */
+        if (max_pressure >= orig_max_pressure) {
+                free(schedule);
+                return;
+        }
+
+        /* Apply the schedule. The schedule was built bottom-up, so inserting
+         * each instruction at the head of the block reverses it back into
+         * program order, ahead of the branches left in place. */
+        for (unsigned i = 0; i < nr_ins; ++i) {
+                bi_remove_instruction(schedule[i]->instr);
+                list_add(&schedule[i]->instr->link, &block->instructions);
+        }
+
+        free(schedule);
+}
+
+void
+bi_pressure_schedule(bi_context *ctx)
+{
+        bi_compute_liveness(ctx);
+        unsigned temp_count = bi_max_temp(ctx);
+        void *memctx = ralloc_context(ctx);
+        uint8_t *live = ralloc_array(memctx, uint8_t, temp_count);
+
+        bi_foreach_block(ctx, block) {
+                struct sched_ctx sctx = {
+                        .dag = create_dag(ctx, block, memctx),
+                        .max = temp_count,
+                        .live = live
+                };
+
+                pressure_schedule_block(ctx, block, &sctx);
+        }
+
+        ralloc_free(memctx);
+}
diff --git a/src/panfrost/bifrost/bifrost.h b/src/panfrost/bifrost/bifrost.h
index d80844e012f..9d95de5622d 100644
--- a/src/panfrost/bifrost/bifrost.h
+++ b/src/panfrost/bifrost/bifrost.h
@@ -48,6 +48,7 @@ extern "C" {
 #define BIFROST_DBG_NOSB        0x0400
 #define BIFROST_DBG_NOPRELOAD   0x0800
 #define BIFROST_DBG_SPILL       0x1000
+#define BIFROST_DBG_NOPSCHED    0x2000
 
 extern int bifrost_debug;
 
diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c
index 139770cec2e..95a47d51abc 100644
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -28,6 +28,7 @@
 #include "compiler/glsl/glsl_to_nir.h"
 #include "compiler/nir_types.h"
 #include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_schedule.h"
 #include "util/u_debug.h"
 
 #include "disassemble.h"
@@ -47,6 +48,7 @@ static const struct debug_named_value bifrost_debug_options[] = {
         {"verbose",   BIFROST_DBG_VERBOSE,    "Disassemble verbosely"},
         {"internal",  BIFROST_DBG_INTERNAL,   "Dump even internal shaders"},
         {"nosched",   BIFROST_DBG_NOSCHED,    "Force trivial bundling"},
+        {"nopsched",  BIFROST_DBG_NOPSCHED,   "Disable scheduling for pressure"},
         {"inorder",   BIFROST_DBG_INORDER,    "Force in-order bundling"},
         {"novalidate",BIFROST_DBG_NOVALIDATE, "Skip IR validation"},
         {"noopt",     BIFROST_DBG_NOOPT,      "Skip optimization passes"},
@@ -5000,6 +5002,9 @@ bi_compile_variant_nir(nir_shader *nir,
                 bi_opt_fuse_dual_texture(ctx);
         }
 
+        if (likely(!(bifrost_debug & BIFROST_DBG_NOPSCHED)))
+                bi_pressure_schedule(ctx);
+
         bi_validate(ctx, "Late lowering");
 
         bi_register_allocate(ctx);
diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h
index 035fe21f682..bfdd27c4d81 100644
--- a/src/panfrost/bifrost/compiler.h
+++ b/src/panfrost/bifrost/compiler.h
@@ -1062,6 +1062,7 @@ void va_lower_split_64bit(bi_context *ctx);
 
 void bi_lower_opt_instruction(bi_instr *I);
 
+void bi_pressure_schedule(bi_context *ctx);
 void bi_schedule(bi_context *ctx);
 bool bi_can_fma(bi_instr *ins);
 bool bi_can_add(bi_instr *ins);
diff --git a/src/panfrost/bifrost/meson.build b/src/panfrost/bifrost/meson.build
index b04185080c6..f0d5a074919 100644
--- a/src/panfrost/bifrost/meson.build
+++ b/src/panfrost/bifrost/meson.build
@@ -38,6 +38,7 @@ libpanfrost_bifrost_files = files(
   'bi_opt_message_preload.c',
   'bi_opt_mod_props.c',
   'bi_opt_dual_tex.c',
+  'bi_pressure_schedule.c',
   'bi_pack.c',
   'bi_ra.c',
   'bi_schedule.c',
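
Usage note (illustrative, not part of the patch): the pass runs by default
before register allocation. Assuming bifrost_debug is driven by the usual
BIFROST_MESA_DEBUG environment variable in mainline Mesa, the new flag allows
before/after comparisons, e.g.

    BIFROST_MESA_DEBUG=nopsched,shaderdb my_workload

where my_workload is a placeholder for whatever application or shader-db run
is being measured.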