ir3: Rewrite nop insertion

Don't try to chase across blocks to find a matching destination for a
given source. That approach is prone to exponential blowup when there
is a complicated series of if-ladders and we have to crawl through
every possible path; with scalar ALU it was causing timeouts on one
test once we stopped counting scalar ALU instructions. Rather than
adding yet more band-aids, switch to the approach most other backends
use: keep a scoreboard of outstanding registers and track the cycle at
which each register becomes "ready". This integrates nicely into the
pre-existing ir3 legalize infrastructure for (ss) and (sy), although it
does require duplicating the logic of ir3_delayslots() in a different
form.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28750>
Connor Abbott 2024-04-11 16:26:26 -04:00 committed by Marge Bot
parent 9df3323564
commit 61b2bd861f
4 changed files with 320 additions and 221 deletions
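
To make the new scheme concrete before the diff itself, here is a minimal
sketch of the ready-cycle scoreboard described in the commit message
(hypothetical, simplified names; the real code in the ir3_legalize changes
below additionally keeps separate tables per consumer type and per
half/full register):

/* Simplified sketch of the scoreboard approach (hypothetical names):
 * one entry per register holding the cycle at which that register can
 * be read without inserting any more nops.
 */
struct scoreboard {
   unsigned ready[4 * 48];
};

/* Number of nops that must precede an instruction issued at "cycle"
 * which reads the registers in srcs[].
 */
static unsigned
needed_delay(const struct scoreboard *sb, const unsigned *srcs,
             unsigned nsrcs, unsigned cycle)
{
   unsigned delay = 0;
   for (unsigned i = 0; i < nsrcs; i++) {
      if (sb->ready[srcs[i]] > cycle + delay)
         delay = sb->ready[srcs[i]] - cycle;
   }
   return delay;
}

/* Record that "dst" is written at "cycle" and becomes readable
 * "latency" cycles later.
 */
static void
record_write(struct scoreboard *sb, unsigned dst, unsigned cycle,
             unsigned latency)
{
   if (sb->ready[dst] < cycle + latency)
      sb->ready[dst] = cycle + latency;
}

Because each block is walked once and the ready state is merged from
predecessors at block boundaries, the cost is linear in the number of
instructions rather than in the number of control-flow paths.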

@@ -1288,6 +1288,26 @@ reg_size(const struct ir3_register *reg)
return reg_elems(reg) * reg_elem_size(reg);
}
/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
* and have to handle relative accesses specially.
*/
static inline unsigned
post_ra_reg_elems(struct ir3_register *reg)
{
if (reg->flags & IR3_REG_RELATIV)
return reg->size;
return reg_elems(reg);
}
static inline unsigned
post_ra_reg_num(struct ir3_register *reg)
{
if (reg->flags & IR3_REG_RELATIV)
return reg->array.base;
return reg->num;
}
static inline unsigned
dest_regs(struct ir3_instruction *instr)
{
@@ -1871,8 +1891,6 @@ int ir3_delayslots(struct ir3_instruction *assigner,
unsigned ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
struct ir3_instruction *consumer,
unsigned assigner_n, unsigned consumer_n);
unsigned ir3_delay_calc(struct ir3_block *block,
struct ir3_instruction *instr, bool mergedregs);
/* estimated (ss)/(sy) delay calculation */

@@ -95,38 +95,6 @@ ir3_delayslots(struct ir3_instruction *assigner,
}
}
static bool
count_instruction(struct ir3_instruction *n)
{
/* NOTE: don't count branch/jump since we don't know yet if they will
* be eliminated later in resolve_jumps().. really should do that
* earlier so we don't have this constraint.
*/
return is_alu(n) ||
(is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
(n->opc != OPC_BRAA) && (n->opc != OPC_BRAO));
}
/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
* and have to handle relative accesses specially.
*/
static unsigned
post_ra_reg_elems(struct ir3_register *reg)
{
if (reg->flags & IR3_REG_RELATIV)
return reg->size;
return reg_elems(reg);
}
static unsigned
post_ra_reg_num(struct ir3_register *reg)
{
if (reg->flags & IR3_REG_RELATIV)
return reg->array.base;
return reg->num;
}
unsigned
ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
struct ir3_instruction *consumer,
@@ -211,128 +179,3 @@ ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
return offset > delay ? 0 : delay - offset;
}
static unsigned
delay_calc_srcn(struct ir3_instruction *assigner,
struct ir3_instruction *consumer, unsigned assigner_n,
unsigned consumer_n, bool mergedregs)
{
struct ir3_register *src = consumer->srcs[consumer_n];
struct ir3_register *dst = assigner->dsts[assigner_n];
bool mismatched_half =
(src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
/* In the non-mergedregs case or when the register is a special register,
* half-registers do not alias with full registers.
*/
if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
mismatched_half)
return 0;
unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
if (dst_start >= src_end || src_start >= dst_end)
return 0;
return ir3_delayslots_with_repeat(assigner, consumer, assigner_n, consumer_n);
}
static unsigned
delay_calc(struct ir3_block *block, struct ir3_instruction *start,
struct ir3_instruction *consumer, unsigned distance,
regmask_t *in_mask, bool mergedregs)
{
regmask_t mask;
memcpy(&mask, in_mask, sizeof(mask));
unsigned delay = 0;
/* Search backwards starting at the instruction before start, unless it's
* NULL then search backwards from the block end.
*/
struct list_head *start_list =
start ? start->node.prev : block->instr_list.prev;
list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list,
&block->instr_list, node) {
if (count_instruction(assigner))
distance += assigner->nop;
if (distance + delay >= MAX_NOPS)
return delay;
if (is_meta(assigner))
continue;
unsigned new_delay = 0;
foreach_dst_n (dst, dst_n, assigner) {
if (dst->wrmask == 0)
continue;
if (!regmask_get(&mask, dst))
continue;
foreach_src_n (src, src_n, consumer) {
if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
continue;
unsigned src_delay = delay_calc_srcn(
assigner, consumer, dst_n, src_n, mergedregs);
new_delay = MAX2(new_delay, src_delay);
}
regmask_clear(&mask, dst);
}
new_delay = new_delay > distance ? new_delay - distance : 0;
delay = MAX2(delay, new_delay);
if (count_instruction(assigner))
distance += 1 + assigner->repeat;
}
/* Note: this allows recursion into "block" if it has already been
* visited, but *not* recursion into its predecessors. We may have to
* visit the original block twice, for the loop case where we have to
* consider definitions in an earlier iteration of the same loop:
*
* while (...) {
* mov.u32u32 ..., r0.x
* ...
* mov.u32u32 r0.x, ...
* }
*
* However any other recursion would be unnecessary.
*/
if (block->data != block) {
block->data = block;
for (unsigned i = 0; i < block->predecessors_count; i++) {
struct ir3_block *pred = block->predecessors[i];
unsigned pred_delay = delay_calc(pred, NULL, consumer, distance,
&mask, mergedregs);
delay = MAX2(delay, pred_delay);
}
block->data = NULL;
}
return delay;
}
/**
* Calculate delay for nop insertion. This must exactly match hardware
* requirements, including recursing into predecessor blocks.
*/
unsigned
ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
bool mergedregs)
{
regmask_t mask;
regmask_init(&mask, mergedregs);
foreach_src (src, instr) {
if (!(src->flags & (IR3_REG_IMMED | IR3_REG_CONST)))
regmask_set(&mask, src);
}
return delay_calc(block, NULL, instr, 0, &mask, mergedregs);
}
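
The recursive walk removed above is what the commit message calls prone to
exponential blowup: the block->data marker only prevents re-entry on the
current search path, so a chain of if/else joins can be revisited once per
distinct path. A rough, illustrative sketch of the worst case, assuming a
straight chain of k two-predecessor join blocks:

/* Worst-case number of distinct backwards search paths through a chain
 * of k two-predecessor join blocks (one per if/else "diamond"). The
 * count doubles at every join, i.e. 2^k. Illustrative only.
 */
static unsigned long long
worst_case_paths(unsigned k)
{
   unsigned long long paths = 1;
   for (unsigned i = 0; i < k; i++)
      paths *= 2;
   return paths;
}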

@@ -54,11 +54,37 @@ struct ir3_legalize_ctx {
bool has_inputs;
};
struct ir3_nop_state {
unsigned full_ready[4 * 48];
unsigned half_ready[4 * 48];
};
struct ir3_legalize_state {
regmask_t needs_ss;
regmask_t needs_ss_war; /* write after read */
regmask_t needs_sy;
bool needs_ss_for_const;
/* Each of these arrays contains the cycle when the corresponding register
* becomes "ready" i.e. does not require any more nops. There is a special
* mechanism to let ALU instructions read compatible (i.e. same halfness)
* destinations of another ALU instruction with less delay, so this can
* depend on what type the consuming instruction is, which is why there are
* multiple arrays. The cycle is counted relative to the start of the block.
*/
/* When ALU instructions reading the given full/half register will be ready.
*/
struct ir3_nop_state alu_nop;
/* When non-ALU (e.g. cat5) instructions reading the given full/half register
* will be ready.
*/
struct ir3_nop_state non_alu_nop;
/* When p0.x-w, a0.x, and a1.x are ready. */
unsigned pred_ready[4];
unsigned addr_ready[2];
};
struct ir3_legalize_block_data {
@@ -87,6 +113,177 @@ apply_sy(struct ir3_instruction *instr,
regmask_init(&state->needs_sy, mergedregs);
}
static bool
count_instruction(struct ir3_instruction *n)
{
/* NOTE: don't count branch/jump since we don't know yet if they will
* be eliminated later in resolve_jumps().. really should do that
* earlier so we don't have this constraint.
*/
return is_alu(n) ||
(is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
(n->opc != OPC_BRAA) && (n->opc != OPC_BRAO));
}
static unsigned *
get_ready_slot(struct ir3_legalize_state *state,
struct ir3_register *reg, unsigned num,
bool consumer_alu, bool matching_size)
{
if (reg->flags & IR3_REG_PREDICATE) {
assert(num == reg->num);
assert(reg_num(reg) == REG_P0);
return &state->pred_ready[reg_comp(reg)];
}
if (reg->num == regid(REG_A0, 0))
return &state->addr_ready[0];
if (reg->num == regid(REG_A0, 1))
return &state->addr_ready[1];
struct ir3_nop_state *nop =
consumer_alu ? &state->alu_nop : &state->non_alu_nop;
assert(!(reg->flags & IR3_REG_SHARED));
if (reg->flags & IR3_REG_HALF) {
if (matching_size)
return &nop->half_ready[num];
else
return &nop->full_ready[num / 2];
} else {
if (matching_size)
return &nop->full_ready[num];
/* If "num" is large enough, then it can't alias a half-reg because only
* the first half of the full reg space aliases half regs. Return NULL in
* this case.
*/
else if (num * 2 < ARRAY_SIZE(nop->half_ready))
return &nop->half_ready[num * 2];
else
return NULL;
}
}
static unsigned
delay_calc(struct ir3_legalize_state *state,
struct ir3_instruction *instr,
unsigned cycle)
{
/* As far as we know, shader outputs don't need any delay. */
if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
return 0;
unsigned delay = 0;
foreach_src_n (src, n, instr) {
if (src->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED))
continue;
unsigned elems = post_ra_reg_elems(src);
unsigned num = post_ra_reg_num(src);
unsigned src_cycle = cycle;
/* gat and swz have scalar sources and each source is read in a
* subsequent cycle.
*/
if (instr->opc == OPC_GAT || instr->opc == OPC_SWZ)
src_cycle += n;
/* cat3 instructions consume their last source two cycles later, so they
* only need a delay of 1.
*/
if ((is_mad(instr->opc) || is_madsh(instr->opc)) && n == 2)
src_cycle += 2;
for (unsigned elem = 0; elem < elems; elem++, num++) {
unsigned ready_cycle =
*get_ready_slot(state, src, num, is_alu(instr), true);
delay = MAX2(delay, MAX2(ready_cycle, src_cycle) - src_cycle);
/* Increment cycle for ALU instructions with (rptN) where sources are
* read each subsequent cycle.
*/
if (instr->repeat && !(src->flags & IR3_REG_RELATIV))
src_cycle++;
}
}
return delay;
}
static void
delay_update(struct ir3_legalize_state *state,
struct ir3_instruction *instr,
unsigned cycle,
bool mergedregs)
{
foreach_dst_n (dst, n, instr) {
unsigned elems = post_ra_reg_elems(dst);
unsigned num = post_ra_reg_num(dst);
unsigned dst_cycle = cycle;
/* sct and swz have scalar destinations and each destination is written in
* a subsequent cycle.
*/
if (instr->opc == OPC_SCT || instr->opc == OPC_SWZ)
dst_cycle += n;
/* For relative accesses with (rptN), we have no way of knowing which
* component is accessed when, so we have to assume the worst and mark
* every array member as being written at the end.
*/
if (dst->flags & IR3_REG_RELATIV)
dst_cycle += instr->repeat;
if (dst->flags & IR3_REG_SHARED)
continue;
for (unsigned elem = 0; elem < elems; elem++, num++) {
for (unsigned consumer_alu = 0; consumer_alu < 2; consumer_alu++) {
for (unsigned matching_size = 0; matching_size < 2; matching_size++) {
unsigned *ready_slot =
get_ready_slot(state, dst, num, consumer_alu, matching_size);
if (!ready_slot)
continue;
bool reset_ready_slot = false;
unsigned delay = 0;
if (!is_alu(instr)) {
/* Apparently writes that require (ss) or (sy) are
* synchronized against previous writes, so consumers don't
* have to wait for any previous overlapping ALU instructions
* to complete.
*/
reset_ready_slot = true;
} else if ((dst->flags & IR3_REG_PREDICATE) ||
reg_num(dst) == REG_A0) {
delay = 6;
if (!matching_size)
continue;
} else {
delay = (consumer_alu && matching_size) ? 3 : 6;
}
if (!matching_size) {
for (unsigned i = 0; i < reg_elem_size(dst); i++) {
ready_slot[i] =
reset_ready_slot ? 0 :
MAX2(ready_slot[i], dst_cycle + delay);
}
} else {
*ready_slot =
reset_ready_slot ? 0 :
MAX2(*ready_slot, dst_cycle + delay);
}
}
}
/* Increment cycle for ALU instructions with (rptN) where destinations
* are written each subsequent cycle.
*/
if (instr->repeat && !(dst->flags & IR3_REG_RELATIV))
dst_cycle++;
}
}
}
/* We want to evaluate each block from the position of any other
* predecessor block, in order that the flags set are the union of
* all possible program paths.
@@ -140,6 +337,21 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
&pstate->needs_ss_war);
regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
state->needs_ss_for_const |= pstate->needs_ss_for_const;
/* Our nop state is the max of the predecessor blocks */
for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
state->pred_ready[i] = MAX2(state->pred_ready[i],
pstate->pred_ready[i]);
for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
state->alu_nop.full_ready[i] = MAX2(state->alu_nop.full_ready[i],
pstate->alu_nop.full_ready[i]);
state->alu_nop.half_ready[i] = MAX2(state->alu_nop.half_ready[i],
pstate->alu_nop.half_ready[i]);
state->non_alu_nop.full_ready[i] = MAX2(state->non_alu_nop.full_ready[i],
pstate->non_alu_nop.full_ready[i]);
state->non_alu_nop.half_ready[i] = MAX2(state->non_alu_nop.half_ready[i],
pstate->non_alu_nop.half_ready[i]);
}
}
/* We need to take physical-only edges into account when tracking shared
@@ -178,6 +390,8 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
list_replace(&block->instr_list, &instr_list);
list_inithead(&block->instr_list);
unsigned cycle = 0;
foreach_instr_safe (n, &instr_list) {
unsigned i;
@@ -257,11 +471,40 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
nop = ir3_NOP(block);
nop->flags |= IR3_INSTR_SS;
n->flags &= ~IR3_INSTR_SS;
last_n = nop;
cycle++;
}
/* need to be able to set (ss) on first instruction: */
if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5) && !is_meta(n))
ir3_NOP(block);
unsigned delay = delay_calc(state, n, cycle);
/* NOTE: I think the nopN encoding works for a5xx and
* probably a4xx, but not a3xx. So far only tested on
* a6xx.
*/
if ((delay > 0) && (ctx->compiler->gen >= 6) && last_n &&
((opc_cat(last_n->opc) == 2) || (opc_cat(last_n->opc) == 3)) &&
(last_n->repeat == 0)) {
/* the previous cat2/cat3 instruction can encode at most 3 nop's: */
unsigned transfer = MIN2(delay, 3 - last_n->nop);
last_n->nop += transfer;
delay -= transfer;
cycle += transfer;
}
if ((delay > 0) && last_n && (last_n->opc == OPC_NOP)) {
/* the previous nop can encode at most 5 repeats: */
unsigned transfer = MIN2(delay, 5 - last_n->repeat);
last_n->repeat += transfer;
delay -= transfer;
cycle += transfer;
}
if (delay > 0) {
assert(delay <= 6);
ir3_NOP(block)->repeat = delay - 1;
cycle += delay;
}
if (ctx->compiler->samgq_workaround &&
ctx->type != MESA_SHADER_FRAGMENT &&
@@ -328,6 +571,14 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
}
}
if (count_instruction(n))
cycle += 1;
delay_update(state, n, cycle, mergedregs);
if (count_instruction(n))
cycle += n->repeat;
if (ctx->early_input_release && is_input(n)) {
last_input_needs_ss |= (n->opc == OPC_LDLV);
@@ -384,6 +635,24 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
list_add(&baryf->node, &block->instr_list);
}
/* Currently our nop state contains the cycle offset from the start of this
* block when each register becomes ready. But successor blocks need the
* cycle offset from their start, which is this block's end. Translate the
* cycle offset.
*/
for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
state->pred_ready[i] = MAX2(state->pred_ready[i], cycle) - cycle;
for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
state->alu_nop.full_ready[i] =
MAX2(state->alu_nop.full_ready[i], cycle) - cycle;
state->alu_nop.half_ready[i] =
MAX2(state->alu_nop.half_ready[i], cycle) - cycle;
state->non_alu_nop.full_ready[i] =
MAX2(state->non_alu_nop.full_ready[i], cycle) - cycle;
state->non_alu_nop.half_ready[i] =
MAX2(state->non_alu_nop.half_ready[i], cycle) - cycle;
}
bd->valid = true;
if (memcmp(&prev_state, state, sizeof(*state))) {
@@ -407,8 +676,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
* dsxpp.1.p dst, src
*
* We apply this after flags syncing, as we don't want to sync in between the
* two (which might happen if dst == src). We do it before nop scheduling
* because that needs to count actual instructions.
* two (which might happen if dst == src).
*/
static bool
apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
@@ -865,55 +1133,6 @@ kill_sched(struct ir3 *ir, struct ir3_shader_variant *so)
}
}
/* Insert nop's required to make this a legal/valid shader program: */
static void
nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
{
foreach_block (block, &ir->block_list) {
struct ir3_instruction *last = NULL;
struct list_head instr_list;
/* remove all the instructions from the list, we'll be adding
* them back in as we go
*/
list_replace(&block->instr_list, &instr_list);
list_inithead(&block->instr_list);
foreach_instr_safe (instr, &instr_list) {
unsigned delay = ir3_delay_calc(block, instr, so->mergedregs);
/* NOTE: I think the nopN encoding works for a5xx and
* probably a4xx, but not a3xx. So far only tested on
* a6xx.
*/
if ((delay > 0) && (ir->compiler->gen >= 6) && last &&
((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3)) &&
(last->repeat == 0)) {
/* the previous cat2/cat3 instruction can encode at most 3 nop's: */
unsigned transfer = MIN2(delay, 3 - last->nop);
last->nop += transfer;
delay -= transfer;
}
if ((delay > 0) && last && (last->opc == OPC_NOP)) {
/* the previous nop can encode at most 5 repeats: */
unsigned transfer = MIN2(delay, 5 - last->repeat);
last->repeat += transfer;
delay -= transfer;
}
if (delay > 0) {
assert(delay <= 6);
ir3_NOP(block)->repeat = delay - 1;
}
list_addtail(&instr->node, &block->instr_list);
last = instr;
}
}
}
static void
dbg_sync_sched(struct ir3 *ir, struct ir3_shader_variant *so)
{
@@ -1227,8 +1446,6 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
progress |= apply_fine_deriv_macro(ctx, block);
}
nop_sched(ir, so);
if (ir3_shader_debug & IR3_DBG_FULLSYNC) {
dbg_sync_sched(ir, so);
}
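
As a hand-worked illustration of the encoding logic in legalize_block()
above (not part of the commit, and not standalone code; it just replays the
diff's folding steps with concrete numbers): suppose delay_calc() reports a
delay of 5 cycles and the previously emitted instruction "last_n" is a cat2
with repeat == 0.

unsigned delay = 5;

/* Up to 3 cycles fit into the preceding cat2/cat3 instruction's nop
 * field (a6xx, and likely a4xx/a5xx per the note above).
 */
unsigned transfer = MIN2(delay, 3 - last_n->nop);   /* transfer = 3 */
last_n->nop += transfer;                            /* nop field: 0 -> 3 */
delay -= transfer;                                  /* delay: 5 -> 2 */
cycle += transfer;

/* The remainder becomes an explicit nop; (rptN) makes one nop cover
 * N + 1 cycles, so repeat = delay - 1.
 */
if (delay > 0) {
   assert(delay <= 6);
   ir3_NOP(block)->repeat = delay - 1;              /* repeat = 1, 2 cycles */
   cycle += delay;
}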

@@ -145,6 +145,30 @@ fixup_wrmask(struct ir3 *ir)
}
}
/* Calculate the number of nops added before the last instruction by
* ir3_legalize.
*/
static unsigned
calc_nops(struct ir3_block *block, struct ir3_instruction *last)
{
unsigned nops = 0;
foreach_instr_rev (instr, &block->instr_list) {
if (instr == last)
continue;
if (instr->opc == OPC_NOP) {
nops += 1 + instr->repeat;
} else {
if (is_alu(instr))
nops += instr->nop;
break;
}
}
return nops;
}
int
main(int argc, char **argv)
{
@@ -177,13 +201,10 @@ main(int argc, char **argv)
break;
}
/* The delay calc is expecting the instr to not yet be added to the
* block, so remove it from the block so that it doesn't get counted
* in the distance from assigner:
*/
list_delinit(&last->node);
int max_bary;
ir3_legalize(ir, shader->variants, &max_bary);
unsigned n = ir3_delay_calc(block, last, true);
unsigned n = calc_nops(block, last);
if (n != test->expected_delay) {
printf("%d: FAIL: Expected delay %u, but got %u, for:\n%s\n", i,