ir3/postsched: Handle sync dependencies better
We want to model soft dependencies, but because there's only a single bit to wait on all of them, there may be unnecessary delays inserted when a (sy)-consumer follows an unrelated (sy)-producer. Previously there was some code to try to work around this, but we can just model it directly using the sfu_delay and tex_delay cycle counts that we have to maintain anyway, and delete the workaround. This also gets rid of the calls to ir3_delay_calc_postra with soft=true, which would be more complicated to handle in the next commit. There is a functional change here: the idea of preferring fewer nops over critical path length (max_delay) up to 3 nops is kept (and we delete the TODO which is already sort-of resolved by it), but delays due to (ss)/(sy) and nops are now treated equally, rather than always preferring nops over syncs. So if our estimate indicates that scheduling an (ss) consumer will result in a wait of one cycle and there's another instruction that will require one nop, we will treat them as otherwise equal and choose based on max_delay instead. This results in more sstall, but the decrease in nops is much greater. total nops in shared programs: 376613 -> 345482 (-8.27%) nops in affected programs: 275483 -> 244352 (-11.30%) helped: 3226 HURT: 110 helped stats (abs) min: 1 max: 78 x̄: 9.73 x̃: 7 helped stats (rel) min: 0.19% max: 100.00% x̄: 19.48% x̃: 13.68% HURT stats (abs) min: 1 max: 16 x̄: 2.43 x̃: 2 HURT stats (rel) min: 0.00% max: 150.00% x̄: 13.34% x̃: 4.36% 95% mean confidence interval for nops value: -9.61 -9.06 95% mean confidence interval for nops %-change: -19.01% -17.78% Nops are helped.
total sstall in shared programs: 126195 -> 133806 (6.03%) sstall in affected programs: 79440 -> 87051 (9.58%) helped: 300 HURT: 1922 helped stats (abs) min: 1 max: 15 x̄: 4.72 x̃: 4 helped stats (rel) min: 1.05% max: 100.00% x̄: 17.15% x̃: 14.55% HURT stats (abs) min: 1 max: 29 x̄: 4.70 x̃: 4 HURT stats (rel) min: 0.00% max: 900.00% x̄: 25.38% x̃: 10.53% 95% mean confidence interval for sstall value: 3.22 3.63 95% mean confidence interval for sstall %-change: 17.50% 21.78% Sstall are HURT. total (ss) in shared programs: 35190 -> 35472 (0.80%) (ss) in affected programs: 6433 -> 6715 (4.38%) helped: 163 HURT: 401 helped stats (abs) min: 1 max: 2 x̄: 1.06 x̃: 1 helped stats (rel) min: 1.92% max: 33.33% x̄: 11.53% x̃: 10.00% HURT stats (abs) min: 1 max: 3 x̄: 1.13 x̃: 1 HURT stats (rel) min: 1.56% max: 100.00% x̄: 15.33% x̃: 12.50% 95% mean confidence interval for (ss) value: 0.41 0.59 95% mean confidence interval for (ss) %-change: 6.22% 8.93% (ss) are HURT. total (sy) in shared programs: 13476 -> 13521 (0.33%) (sy) in affected programs: 669 -> 714 (6.73%) helped: 30 HURT: 78 helped stats (abs) min: 1 max: 2 x̄: 1.13 x̃: 1 helped stats (rel) min: 4.00% max: 50.00% x̄: 21.22% x̃: 21.11% HURT stats (abs) min: 1 max: 2 x̄: 1.01 x̃: 1 HURT stats (rel) min: 3.45% max: 100.00% x̄: 31.93% x̃: 25.00% 95% mean confidence interval for (sy) value: 0.23 0.60 95% mean confidence interval for (sy) %-change: 11.19% 23.15% (sy) are HURT. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13722>
This commit is contained in:
parent
b9f61d7287
commit
a54e7baa65
|
@ -154,25 +154,26 @@ dump_state(struct ir3_postsched_ctx *ctx)
|
|||
}
|
||||
}
|
||||
|
||||
/* Determine if this is an instruction that we'd prefer not to schedule
|
||||
* yet, in order to avoid an (ss) sync. This is limited by the sfu_delay
|
||||
* counter, ie. the more cycles it has been since the last SFU, the less
|
||||
* costly a sync would be.
|
||||
*/
|
||||
static bool
|
||||
would_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
|
||||
static unsigned
|
||||
node_delay(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
|
||||
{
|
||||
if (ctx->sfu_delay) {
|
||||
if (has_sfu_src(instr))
|
||||
return true;
|
||||
}
|
||||
return ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
|
||||
}
|
||||
|
||||
if (ctx->tex_delay) {
|
||||
if (has_tex_src(instr))
|
||||
return true;
|
||||
}
|
||||
static unsigned
|
||||
node_delay_soft(struct ir3_postsched_ctx *ctx, struct ir3_postsched_node *n)
|
||||
{
|
||||
unsigned delay = node_delay(ctx, n);
|
||||
|
||||
return false;
|
||||
/* This takes into account that when we schedule multiple tex or sfu, the
|
||||
* first user has to wait for all of them to complete.
|
||||
*/
|
||||
if (n->has_sfu_src)
|
||||
delay = MAX2(delay, ctx->sfu_delay);
|
||||
if (n->has_tex_src)
|
||||
delay = MAX2(delay, ctx->tex_delay);
|
||||
|
||||
return delay;
|
||||
}
|
||||
|
||||
/* find instruction to schedule: */
|
||||
|
@ -215,8 +216,7 @@ choose_instr(struct ir3_postsched_ctx *ctx)
|
|||
|
||||
/* Next prioritize discards: */
|
||||
foreach_sched_node (n, &ctx->dag->heads) {
|
||||
unsigned d =
|
||||
ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
|
||||
unsigned d = node_delay(ctx, n);
|
||||
|
||||
if (d > 0)
|
||||
continue;
|
||||
|
@ -235,8 +235,7 @@ choose_instr(struct ir3_postsched_ctx *ctx)
|
|||
|
||||
/* Next prioritize expensive instructions: */
|
||||
foreach_sched_node (n, &ctx->dag->heads) {
|
||||
unsigned d =
|
||||
ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
|
||||
unsigned d = node_delay(ctx, n);
|
||||
|
||||
if (d > 0)
|
||||
continue;
|
||||
|
@ -253,49 +252,32 @@ choose_instr(struct ir3_postsched_ctx *ctx)
|
|||
return chosen->instr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sometimes be better to take a nop, rather than scheduling an
|
||||
* instruction that would require an (ss) shortly after another
|
||||
* SFU.. ie. if last SFU was just one or two instr ago, and we
|
||||
* could choose between taking a nop and then scheduling
|
||||
* something else, vs scheduling the immed avail instruction that
|
||||
* would require (ss), we are better with the nop.
|
||||
*/
|
||||
for (unsigned delay = 0; delay < 4; delay++) {
|
||||
foreach_sched_node (n, &ctx->dag->heads) {
|
||||
if (would_sync(ctx, n->instr))
|
||||
continue;
|
||||
|
||||
unsigned d = ir3_delay_calc_postra(ctx->block, n->instr, true,
|
||||
ctx->v->mergedregs);
|
||||
|
||||
if (d > delay)
|
||||
continue;
|
||||
|
||||
if (!chosen || (chosen->max_delay < n->max_delay))
|
||||
chosen = n;
|
||||
}
|
||||
|
||||
if (chosen) {
|
||||
di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
|
||||
return chosen->instr;
|
||||
}
|
||||
}
|
||||
|
||||
/* Next try to find a ready leader w/ soft delay (ie. including extra
|
||||
* delay for things like tex fetch which can be synchronized w/ sync
|
||||
* bit (but we probably do want to schedule some other instructions
|
||||
* while we wait)
|
||||
* while we wait). We also allow a small amount of nops, to prefer now-nops
|
||||
* over future-nops up to a point, as that gives better results.
|
||||
*/
|
||||
unsigned chosen_delay = 0;
|
||||
foreach_sched_node (n, &ctx->dag->heads) {
|
||||
unsigned d =
|
||||
ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);
|
||||
unsigned d = node_delay_soft(ctx, n);
|
||||
|
||||
if (d > 0)
|
||||
if (d > 3)
|
||||
continue;
|
||||
|
||||
if (!chosen || (chosen->max_delay < n->max_delay))
|
||||
if (!chosen || d < chosen_delay) {
|
||||
chosen = n;
|
||||
chosen_delay = d;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (d > chosen_delay)
|
||||
continue;
|
||||
|
||||
if (chosen->max_delay < n->max_delay) {
|
||||
chosen = n;
|
||||
chosen_delay = d;
|
||||
}
|
||||
}
|
||||
|
||||
if (chosen) {
|
||||
|
@ -308,8 +290,7 @@ choose_instr(struct ir3_postsched_ctx *ctx)
|
|||
* stalls.. but we've already decided there is not a better option.
|
||||
*/
|
||||
foreach_sched_node (n, &ctx->dag->heads) {
|
||||
unsigned d =
|
||||
ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
|
||||
unsigned d = node_delay(ctx, n);
|
||||
|
||||
if (d > 0)
|
||||
continue;
|
||||
|
@ -324,9 +305,6 @@ choose_instr(struct ir3_postsched_ctx *ctx)
|
|||
}
|
||||
|
||||
/* Otherwise choose leader with maximum cost:
|
||||
*
|
||||
* TODO should we try to balance cost and delays? I guess it is
|
||||
* a balance between now-nop's and future-nop's?
|
||||
*/
|
||||
foreach_sched_node (n, &ctx->dag->heads) {
|
||||
if (!chosen || chosen->max_delay < n->max_delay)
|
||||
|
|
Loading…
Reference in New Issue