broadcom/compiler: try to fill up delay slots after a branch instruction
For this we do something similar to what we do with thrsw, where we try to
move the branch instruction earlier so that the previous instructions execute
in the delay slots of the branch.

Generally, we can do this with any instruction except:
 - If the instruction reads a uniform, since our branches do as well and
   uniforms come from an ordered FIFO stream.
 - If the instruction writes flags, since our branch instruction will
   probably read them.
 - If the instruction is in the delay slots of another thread switch,
   branch, or unifa write, which is disallowed.

total instructions in shared programs: 13648140 -> 13613972 (-0.25%)
instructions in affected programs: 2209552 -> 2175384 (-1.55%)
helped: 6765
HURT: 0
Instructions are helped.

total max-temps in shared programs: 2318687 -> 2318436 (-0.01%)
max-temps in affected programs: 5046 -> 4795 (-4.97%)
helped: 152
HURT: 0
Max-temps are helped.

total inst-and-stalls in shared programs: 13680494 -> 13646326 (-0.25%)
inst-and-stalls in affected programs: 2220394 -> 2186226 (-1.54%)
helped: 6765
HURT: 0
Inst-and-stalls are helped.

total nops in shared programs: 399818 -> 365640 (-8.55%)
nops in affected programs: 127311 -> 93133 (-26.85%)
helped: 6765
HURT: 0
Nops are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9918>
This commit is contained in:
parent
f33ca092da
commit
e266e6c634
|
@ -490,6 +490,7 @@ struct choose_scoreboard {
|
||||||
int last_unifa_write_tick;
|
int last_unifa_write_tick;
|
||||||
int last_uniforms_reset_tick;
|
int last_uniforms_reset_tick;
|
||||||
int last_thrsw_tick;
|
int last_thrsw_tick;
|
||||||
|
int last_branch_tick;
|
||||||
bool tlb_locked;
|
bool tlb_locked;
|
||||||
bool fixup_ldvary;
|
bool fixup_ldvary;
|
||||||
int ldvary_count;
|
int ldvary_count;
|
||||||
|
@ -1078,6 +1079,16 @@ retry:
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Don't try to put a branch in the delay slots of another
|
||||||
|
* branch or a unifa write.
|
||||||
|
*/
|
||||||
|
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
|
||||||
|
if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
|
||||||
|
continue;
|
||||||
|
if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
/* If we're trying to pair with another instruction, check
|
/* If we're trying to pair with another instruction, check
|
||||||
* that they're compatible.
|
* that they're compatible.
|
||||||
*/
|
*/
|
||||||
|
@ -1674,11 +1685,17 @@ emit_thrsw(struct v3d_compile *c,
|
||||||
assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
|
assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
|
||||||
assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
|
assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
|
||||||
|
|
||||||
/* Don't try to emit a thrsw in the delay slots of a previous thrsw */
|
/* Don't try to emit a thrsw in the delay slots of a previous thrsw
|
||||||
|
* or branch.
|
||||||
|
*/
|
||||||
while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
|
while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
|
||||||
emit_nop(c, block, scoreboard);
|
emit_nop(c, block, scoreboard);
|
||||||
time++;
|
time++;
|
||||||
}
|
}
|
||||||
|
while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
|
||||||
|
emit_nop(c, block, scoreboard);
|
||||||
|
time++;
|
||||||
|
}
|
||||||
|
|
||||||
/* Find how far back into previous instructions we can put the THRSW. */
|
/* Find how far back into previous instructions we can put the THRSW. */
|
||||||
int slots_filled = 0;
|
int slots_filled = 0;
|
||||||
|
@ -1745,6 +1762,97 @@ emit_thrsw(struct v3d_compile *c,
|
||||||
return time;
|
return time;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Returns true if `inst` is allowed to execute in a delay slot of a branch,
 * false otherwise.
 *
 * Rejected are: branch instructions, instructions with the thrsw signal set,
 * instructions that write unifa, and instructions that have a uniform.
 *
 * c:    compile context; only c->devinfo is read here (for the unifa check).
 * inst: candidate instruction; it is only inspected, never modified.
 */
static bool
|
||||||
|
qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
|
||||||
|
{
|
||||||
|
/* A branch may not sit in the delay slots of another branch. */
if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* Nor may a thread switch. */
if (inst->qpu.sig.thrsw)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* Nor may a unifa write. */
if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* Per the commit message, branches read a uniform too and uniforms come
 * from an ordered FIFO stream, so an instruction with a uniform must not
 * be reordered with respect to the branch.
 */
if (vir_has_uniform(inst))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Emits the branch instruction `inst` into `block` and fills its 3 delay
 * slots.
 *
 * The branch is first inserted at the current scoreboard tick. It is then
 * moved back over up to 3 of the instructions that precede it in the block,
 * so that those instructions end up in its delay slots. Moving back stops as
 * soon as the branch would land in the delay slots of an earlier branch,
 * thrsw or unifa write, would pass the flags write a conditional branch
 * depends on, or would pass an instruction rejected by
 * qpu_inst_valid_in_branch_delay_slot(). Delay slots that could not be
 * filled are padded with NOPs.
 *
 * Side effects: updates block->branch_qpu_ip and
 * scoreboard->last_branch_tick, and reorders block->instructions.
 */
static void
|
||||||
|
emit_branch(struct v3d_compile *c,
|
||||||
|
struct qblock *block,
|
||||||
|
struct choose_scoreboard *scoreboard,
|
||||||
|
struct qinst *inst)
|
||||||
|
{
|
||||||
|
assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
|
||||||
|
|
||||||
|
/* We should not have picked up a branch for the delay slots of a previous
|
||||||
|
* thrsw, branch or unifa write instruction.
|
||||||
|
*/
|
||||||
|
int branch_tick = scoreboard->tick;
|
||||||
|
assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
|
||||||
|
assert(scoreboard->last_branch_tick + 3 < branch_tick);
|
||||||
|
assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
|
||||||
|
|
||||||
|
/* Insert the branch instruction */
|
||||||
|
insert_scheduled_instruction(c, block, scoreboard, inst);
|
||||||
|
|
||||||
|
/* Now see if we can move the branch instruction back into the
|
||||||
|
* instruction stream to fill its delay slots
|
||||||
|
*/
|
||||||
|
int slots_filled = 0;
|
||||||
|
/* Stop after 3 slots, or once the branch is the first instruction in the
 * block and there is nothing left to move it over.
 */
while (slots_filled < 3 && block->instructions.next != &inst->link) {
|
||||||
|
/* NOTE(review): the cast assumes `link` is the first member of
 * struct qinst; confirm against the struct definition.
 */
struct qinst *prev_inst = (struct qinst *) inst->link.prev;
|
||||||
|
assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);
|
||||||
|
|
||||||
|
/* Can't move the branch instruction if that would place it
|
||||||
|
* in the delay slots of other instructions.
|
||||||
|
*
 * (branch_tick - slots_filled - 1) is the tick the branch would have
 * after moving back over one more instruction.
 */
|
||||||
|
if (scoreboard->last_branch_tick + 3 >=
|
||||||
|
branch_tick - slots_filled - 1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (scoreboard->last_thrsw_tick + 2 >=
|
||||||
|
branch_tick - slots_filled - 1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (scoreboard->last_unifa_write_tick + 3 >=
|
||||||
|
branch_tick - slots_filled - 1) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Can't move a conditional branch before the instruction
|
||||||
|
* that writes the flags for its condition.
|
||||||
|
*/
|
||||||
|
if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
|
||||||
|
inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* Unlink prev_inst and re-link it relative to the branch (presumably
 * right after it; confirm list_add semantics), so that it now occupies
 * one of the branch's delay slots.
 */
list_del(&prev_inst->link);
|
||||||
|
list_add(&prev_inst->link, &inst->link);
|
||||||
|
slots_filled++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The branch now sits slots_filled instructions earlier than where it was
 * inserted; record its instruction index and tick accordingly.
 * NOTE(review): assumes c->qpu_inst_count - 1 was the branch's index right
 * after insertion; confirm.
 */
block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
|
||||||
|
scoreboard->last_branch_tick = branch_tick - slots_filled;
|
||||||
|
|
||||||
|
/* Fill any remaining delay slots.
|
||||||
|
*
|
||||||
|
* FIXME: For unconditional branches we could fill these with the
|
||||||
|
* first instructions in the successor block.
|
||||||
|
*/
|
||||||
|
for (int i = 0; i < 3 - slots_filled; i++)
|
||||||
|
emit_nop(c, block, scoreboard);
|
||||||
|
}
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
alu_reads_register(struct v3d_qpu_instr *inst,
|
alu_reads_register(struct v3d_qpu_instr *inst,
|
||||||
bool add, bool magic, uint32_t index)
|
bool add, bool magic, uint32_t index)
|
||||||
|
@ -2025,23 +2133,11 @@ schedule_instructions(struct v3d_compile *c,
|
||||||
|
|
||||||
if (inst->sig.thrsw) {
|
if (inst->sig.thrsw) {
|
||||||
time += emit_thrsw(c, block, scoreboard, qinst, false);
|
time += emit_thrsw(c, block, scoreboard, qinst, false);
|
||||||
|
} else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
|
||||||
|
emit_branch(c, block, scoreboard, qinst);
|
||||||
} else {
|
} else {
|
||||||
insert_scheduled_instruction(c, block,
|
insert_scheduled_instruction(c, block,
|
||||||
scoreboard, qinst);
|
scoreboard, qinst);
|
||||||
|
|
||||||
if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
|
|
||||||
block->branch_qpu_ip = c->qpu_inst_count - 1;
|
|
||||||
/* Fill the delay slots.
|
|
||||||
*
|
|
||||||
* We should fill these with actual instructions,
|
|
||||||
* instead, but that will probably need to be done
|
|
||||||
* after this, once we know what the leading
|
|
||||||
* instructions of the successors are (so we can
|
|
||||||
* handle A/B register file write latency)
|
|
||||||
*/
|
|
||||||
for (int i = 0; i < 3; i++)
|
|
||||||
emit_nop(c, block, scoreboard);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2111,11 +2207,15 @@ qpu_set_branch_targets(struct v3d_compile *c)
|
||||||
/* Walk back through the delay slots to find the branch
|
/* Walk back through the delay slots to find the branch
|
||||||
* instr.
|
* instr.
|
||||||
*/
|
*/
|
||||||
|
struct qinst *branch = NULL;
|
||||||
struct list_head *entry = block->instructions.prev;
|
struct list_head *entry = block->instructions.prev;
|
||||||
for (int i = 0; i < 3; i++)
|
for (int i = 0; i < 3; i++) {
|
||||||
entry = entry->prev;
|
entry = entry->prev;
|
||||||
struct qinst *branch = container_of(entry, struct qinst, link);
|
branch = container_of(entry, struct qinst, link);
|
||||||
assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
|
if (branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
|
||||||
|
|
||||||
/* Make sure that the if-we-don't-jump
|
/* Make sure that the if-we-don't-jump
|
||||||
* successor was scheduled just after the
|
* successor was scheduled just after the
|
||||||
|
@ -2169,6 +2269,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
|
||||||
scoreboard.last_magic_sfu_write_tick = -10;
|
scoreboard.last_magic_sfu_write_tick = -10;
|
||||||
scoreboard.last_uniforms_reset_tick = -10;
|
scoreboard.last_uniforms_reset_tick = -10;
|
||||||
scoreboard.last_thrsw_tick = -10;
|
scoreboard.last_thrsw_tick = -10;
|
||||||
|
scoreboard.last_branch_tick = -10;
|
||||||
scoreboard.last_stallable_sfu_tick = -10;
|
scoreboard.last_stallable_sfu_tick = -10;
|
||||||
|
|
||||||
if (debug) {
|
if (debug) {
|
||||||
|
|
Loading…
Reference in New Issue