pan/bi: Switch to new scheduler

Delete the old.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8354>
This commit is contained in:
Alyssa Rosenzweig 2021-01-06 15:02:28 -05:00 committed by Marge Bot
parent f0c0082ab0
commit 77933d16d8
5 changed files with 86 additions and 160 deletions

View File

@ -463,7 +463,7 @@ static struct bi_packed_tuple
bi_pack_tuple(bi_clause *clause, bi_tuple *tuple, bi_tuple *prev, bool first_tuple, gl_shader_stage stage)
{
bi_assign_slots(tuple, prev);
bi_assign_fau_idx(clause, tuple);
tuple->regs.fau_idx = tuple->fau_idx;
tuple->regs.first_instruction = first_tuple;
bi_flip_slots(&tuple->regs);
@ -509,36 +509,54 @@ bi_pack_tuple(bi_clause *clause, bi_tuple *tuple, bi_tuple *prev, bool first_tup
return packed;
}
/* Packs the next two constants as a dedicated constant quadword at the end of
* the clause, returning the number packed. There are two cases to consider:
*
* Case #1: Branching is not used. For a single constant copy the upper nibble
* over, easy.
*
* Case #2: Branching is used. For a single constant, it suffices to set the
* upper nibble to 4 and leave the latter constant 0, which matches what the
* blob does.
*
* Extending to multiple constants is considerably more tricky and left for
* future work.
/* A block contains at most one PC-relative constant, from a terminal branch.
* Find the last instruction and if it is a relative branch, fix up the
* PC-relative constant to contain the absolute offset. This occurs at pack
* time instead of schedule time because the number of quadwords between each
* block is not known until after all other passes have finished.
*/
static unsigned
bi_pack_constants(bi_context *ctx, bi_clause *clause,
unsigned word_idx, bool ec0_packed,
static void
bi_assign_branch_offset(bi_context *ctx, bi_block *block)
{
/* Only the terminal clause of a block can hold a branch */
if (list_is_empty(&block->clauses))
return;
bi_clause *clause = list_last_entry(&block->clauses, bi_clause, link);
bi_instr *br = bi_last_instr_in_clause(clause);
/* Not a (relative) branch, nothing to patch */
if (!br->branch_target)
return;
/* Distance to the target, in 16-byte clause quadwords, then bytes */
int32_t qwords = bi_block_offset(ctx, clause, br->branch_target);
int32_t bytes = qwords * 16;
/* Copy so we can toy with the sign without undefined behaviour */
uint32_t raw = 0;
memcpy(&raw, &bytes, sizeof(raw));
/* Clear off top bits for A1/B1 bits */
raw &= ~0xF0000000;
/* Put the offset in the top 32-bits of the reserved PC-relative
 * constant slot recorded at schedule time */
assert(clause->pcrel_idx < 8);
clause->constants[clause->pcrel_idx] |= ((uint64_t) raw) << 32ull;
}
static void
bi_pack_constants(unsigned tuple_count, uint64_t *constants,
unsigned word_idx, unsigned constant_words, bool ec0_packed,
struct util_dynarray *emission)
{
unsigned index = (word_idx << 1) + ec0_packed;
/* After these two, are we done? Determines tag */
bool done = clause->constant_count <= (index + 2);
/* Is the constant we're packing for a branch? */
bool branches = clause->branch_constant && done;
/* Do more constants follow */
bool more = (word_idx + 1) < constant_words;
/* Indexed first by tuple count and second by constant word number,
* indicates the position in the clause */
unsigned pos[8][3] = {
unsigned pos_lookup[8][3] = {
{ 0 },
{ 1 },
{ 3 },
@ -549,57 +567,20 @@ bi_pack_constants(bi_context *ctx, bi_clause *clause,
{ 9, 12 }
};
/* Compute branch offset instead of a dummy 0 */
bool terminal_branch = true;
if (branches) {
bi_instr *br = clause->tuples[clause->tuple_count - 1].add;
assert(br && br->branch_target);
if (!bi_is_terminal_block(br->branch_target)) {
/* Put it in the high place */
int32_t qwords = bi_block_offset(ctx, clause, br->branch_target);
int32_t bytes = qwords * 16;
/* Copy so we get proper sign behaviour */
uint32_t raw = 0;
memcpy(&raw, &bytes, sizeof(raw));
/* Clear off top bits for the magic bits */
raw &= ~0xF0000000;
terminal_branch = false;
/* Put in top 32-bits */
clause->constants[index + 0] = ((uint64_t) raw) << 32ull;
}
}
uint64_t hi = clause->constants[index + 0] >> 60ull;
/* Compute the pos, and check everything is reasonable */
assert((tuple_count - 1) < 8);
assert(word_idx < 3);
unsigned pos = pos_lookup[tuple_count - 1][word_idx];
assert(pos != 0 || (tuple_count == 1 && word_idx == 0));
struct bifrost_fmt_constant quad = {
.pos = pos[clause->tuple_count - 1][word_idx], /* TODO */
.tag = done ? BIFROST_FMTC_FINAL : BIFROST_FMTC_CONSTANTS,
.imm_1 = clause->constants[index + 0] >> 4,
.imm_2 = ((hi < 8) ? (hi << 60ull) : 0) >> 4,
.pos = pos,
.tag = more ? BIFROST_FMTC_CONSTANTS : BIFROST_FMTC_FINAL,
.imm_1 = constants[index + 0] >> 4,
.imm_2 = constants[index + 1] >> 4,
};
if (branches && !terminal_branch) {
/* Branch offsets are less than 60-bits so this should work at
* least for now */
quad.imm_1 |= (4ull << 60ull) >> 4;
assert (hi == 0);
}
/* XXX: On G71, Connor observed that the difference of the top 4 bits
* of the second constant with the first must be less than 8, otherwise
* we have to swap them. On G52, I'm able to reproduce a similar issue
* but with a different workaround (modeled above with a single
* constant, unclear how to workaround for multiple constants.) Further
* investigation needed. Possibly an errata. XXX */
util_dynarray_append(emission, struct bifrost_fmt_constant, quad);
return 2;
}
static inline uint8_t
@ -800,9 +781,6 @@ bi_pack_clause(bi_context *ctx, bi_clause *clause,
struct util_dynarray *emission, gl_shader_stage stage,
bool tdd)
{
/* TODO After the deadline lowering */
bi_lower_cubeface2(ctx, &clause->tuples[0]);
struct bi_packed_tuple ins[8] = { 0 };
for (unsigned i = 0; i < clause->tuple_count; ++i) {
@ -857,8 +835,8 @@ bi_pack_clause(bi_context *ctx, bi_clause *clause,
/* Pack the remaining constants */
for (unsigned pos = 0; pos < constant_quads; ++pos) {
bi_pack_constants(ctx, clause, pos, ec0_packed,
emission);
bi_pack_constants(clause->tuple_count, clause->constants,
pos, constant_quads, ec0_packed, emission);
}
}
@ -909,6 +887,8 @@ bi_pack(bi_context *ctx, struct util_dynarray *emission)
bi_foreach_block(ctx, _block) {
bi_block *block = (bi_block *) _block;
bi_assign_branch_offset(ctx, block);
/* Passthrough the first clause of where we're branching to for
* the last clause of the block (the clause with the branch) */

View File

@ -266,11 +266,14 @@ bi_spill_dest(bi_builder *b, bi_index index, bi_index temp, uint32_t offset,
{
b->cursor = bi_after_clause(clause);
bi_instr *st = bi_store_to(b, channels * 32, bi_null(),
temp, bi_imm_u32(offset), bi_zero(), BI_SEG_TL);
/* setup FAU as [offset][0] */
bi_instr *st = bi_store_to(b, channels * 32, bi_null(), temp,
bi_passthrough(BIFROST_SRC_FAU_LO),
bi_passthrough(BIFROST_SRC_FAU_HI),
BI_SEG_TL);
bi_clause *singleton = bi_singleton(b->shader, st, block, 0, (1 << 0),
true);
offset, true);
list_add(&singleton->link, &clause->link);
b->shader->spills++;
@ -281,12 +284,14 @@ bi_fill_src(bi_builder *b, bi_index index, bi_index temp, uint32_t offset,
bi_clause *clause, bi_block *block, unsigned channels)
{
b->cursor = bi_before_clause(clause);
bi_instr *ld = bi_load_to(b, channels * 32, temp, bi_imm_u32(offset),
bi_zero(), BI_SEG_TL);
bi_instr *ld = bi_load_to(b, channels * 32, temp,
bi_passthrough(BIFROST_SRC_FAU_LO),
bi_passthrough(BIFROST_SRC_FAU_HI),
BI_SEG_TL);
ld->no_spill = true;
bi_clause *singleton = bi_singleton(b->shader, ld, block, 0,
(1 << 0), true);
(1 << 0), offset, true);
list_addtail(&singleton->link, &clause->link);
b->shader->fills++;

View File

@ -239,6 +239,7 @@ bi_singleton(void *memctx, bi_instr *ins,
bi_block *block,
unsigned scoreboard_id,
unsigned dependencies,
uint64_t combined_constant,
bool osrb)
{
bi_clause *u = rzalloc(memctx, bi_clause);
@ -266,42 +267,14 @@ bi_singleton(void *memctx, bi_instr *ins,
/* Let's be optimistic, we'll fix up later */
u->flow_control = BIFROST_FLOW_NBTB;
/* Build up a combined constant, count in 32-bit words */
uint64_t combined_constant = 0;
unsigned constant_count = 0;
assert(!ins->branch_target);
bi_foreach_src(ins, s) {
if (ins->src[s].type != BI_INDEX_CONSTANT) continue;
unsigned value = ins->src[s].value;
/* Allow fast zero */
if (value == 0 && u->tuples[0].fma) continue;
if (constant_count == 0) {
combined_constant = ins->src[s].value;
} else if (constant_count == 1) {
/* Allow reuse */
if (combined_constant == value)
continue;
combined_constant |= ((uint64_t) value) << 32ull;
} else {
/* No more room! */
assert((combined_constant & 0xffffffff) == value ||
(combined_constant >> 32ull) == value);
}
constant_count++;
}
if (ins->branch_target)
u->branch_constant = true;
/* XXX: Investigate errors when constants are not used */
if (constant_count || u->branch_constant || true) {
if (combined_constant) {
/* Clause in 64-bit, above in 32-bit */
u->constant_count = 1;
u->constants[0] = combined_constant;
u->tuples[0].fau_idx = bi_constant_field(0) |
(combined_constant & 0xF);
}
u->next_clause_prefetch = (ins->op != BI_OPCODE_JUMP);
@ -414,44 +387,6 @@ bi_reads_t(bi_instr *ins, unsigned src)
}
}
/* Eventually, we'll need a proper scheduling, grouping instructions
* into clauses and ordering/assigning grouped instructions to the
* appropriate FMA/ADD slots. Right now we do the dumbest possible
* thing just to have the scheduler stubbed out so we can focus on
* codegen */
void
bi_schedule(bi_context *ctx)
{
/* Only the very first clause of the shader clears the
 * ordinary-state-retention bit; track that across blocks */
bool is_first = true;
bi_foreach_block(ctx, block) {
bi_block *bblock = (bi_block *) block;
list_inithead(&bblock->clauses);
/* Dumb scheduling: wrap every instruction in its own
 * single-tuple clause, in program order */
bi_foreach_instr_in_block(bblock, ins) {
bi_clause *u = bi_singleton(ctx, ins,
bblock, 0, (1 << 0),
!is_first);
is_first = false;
list_addtail(&u->link, &bblock->clauses);
}
/* Back-to-back bit affects only the last clause of a block,
* the rest are implicitly true */
if (!list_is_empty(&bblock->clauses)) {
bi_clause *last_clause = list_last_entry(&bblock->clauses, bi_clause, link);
if (!bi_back_to_back(bblock))
last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
}
bblock->scheduled = true;
}
}
/* Counts the number of 64-bit constants required by a clause. TODO: We
* might want to account for merging, right now we overestimate, but
* that's probably fine most of the time */
@ -1427,6 +1362,16 @@ bi_schedule_block(bi_context *ctx, bi_block *block)
bi_free_worklist(st);
}
/* Entry point for the scheduler: schedule every block in the shader,
 * then run a DCE pass per block to clean up anything the scheduler
 * rendered dead. */
void
bi_schedule(bi_context *ctx)
{
bi_foreach_block(ctx, block) {
bi_block *blk = (bi_block *) block;
bi_schedule_block(ctx, blk);
bi_opt_dead_code_eliminate(ctx, blk, true);
}
}
#ifndef NDEBUG
static bi_builder *

View File

@ -1669,22 +1669,17 @@ bi_emit_cube_coord(bi_builder *b, bi_index coord,
bi_index *face, bi_index *s, bi_index *t)
{
/* Compute max { |x|, |y|, |z| } */
bi_index cubeface1 = bi_cubeface1(b, coord,
bi_instr *cubeface = bi_cubeface_to(b, bi_temp(b->shader), coord,
bi_word(coord, 1), bi_word(coord, 2));
/* Calculate packed exponent / face / infinity. In reality this reads
* the destination from cubeface1 but that's handled by lowering */
bi_instr *cubeface2 = bi_cubeface1_to(b, bi_temp(b->shader), coord,
bi_word(coord, 1), bi_word(coord, 2));
cubeface2->op = BI_OPCODE_CUBEFACE2; /* XXX: DEEP VOODOO */
cubeface->dest[1] = bi_temp(b->shader);
/* Select coordinates */
bi_index ssel = bi_cube_ssel(b, bi_word(coord, 2), coord,
cubeface2->dest[0]);
cubeface->dest[1]);
bi_index tsel = bi_cube_tsel(b, bi_word(coord, 1), bi_word(coord, 2),
cubeface2->dest[0]);
cubeface->dest[1]);
/* The OpenGL ES specification requires us to transform an input vector
* (x, y, z) to the coordinate, given the selected S/T:
@ -1700,7 +1695,7 @@ bi_emit_cube_coord(bi_builder *b, bi_index coord,
* Take the reciprocal of max{x, y, z}
*/
bi_index rcp = bi_frcp_f32(b, cubeface1);
bi_index rcp = bi_frcp_f32(b, cubeface->dest[0]);
/* Calculate 0.5 * (1.0 / max{x, y, z}) */
bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_zero(),
@ -1722,7 +1717,7 @@ bi_emit_cube_coord(bi_builder *b, bi_index coord,
* because the TEXS_CUBE and TEXC instructions expect the face index to
* be at this position.
*/
*face = cubeface2->dest[0];
*face = cubeface->dest[1];
}
/* Emits a cube map descriptor, returning lower 32-bits and putting upper

View File

@ -743,6 +743,7 @@ bi_singleton(void *memctx, bi_instr *ins,
bi_block *block,
unsigned scoreboard_id,
unsigned dependencies,
uint64_t combined_constant,
bool osrb);
/* Liveness */