pan/bi: Switch to new scheduler
Delete the old.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8354>
This commit is contained in:
parent
f0c0082ab0
commit
77933d16d8
|
@ -463,7 +463,7 @@ static struct bi_packed_tuple
|
|||
bi_pack_tuple(bi_clause *clause, bi_tuple *tuple, bi_tuple *prev, bool first_tuple, gl_shader_stage stage)
|
||||
{
|
||||
bi_assign_slots(tuple, prev);
|
||||
bi_assign_fau_idx(clause, tuple);
|
||||
tuple->regs.fau_idx = tuple->fau_idx;
|
||||
tuple->regs.first_instruction = first_tuple;
|
||||
|
||||
bi_flip_slots(&tuple->regs);
|
||||
|
@ -509,36 +509,54 @@ bi_pack_tuple(bi_clause *clause, bi_tuple *tuple, bi_tuple *prev, bool first_tup
|
|||
return packed;
|
||||
}
|
||||
|
||||
/* Packs the next two constants as a dedicated constant quadword at the end of
|
||||
* the clause, returning the number packed. There are two cases to consider:
|
||||
*
|
||||
* Case #1: Branching is not used. For a single constant copy the upper nibble
|
||||
* over, easy.
|
||||
*
|
||||
* Case #2: Branching is used. For a single constant, it suffices to set the
|
||||
* upper nibble to 4 and leave the latter constant 0, which matches what the
|
||||
* blob does.
|
||||
*
|
||||
* Extending to multiple constants is considerably more tricky and left for
|
||||
* future work.
|
||||
/* A block contains at most one PC-relative constant, from a terminal branch.
|
||||
* Find the last instruction and if it is a relative branch, fix up the
|
||||
* PC-relative constant to contain the absolute offset. This occurs at pack
|
||||
* time instead of schedule time because the number of quadwords between each
|
||||
* block is not known until after all other passes have finished.
|
||||
*/
|
||||
|
||||
static unsigned
|
||||
bi_pack_constants(bi_context *ctx, bi_clause *clause,
|
||||
unsigned word_idx, bool ec0_packed,
|
||||
static void
|
||||
bi_assign_branch_offset(bi_context *ctx, bi_block *block)
|
||||
{
|
||||
if (list_is_empty(&block->clauses))
|
||||
return;
|
||||
|
||||
bi_clause *clause = list_last_entry(&block->clauses, bi_clause, link);
|
||||
bi_instr *br = bi_last_instr_in_clause(clause);
|
||||
|
||||
if (!br->branch_target)
|
||||
return;
|
||||
|
||||
/* Put it in the high place */
|
||||
int32_t qwords = bi_block_offset(ctx, clause, br->branch_target);
|
||||
int32_t bytes = qwords * 16;
|
||||
|
||||
/* Copy so we can toy with the sign without undefined behaviour */
|
||||
uint32_t raw = 0;
|
||||
memcpy(&raw, &bytes, sizeof(raw));
|
||||
|
||||
/* Clear off top bits for A1/B1 bits */
|
||||
raw &= ~0xF0000000;
|
||||
|
||||
/* Put in top 32-bits */
|
||||
assert(clause->pcrel_idx < 8);
|
||||
clause->constants[clause->pcrel_idx] |= ((uint64_t) raw) << 32ull;
|
||||
}
|
||||
|
||||
static void
|
||||
bi_pack_constants(unsigned tuple_count, uint64_t *constants,
|
||||
unsigned word_idx, unsigned constant_words, bool ec0_packed,
|
||||
struct util_dynarray *emission)
|
||||
{
|
||||
unsigned index = (word_idx << 1) + ec0_packed;
|
||||
|
||||
/* After these two, are we done? Determines tag */
|
||||
bool done = clause->constant_count <= (index + 2);
|
||||
|
||||
/* Is the constant we're packing for a branch? */
|
||||
bool branches = clause->branch_constant && done;
|
||||
/* Do more constants follow */
|
||||
bool more = (word_idx + 1) < constant_words;
|
||||
|
||||
/* Indexed first by tuple count and second by constant word number,
|
||||
* indicates the position in the clause */
|
||||
unsigned pos[8][3] = {
|
||||
unsigned pos_lookup[8][3] = {
|
||||
{ 0 },
|
||||
{ 1 },
|
||||
{ 3 },
|
||||
|
@ -549,57 +567,20 @@ bi_pack_constants(bi_context *ctx, bi_clause *clause,
|
|||
{ 9, 12 }
|
||||
};
|
||||
|
||||
/* Compute branch offset instead of a dummy 0 */
|
||||
bool terminal_branch = true;
|
||||
|
||||
if (branches) {
|
||||
bi_instr *br = clause->tuples[clause->tuple_count - 1].add;
|
||||
assert(br && br->branch_target);
|
||||
|
||||
if (!bi_is_terminal_block(br->branch_target)) {
|
||||
/* Put it in the high place */
|
||||
int32_t qwords = bi_block_offset(ctx, clause, br->branch_target);
|
||||
int32_t bytes = qwords * 16;
|
||||
|
||||
/* Copy so we get proper sign behaviour */
|
||||
uint32_t raw = 0;
|
||||
memcpy(&raw, &bytes, sizeof(raw));
|
||||
|
||||
/* Clear off top bits for the magic bits */
|
||||
raw &= ~0xF0000000;
|
||||
terminal_branch = false;
|
||||
|
||||
/* Put in top 32-bits */
|
||||
clause->constants[index + 0] = ((uint64_t) raw) << 32ull;
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t hi = clause->constants[index + 0] >> 60ull;
|
||||
/* Compute the pos, and check everything is reasonable */
|
||||
assert((tuple_count - 1) < 8);
|
||||
assert(word_idx < 3);
|
||||
unsigned pos = pos_lookup[tuple_count - 1][word_idx];
|
||||
assert(pos != 0 || (tuple_count == 1 && word_idx == 0));
|
||||
|
||||
struct bifrost_fmt_constant quad = {
|
||||
.pos = pos[clause->tuple_count - 1][word_idx], /* TODO */
|
||||
.tag = done ? BIFROST_FMTC_FINAL : BIFROST_FMTC_CONSTANTS,
|
||||
.imm_1 = clause->constants[index + 0] >> 4,
|
||||
.imm_2 = ((hi < 8) ? (hi << 60ull) : 0) >> 4,
|
||||
.pos = pos,
|
||||
.tag = more ? BIFROST_FMTC_CONSTANTS : BIFROST_FMTC_FINAL,
|
||||
.imm_1 = constants[index + 0] >> 4,
|
||||
.imm_2 = constants[index + 1] >> 4,
|
||||
};
|
||||
|
||||
if (branches && !terminal_branch) {
|
||||
/* Branch offsets are less than 60-bits so this should work at
|
||||
* least for now */
|
||||
quad.imm_1 |= (4ull << 60ull) >> 4;
|
||||
assert (hi == 0);
|
||||
}
|
||||
|
||||
/* XXX: On G71, Connor observed that the difference of the top 4 bits
|
||||
* of the second constant with the first must be less than 8, otherwise
|
||||
* we have to swap them. On G52, I'm able to reproduce a similar issue
|
||||
* but with a different workaround (modeled above with a single
|
||||
* constant, unclear how to workaround for multiple constants.) Further
|
||||
* investigation needed. Possibly an errata. XXX */
|
||||
|
||||
util_dynarray_append(emission, struct bifrost_fmt_constant, quad);
|
||||
|
||||
return 2;
|
||||
}
|
||||
|
||||
static inline uint8_t
|
||||
|
@ -800,9 +781,6 @@ bi_pack_clause(bi_context *ctx, bi_clause *clause,
|
|||
struct util_dynarray *emission, gl_shader_stage stage,
|
||||
bool tdd)
|
||||
{
|
||||
/* TODO After the deadline lowering */
|
||||
bi_lower_cubeface2(ctx, &clause->tuples[0]);
|
||||
|
||||
struct bi_packed_tuple ins[8] = { 0 };
|
||||
|
||||
for (unsigned i = 0; i < clause->tuple_count; ++i) {
|
||||
|
@ -857,8 +835,8 @@ bi_pack_clause(bi_context *ctx, bi_clause *clause,
|
|||
/* Pack the remaining constants */
|
||||
|
||||
for (unsigned pos = 0; pos < constant_quads; ++pos) {
|
||||
bi_pack_constants(ctx, clause, pos, ec0_packed,
|
||||
emission);
|
||||
bi_pack_constants(clause->tuple_count, clause->constants,
|
||||
pos, constant_quads, ec0_packed, emission);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -909,6 +887,8 @@ bi_pack(bi_context *ctx, struct util_dynarray *emission)
|
|||
bi_foreach_block(ctx, _block) {
|
||||
bi_block *block = (bi_block *) _block;
|
||||
|
||||
bi_assign_branch_offset(ctx, block);
|
||||
|
||||
/* Passthrough the first clause of where we're branching to for
|
||||
* the last clause of the block (the clause with the branch) */
|
||||
|
||||
|
|
|
@ -266,11 +266,14 @@ bi_spill_dest(bi_builder *b, bi_index index, bi_index temp, uint32_t offset,
|
|||
{
|
||||
b->cursor = bi_after_clause(clause);
|
||||
|
||||
bi_instr *st = bi_store_to(b, channels * 32, bi_null(),
|
||||
temp, bi_imm_u32(offset), bi_zero(), BI_SEG_TL);
|
||||
/* setup FAU as [offset][0] */
|
||||
bi_instr *st = bi_store_to(b, channels * 32, bi_null(), temp,
|
||||
bi_passthrough(BIFROST_SRC_FAU_LO),
|
||||
bi_passthrough(BIFROST_SRC_FAU_HI),
|
||||
BI_SEG_TL);
|
||||
|
||||
bi_clause *singleton = bi_singleton(b->shader, st, block, 0, (1 << 0),
|
||||
true);
|
||||
offset, true);
|
||||
|
||||
list_add(&singleton->link, &clause->link);
|
||||
b->shader->spills++;
|
||||
|
@ -281,12 +284,14 @@ bi_fill_src(bi_builder *b, bi_index index, bi_index temp, uint32_t offset,
|
|||
bi_clause *clause, bi_block *block, unsigned channels)
|
||||
{
|
||||
b->cursor = bi_before_clause(clause);
|
||||
bi_instr *ld = bi_load_to(b, channels * 32, temp, bi_imm_u32(offset),
|
||||
bi_zero(), BI_SEG_TL);
|
||||
bi_instr *ld = bi_load_to(b, channels * 32, temp,
|
||||
bi_passthrough(BIFROST_SRC_FAU_LO),
|
||||
bi_passthrough(BIFROST_SRC_FAU_HI),
|
||||
BI_SEG_TL);
|
||||
ld->no_spill = true;
|
||||
|
||||
bi_clause *singleton = bi_singleton(b->shader, ld, block, 0,
|
||||
(1 << 0), true);
|
||||
(1 << 0), offset, true);
|
||||
|
||||
list_addtail(&singleton->link, &clause->link);
|
||||
b->shader->fills++;
|
||||
|
|
|
@ -239,6 +239,7 @@ bi_singleton(void *memctx, bi_instr *ins,
|
|||
bi_block *block,
|
||||
unsigned scoreboard_id,
|
||||
unsigned dependencies,
|
||||
uint64_t combined_constant,
|
||||
bool osrb)
|
||||
{
|
||||
bi_clause *u = rzalloc(memctx, bi_clause);
|
||||
|
@ -266,42 +267,14 @@ bi_singleton(void *memctx, bi_instr *ins,
|
|||
/* Let's be optimistic, we'll fix up later */
|
||||
u->flow_control = BIFROST_FLOW_NBTB;
|
||||
|
||||
/* Build up a combined constant, count in 32-bit words */
|
||||
uint64_t combined_constant = 0;
|
||||
unsigned constant_count = 0;
|
||||
assert(!ins->branch_target);
|
||||
|
||||
bi_foreach_src(ins, s) {
|
||||
if (ins->src[s].type != BI_INDEX_CONSTANT) continue;
|
||||
unsigned value = ins->src[s].value;
|
||||
|
||||
/* Allow fast zero */
|
||||
if (value == 0 && u->tuples[0].fma) continue;
|
||||
|
||||
if (constant_count == 0) {
|
||||
combined_constant = ins->src[s].value;
|
||||
} else if (constant_count == 1) {
|
||||
/* Allow reuse */
|
||||
if (combined_constant == value)
|
||||
continue;
|
||||
|
||||
combined_constant |= ((uint64_t) value) << 32ull;
|
||||
} else {
|
||||
/* No more room! */
|
||||
assert((combined_constant & 0xffffffff) == value ||
|
||||
(combined_constant >> 32ull) == value);
|
||||
}
|
||||
|
||||
constant_count++;
|
||||
}
|
||||
|
||||
if (ins->branch_target)
|
||||
u->branch_constant = true;
|
||||
|
||||
/* XXX: Investigate errors when constants are not used */
|
||||
if (constant_count || u->branch_constant || true) {
|
||||
if (combined_constant) {
|
||||
/* Clause in 64-bit, above in 32-bit */
|
||||
u->constant_count = 1;
|
||||
u->constants[0] = combined_constant;
|
||||
u->tuples[0].fau_idx = bi_constant_field(0) |
|
||||
(combined_constant & 0xF);
|
||||
}
|
||||
|
||||
u->next_clause_prefetch = (ins->op != BI_OPCODE_JUMP);
|
||||
|
@ -414,44 +387,6 @@ bi_reads_t(bi_instr *ins, unsigned src)
|
|||
}
|
||||
}
|
||||
|
||||
/* Eventually, we'll need a proper scheduling, grouping instructions
|
||||
* into clauses and ordering/assigning grouped instructions to the
|
||||
* appropriate FMA/ADD slots. Right now we do the dumbest possible
|
||||
* thing just to have the scheduler stubbed out so we can focus on
|
||||
* codegen */
|
||||
|
||||
void
|
||||
bi_schedule(bi_context *ctx)
|
||||
{
|
||||
bool is_first = true;
|
||||
|
||||
bi_foreach_block(ctx, block) {
|
||||
bi_block *bblock = (bi_block *) block;
|
||||
|
||||
list_inithead(&bblock->clauses);
|
||||
|
||||
bi_foreach_instr_in_block(bblock, ins) {
|
||||
bi_clause *u = bi_singleton(ctx, ins,
|
||||
bblock, 0, (1 << 0),
|
||||
!is_first);
|
||||
|
||||
is_first = false;
|
||||
list_addtail(&u->link, &bblock->clauses);
|
||||
}
|
||||
|
||||
/* Back-to-back bit affects only the last clause of a block,
|
||||
* the rest are implicitly true */
|
||||
|
||||
if (!list_is_empty(&bblock->clauses)) {
|
||||
bi_clause *last_clause = list_last_entry(&bblock->clauses, bi_clause, link);
|
||||
if (!bi_back_to_back(bblock))
|
||||
last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
|
||||
}
|
||||
|
||||
bblock->scheduled = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Counts the number of 64-bit constants required by a clause. TODO: We
|
||||
* might want to account for merging, right now we overestimate, but
|
||||
* that's probably fine most of the time */
|
||||
|
@ -1427,6 +1362,16 @@ bi_schedule_block(bi_context *ctx, bi_block *block)
|
|||
bi_free_worklist(st);
|
||||
}
|
||||
|
||||
void
|
||||
bi_schedule(bi_context *ctx)
|
||||
{
|
||||
bi_foreach_block(ctx, block) {
|
||||
bi_block *bblock = (bi_block *) block;
|
||||
bi_schedule_block(ctx, bblock);
|
||||
bi_opt_dead_code_eliminate(ctx, bblock, true);
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
||||
static bi_builder *
|
||||
|
|
|
@ -1669,22 +1669,17 @@ bi_emit_cube_coord(bi_builder *b, bi_index coord,
|
|||
bi_index *face, bi_index *s, bi_index *t)
|
||||
{
|
||||
/* Compute max { |x|, |y|, |z| } */
|
||||
bi_index cubeface1 = bi_cubeface1(b, coord,
|
||||
bi_instr *cubeface = bi_cubeface_to(b, bi_temp(b->shader), coord,
|
||||
bi_word(coord, 1), bi_word(coord, 2));
|
||||
|
||||
/* Calculate packed exponent / face / infinity. In reality this reads
|
||||
* the destination from cubeface1 but that's handled by lowering */
|
||||
bi_instr *cubeface2 = bi_cubeface1_to(b, bi_temp(b->shader), coord,
|
||||
bi_word(coord, 1), bi_word(coord, 2));
|
||||
cubeface2->op = BI_OPCODE_CUBEFACE2; /* XXX: DEEP VOODOO */
|
||||
cubeface->dest[1] = bi_temp(b->shader);
|
||||
|
||||
/* Select coordinates */
|
||||
|
||||
bi_index ssel = bi_cube_ssel(b, bi_word(coord, 2), coord,
|
||||
cubeface2->dest[0]);
|
||||
cubeface->dest[1]);
|
||||
|
||||
bi_index tsel = bi_cube_tsel(b, bi_word(coord, 1), bi_word(coord, 2),
|
||||
cubeface2->dest[0]);
|
||||
cubeface->dest[1]);
|
||||
|
||||
/* The OpenGL ES specification requires us to transform an input vector
|
||||
* (x, y, z) to the coordinate, given the selected S/T:
|
||||
|
@ -1700,7 +1695,7 @@ bi_emit_cube_coord(bi_builder *b, bi_index coord,
|
|||
* Take the reciprocal of max{x, y, z}
|
||||
*/
|
||||
|
||||
bi_index rcp = bi_frcp_f32(b, cubeface1);
|
||||
bi_index rcp = bi_frcp_f32(b, cubeface->dest[0]);
|
||||
|
||||
/* Calculate 0.5 * (1.0 / max{x, y, z}) */
|
||||
bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_zero(),
|
||||
|
@ -1722,7 +1717,7 @@ bi_emit_cube_coord(bi_builder *b, bi_index coord,
|
|||
* because the TEXS_CUBE and TEXC instructions expect the face index to
|
||||
* be at this position.
|
||||
*/
|
||||
*face = cubeface2->dest[0];
|
||||
*face = cubeface->dest[1];
|
||||
}
|
||||
|
||||
/* Emits a cube map descriptor, returning lower 32-bits and putting upper
|
||||
|
|
|
@ -743,6 +743,7 @@ bi_singleton(void *memctx, bi_instr *ins,
|
|||
bi_block *block,
|
||||
unsigned scoreboard_id,
|
||||
unsigned dependencies,
|
||||
uint64_t combined_constant,
|
||||
bool osrb);
|
||||
|
||||
/* Liveness */
|
||||
|
|
Loading…
Reference in New Issue