/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *    Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "compiler.h"
#include "bi_builder.h"

/* Arguments common to worklist, passed by value for convenience */

struct bi_worklist {
        /* # of instructions in the block */
        unsigned count;

        /* Instructions in the block */
        bi_instr **instructions;

        /* Bitset of instructions in the block ready for scheduling */
        BITSET_WORD *worklist;

        /* The backwards dependency graph. dep_counts[i] is the number of
         * unscheduled instructions that must still be scheduled after (before)
         * instruction i. dependents[i] are which instructions need to be
         * scheduled before (after) instruction i. */
        unsigned *dep_counts;
        BITSET_WORD **dependents;
};

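/* Illustrative example (hypothetical indices and registers, not part of the
 * pass): since scheduling walks the block bottom-up, an instruction I0 that
 * writes r0 which is read later by I2 yields dependents[2] = {0} and
 * dep_counts[0] = 1; I2 starts on the worklist, and I0 becomes ready once I2
 * has been scheduled. */
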
/* State of a single tuple and clause under construction */

struct bi_reg_state {
        /* Number of register writes */
        unsigned nr_writes;

        /* Register reads, expressed as (equivalence classes of)
         * sources. Only 3 reads are allowed, but up to 2 may spill as
         * "forced" for the next scheduled tuple, provided such a tuple
         * can be constructed */
        bi_index reads[5];
        unsigned nr_reads;

        /* The previous tuple scheduled (= the next tuple executed in the
         * program) may require certain writes, in order to bypass the register
         * file and use a temporary passthrough for the value. Up to 2 such
         * constraints are architecturally satisfiable */
        unsigned forced_count;
        bi_index forceds[2];
};

struct bi_tuple_state {
        /* Is this the last tuple in the clause */
        bool last;

        /* Scheduled ADD instruction, or null if none */
        bi_instr *add;

        /* Reads for previous (succeeding) tuple */
        bi_index prev_reads[5];
        unsigned nr_prev_reads;
        bi_tuple *prev;

        /* Register slot state for current tuple */
        struct bi_reg_state reg;

        /* Constants are shared in the tuple. If constant_count is nonzero,
         * constants[] holds that many 32-bit constants. Otherwise, fau is the
         * slot read from FAU, or zero if none is assigned. Ordinarily FAU
         * slot 0 reads zero, but within a tuple, that should be encoded as
         * constant_count != 0 and constants[0] = constants[1] = 0 */
        unsigned constant_count;

        union {
                uint32_t constants[2];
                enum bir_fau fau;
        };

        unsigned pcrel_idx;
};

struct bi_const_state {
        unsigned constant_count;
        bool pcrel; /* applies to first const */
        uint32_t constants[2];

        /* Index of the constant into the clause */
        unsigned word_idx;
};

enum bi_ftz_state {
        /* No flush-to-zero state assigned yet */
        BI_FTZ_STATE_NONE,

        /* Never flush-to-zero */
        BI_FTZ_STATE_DISABLE,

        /* Always flush-to-zero */
        BI_FTZ_STATE_ENABLE,
};

struct bi_clause_state {
        /* Has a message-passing instruction already been assigned? */
        bool message;

        /* Indices already accessed, this needs to be tracked to avoid hazards
         * around message-passing instructions */
        unsigned access_count;
        bi_index accesses[(BI_MAX_SRCS + BI_MAX_DESTS) * 16];

        unsigned tuple_count;
        struct bi_const_state consts[8];

        /* Numerical state of the clause */
        enum bi_ftz_state ftz;
};

/* Determines message type by checking the table and a few special cases. Only
 * case missing is tilebuffer instructions that access depth/stencil, which
 * require a Z_STENCIL message (to implement
 * ARM_shader_framebuffer_fetch_depth_stencil) */

static enum bifrost_message_type
bi_message_type_for_instr(bi_instr *ins)
{
        enum bifrost_message_type msg = bi_opcode_props[ins->op].message;
        bool ld_var_special = (ins->op == BI_OPCODE_LD_VAR_SPECIAL);

        if (ld_var_special && ins->varying_name == BI_VARYING_NAME_FRAG_Z)
                return BIFROST_MESSAGE_Z_STENCIL;

        if (msg == BIFROST_MESSAGE_LOAD && ins->seg == BI_SEG_UBO)
                return BIFROST_MESSAGE_ATTRIBUTE;

        return msg;
}

/* Attribute, texture, and UBO load (attribute message) instructions support
 * bindless, so just check the message type */

ASSERTED static bool
bi_supports_dtsel(bi_instr *ins)
{
        switch (bi_message_type_for_instr(ins)) {
        case BIFROST_MESSAGE_ATTRIBUTE:
                return ins->op != BI_OPCODE_LD_GCLK_U64;
        case BIFROST_MESSAGE_TEX:
                return true;
        default:
                return false;
        }
}

/* Adds an edge to the dependency graph */

static void
bi_push_dependency(unsigned parent, unsigned child,
                BITSET_WORD **dependents, unsigned *dep_counts)
{
        if (!BITSET_TEST(dependents[parent], child)) {
                BITSET_SET(dependents[parent], child);
                dep_counts[child]++;
        }
}

static void
add_dependency(struct util_dynarray *table, unsigned index, unsigned child,
                BITSET_WORD **dependents, unsigned *dep_counts)
{
        assert(index < 64);
        util_dynarray_foreach(table + index, unsigned, parent)
                bi_push_dependency(*parent, child, dependents, dep_counts);
}

static void
mark_access(struct util_dynarray *table, unsigned index, unsigned parent)
{
        assert(index < 64);
        util_dynarray_append(&table[index], unsigned, parent);
}

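/* Worked example (hypothetical indices): mark_access(last_read, 4, 7) records
 * that instruction 7 reads register r4; a subsequent
 * add_dependency(last_read, 4, 2, ...) then calls bi_push_dependency(7, 2, ...),
 * and the BITSET_TEST guard above ensures dep_counts[2] is bumped at most once
 * per (parent, child) edge. */
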
static bool
bi_is_sched_barrier(bi_instr *I)
{
        switch (I->op) {
        case BI_OPCODE_BARRIER:
        case BI_OPCODE_DISCARD_F32:
                return true;
        default:
                return false;
        }
}

static void
bi_create_dependency_graph(struct bi_worklist st, bool inorder, bool is_blend)
{
        struct util_dynarray last_read[64], last_write[64];

        for (unsigned i = 0; i < 64; ++i) {
                util_dynarray_init(&last_read[i], NULL);
                util_dynarray_init(&last_write[i], NULL);
        }

        /* Initialize dependency graph */
        for (unsigned i = 0; i < st.count; ++i) {
                st.dependents[i] =
                        calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));

                st.dep_counts[i] = 0;
        }

        unsigned prev_msg = ~0;

        /* Populate dependency graph */
        for (signed i = st.count - 1; i >= 0; --i) {
                bi_instr *ins = st.instructions[i];

                bi_foreach_src(ins, s) {
                        if (ins->src[s].type != BI_INDEX_REGISTER) continue;
                        unsigned count = bi_count_read_registers(ins, s);

                        for (unsigned c = 0; c < count; ++c)
                                add_dependency(last_write, ins->src[s].value + c, i, st.dependents, st.dep_counts);
                }

                /* Keep message-passing ops in order. (This pass only cares
                 * about bundling; reordering of message-passing instructions
                 * happens during earlier scheduling.) */

                if (bi_message_type_for_instr(ins)) {
                        if (prev_msg != ~0)
                                bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts);

                        prev_msg = i;
                }

                /* Handle schedule barriers, adding all the deps */
                if (inorder || bi_is_sched_barrier(ins)) {
                        for (unsigned j = 0; j < st.count; ++j) {
                                if (i == j) continue;

                                bi_push_dependency(MAX2(i, j), MIN2(i, j),
                                                st.dependents, st.dep_counts);
                        }
                }

                bi_foreach_dest(ins, d) {
                        if (ins->dest[d].type != BI_INDEX_REGISTER) continue;
                        unsigned dest = ins->dest[d].value;

                        unsigned count = bi_count_write_registers(ins, d);

                        for (unsigned c = 0; c < count; ++c) {
                                add_dependency(last_read, dest + c, i, st.dependents, st.dep_counts);
                                add_dependency(last_write, dest + c, i, st.dependents, st.dep_counts);
                                mark_access(last_write, dest + c, i);
                        }
                }

                /* Blend shaders are allowed to clobber R0-R15. Treat these
                 * registers like extra destinations for scheduling purposes.
                 */
                if (ins->op == BI_OPCODE_BLEND && !is_blend) {
                        for (unsigned c = 0; c < 16; ++c) {
                                add_dependency(last_read, c, i, st.dependents, st.dep_counts);
                                add_dependency(last_write, c, i, st.dependents, st.dep_counts);
                                mark_access(last_write, c, i);
                        }
                }

                bi_foreach_src(ins, s) {
                        if (ins->src[s].type != BI_INDEX_REGISTER) continue;

                        unsigned count = bi_count_read_registers(ins, s);

                        for (unsigned c = 0; c < count; ++c)
                                mark_access(last_read, ins->src[s].value + c, i);
                }
        }

        /* If there is a branch, all instructions depend on it, as interblock
         * execution must be purely in-order */

        bi_instr *last = st.instructions[st.count - 1];
        if (last->branch_target || last->op == BI_OPCODE_JUMP) {
                for (signed i = st.count - 2; i >= 0; --i)
                        bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts);
        }

        /* Free the intermediate structures */
        for (unsigned i = 0; i < 64; ++i) {
                util_dynarray_fini(&last_read[i]);
                util_dynarray_fini(&last_write[i]);
        }
}

/* Scheduler pseudoinstruction lowerings to enable instruction pairings.
 * Currently only support CUBEFACE -> *CUBEFACE1/+CUBEFACE2
 */

static bi_instr *
bi_lower_cubeface(bi_context *ctx,
                struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
        bi_instr *cubeface1 = bi_cubeface1_to(&b, pinstr->dest[0],
                        pinstr->src[0], pinstr->src[1], pinstr->src[2]);

        pinstr->op = BI_OPCODE_CUBEFACE2;
        pinstr->dest[0] = pinstr->dest[1];
        pinstr->dest[1] = bi_null();
        pinstr->src[0] = cubeface1->dest[0];
        pinstr->src[1] = bi_null();
        pinstr->src[2] = bi_null();

        return cubeface1;
}

/* Pseudo arguments are (rbase, address lo, address hi). We need *ATOM_C.i32 to
 * have the arguments (address lo, address hi, rbase), and +ATOM_CX to have the
 * arguments (rbase, address lo, address hi, rbase) */

static bi_instr *
bi_lower_atom_c(bi_context *ctx, struct bi_clause_state *clause, struct
                bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
        bi_instr *atom_c = bi_atom_c_return_i32(&b,
                        pinstr->src[1], pinstr->src[2], pinstr->src[0],
                        pinstr->atom_opc);

        if (bi_is_null(pinstr->dest[0]))
                atom_c->op = BI_OPCODE_ATOM_C_I32;

        pinstr->op = BI_OPCODE_ATOM_CX;
        pinstr->src[3] = atom_c->src[2];

        return atom_c;
}

static bi_instr *
bi_lower_atom_c1(bi_context *ctx, struct bi_clause_state *clause, struct
                bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
        bi_instr *atom_c = bi_atom_c1_return_i32(&b,
                        pinstr->src[0], pinstr->src[1], pinstr->atom_opc);

        if (bi_is_null(pinstr->dest[0]))
                atom_c->op = BI_OPCODE_ATOM_C1_I32;

        pinstr->op = BI_OPCODE_ATOM_CX;
        pinstr->src[2] = pinstr->src[1];
        pinstr->src[1] = pinstr->src[0];
        pinstr->src[3] = bi_dontcare(&b);
        pinstr->src[0] = bi_null();

        return atom_c;
}

static bi_instr *
bi_lower_seg_add(bi_context *ctx,
                struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));

        bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0],
                        pinstr->preserve_null, pinstr->seg);

        pinstr->op = BI_OPCODE_SEG_ADD;
        pinstr->src[0] = pinstr->src[1];
        pinstr->src[1] = bi_null();

        assert(pinstr->dest[0].type == BI_INDEX_REGISTER);
        pinstr->dest[0].value += 1;

        return fma;
}

static bi_instr *
bi_lower_dtsel(bi_context *ctx,
                struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
        bi_instr *add = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(add));

        bi_instr *dtsel = bi_dtsel_imm_to(&b, bi_temp(b.shader),
                        add->src[0], add->table);
        add->src[0] = dtsel->dest[0];

        assert(bi_supports_dtsel(add));
        return dtsel;
}

/* Flatten linked list to array for O(1) indexing */

static bi_instr **
bi_flatten_block(bi_block *block, unsigned *len)
{
        if (list_is_empty(&block->instructions))
                return NULL;

        *len = list_length(&block->instructions);
        bi_instr **instructions = malloc(sizeof(bi_instr *) * (*len));

        unsigned i = 0;

        bi_foreach_instr_in_block(block, ins)
                instructions[i++] = ins;

        return instructions;
}

/* The worklist tracks instructions without outstanding dependencies. For
 * debug, in-order scheduling may be forced by adding a total order of
 * dependencies.
 */

static struct bi_worklist
bi_initialize_worklist(bi_block *block, bool inorder, bool is_blend)
{
        struct bi_worklist st = { };
        st.instructions = bi_flatten_block(block, &st.count);

        if (!st.count)
                return st;

        st.dependents = calloc(st.count, sizeof(st.dependents[0]));
        st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0]));

        bi_create_dependency_graph(st, inorder, is_blend);
        st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));

        for (unsigned i = 0; i < st.count; ++i) {
                if (st.dep_counts[i] == 0)
                        BITSET_SET(st.worklist, i);
        }

        return st;
}

static void
bi_free_worklist(struct bi_worklist st)
{
        free(st.dep_counts);
        free(st.dependents);
        free(st.instructions);
        free(st.worklist);
}

static void
bi_update_worklist(struct bi_worklist st, unsigned idx)
{
        assert(st.dep_counts[idx] == 0);

        if (!st.dependents[idx])
                return;

        /* Iterate each dependent to remove one dependency (`done`),
         * adding dependents to the worklist where possible. */

        unsigned i;
        BITSET_FOREACH_SET(i, st.dependents[idx], st.count) {
                assert(st.dep_counts[i] != 0);
                unsigned new_deps = --st.dep_counts[i];

                if (new_deps == 0)
                        BITSET_SET(st.worklist, i);
        }

        free(st.dependents[idx]);
}

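/* Sketch of the update flow (hypothetical indices): with dependents[3] = {1, 5},
 * dep_counts[1] = 2 and dep_counts[5] = 1, calling bi_update_worklist(st, 3)
 * leaves instruction 1 with one outstanding dependency and moves instruction 5
 * onto the worklist, since its count drops to zero. */
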
/* Scheduler predicates */

/* IADDC.i32 can implement IADD.u32 if no saturation or swizzling is in use */
static bool
bi_can_iaddc(bi_instr *ins)
{
        return (ins->op == BI_OPCODE_IADD_U32 && !ins->saturate &&
                ins->src[0].swizzle == BI_SWIZZLE_H01 &&
                ins->src[1].swizzle == BI_SWIZZLE_H01);
}

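/* When such an IADD.u32 is placed on FMA, bi_take_instr later rewrites it to
 * IADDC.i32 with an explicit zero carry-in, e.g. (hypothetical operands):
 *
 *    +IADD.u32 r0, r1, r2   -->   *IADDC.i32 r0, r1, r2, #0
 */
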
/*
 * The encoding of *FADD.v2f16 only specifies a single abs flag. All abs
 * encodings are permitted by swapping operands; however, this scheme fails if
 * both operands are equal. Test for this case.
 */
static bool
bi_impacted_abs(bi_instr *I)
{
        return I->src[0].abs && I->src[1].abs &&
               bi_is_word_equiv(I->src[0], I->src[1]);
}

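/* Example of the failing case (hypothetical register): FADD.v2f16 with sources
 * abs(r1), abs(r1) needs abs on both operands of the same word; the single
 * *FADD.v2f16 abs bit plus operand swapping cannot express that, so the
 * instruction is forced onto ADD. */
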
bool
bi_can_fma(bi_instr *ins)
{
        /* +IADD.i32 -> *IADDC.i32 */
        if (bi_can_iaddc(ins))
                return true;

        /* +MUX -> *CSEL */
        if (bi_can_replace_with_csel(ins))
                return true;

        /* *FADD.v2f16 has restricted abs modifiers, use +FADD.v2f16 instead */
        if (ins->op == BI_OPCODE_FADD_V2F16 && bi_impacted_abs(ins))
                return false;

        /* TODO: some additional fp16 constraints */
        return bi_opcode_props[ins->op].fma;
}

static bool
bi_impacted_fadd_widens(bi_instr *I)
{
        enum bi_swizzle swz0 = I->src[0].swizzle;
        enum bi_swizzle swz1 = I->src[1].swizzle;

        return (swz0 == BI_SWIZZLE_H00 && swz1 == BI_SWIZZLE_H11) ||
               (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H11) ||
               (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H00);
}

bool
bi_can_add(bi_instr *ins)
{
        /* +FADD.v2f16 lacks clamp modifier, use *FADD.v2f16 instead */
        if (ins->op == BI_OPCODE_FADD_V2F16 && ins->clamp)
                return false;

        /* +FCMP.v2f16 lacks abs modifier, use *FCMP.v2f16 instead */
        if (ins->op == BI_OPCODE_FCMP_V2F16 && (ins->src[0].abs || ins->src[1].abs))
                return false;

        /* +FADD.f32 has restricted widens, use *FADD.f32 for the full set */
        if (ins->op == BI_OPCODE_FADD_F32 && bi_impacted_fadd_widens(ins))
                return false;

        /* TODO: some additional fp16 constraints */
        return bi_opcode_props[ins->op].add;
}

/* Architecturally, no single instruction has a "not last" constraint. However,
 * pseudoinstructions writing multiple destinations (expanding to multiple
 * paired instructions) can run afoul of the "no two writes in the last tuple"
 * constraint, so we check for that here.
 *
 * Exception to the exception: TEXC, which writes to multiple sets of staging
 * registers. Staging registers bypass the usual register write mechanism so
 * this restriction does not apply.
 */

static bool
bi_must_not_last(bi_instr *ins)
{
        return !bi_is_null(ins->dest[0]) && !bi_is_null(ins->dest[1]) &&
               (ins->op != BI_OPCODE_TEXC);
}

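/* Example (hypothetical destinations): a CUBEFACE pseudoinstruction writing
 * both r0 and r1 expands to a *CUBEFACE1/+CUBEFACE2 pair, two register file
 * writes in one tuple, so it must not land in the last tuple; TEXC is exempt
 * because its extra writes go through staging registers. */
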
/* Check for a message-passing instruction. +DISCARD.f32 is special-cased; we
 * treat it as a message-passing instruction for the purpose of scheduling
 * despite passing no logical message. Otherwise invalid encoding faults may
 * be raised for unknown reasons (possibly an erratum).
 */

bool
bi_must_message(bi_instr *ins)
{
        return (bi_opcode_props[ins->op].message != BIFROST_MESSAGE_NONE) ||
               (ins->op == BI_OPCODE_DISCARD_F32);
}

static bool
bi_fma_atomic(enum bi_opcode op)
{
        switch (op) {
        case BI_OPCODE_ATOM_C_I32:
        case BI_OPCODE_ATOM_C_I64:
        case BI_OPCODE_ATOM_C1_I32:
        case BI_OPCODE_ATOM_C1_I64:
        case BI_OPCODE_ATOM_C1_RETURN_I32:
        case BI_OPCODE_ATOM_C1_RETURN_I64:
        case BI_OPCODE_ATOM_C_RETURN_I32:
        case BI_OPCODE_ATOM_C_RETURN_I64:
        case BI_OPCODE_ATOM_POST_I32:
        case BI_OPCODE_ATOM_POST_I64:
        case BI_OPCODE_ATOM_PRE_I64:
                return true;
        default:
                return false;
        }
}

bool
bi_reads_zero(bi_instr *ins)
{
        return !(bi_fma_atomic(ins->op) || ins->op == BI_OPCODE_IMULD);
}

bool
bi_reads_temps(bi_instr *ins, unsigned src)
{
        switch (ins->op) {
        /* Cannot permute a temporary */
        case BI_OPCODE_CLPER_I32:
        case BI_OPCODE_CLPER_V6_I32:
                return src != 0;
        case BI_OPCODE_IMULD:
                return false;
        default:
                return true;
        }
}

static bool
bi_impacted_t_modifiers(bi_instr *I, unsigned src)
{
        enum bi_swizzle swizzle = I->src[src].swizzle;

        switch (I->op) {
        case BI_OPCODE_F16_TO_F32:
        case BI_OPCODE_F16_TO_S32:
        case BI_OPCODE_F16_TO_U32:
        case BI_OPCODE_MKVEC_V2I16:
        case BI_OPCODE_S16_TO_F32:
        case BI_OPCODE_S16_TO_S32:
        case BI_OPCODE_U16_TO_F32:
        case BI_OPCODE_U16_TO_U32:
                return (swizzle != BI_SWIZZLE_H00);

        case BI_OPCODE_BRANCH_F32:
        case BI_OPCODE_LOGB_F32:
        case BI_OPCODE_ILOGB_F32:
        case BI_OPCODE_FADD_F32:
        case BI_OPCODE_FCMP_F32:
        case BI_OPCODE_FREXPE_F32:
        case BI_OPCODE_FREXPM_F32:
        case BI_OPCODE_FROUND_F32:
                return (swizzle != BI_SWIZZLE_H01);

        case BI_OPCODE_IADD_S32:
        case BI_OPCODE_IADD_U32:
        case BI_OPCODE_ISUB_S32:
        case BI_OPCODE_ISUB_U32:
        case BI_OPCODE_IADD_V4S8:
        case BI_OPCODE_IADD_V4U8:
        case BI_OPCODE_ISUB_V4S8:
        case BI_OPCODE_ISUB_V4U8:
                return (src == 1) && (swizzle != BI_SWIZZLE_H01);

        case BI_OPCODE_S8_TO_F32:
        case BI_OPCODE_S8_TO_S32:
        case BI_OPCODE_U8_TO_F32:
        case BI_OPCODE_U8_TO_U32:
                return (swizzle != BI_SWIZZLE_B0000);

        case BI_OPCODE_V2S8_TO_V2F16:
        case BI_OPCODE_V2S8_TO_V2S16:
        case BI_OPCODE_V2U8_TO_V2F16:
        case BI_OPCODE_V2U8_TO_V2U16:
                return (swizzle != BI_SWIZZLE_B0022);

        case BI_OPCODE_IADD_V2S16:
        case BI_OPCODE_IADD_V2U16:
        case BI_OPCODE_ISUB_V2S16:
        case BI_OPCODE_ISUB_V2U16:
                return (src == 1) && (swizzle >= BI_SWIZZLE_H11);

#if 0
        /* Restriction on IADD in 64-bit clauses on G72 */
        case BI_OPCODE_IADD_S64:
        case BI_OPCODE_IADD_U64:
                return (src == 1) && (swizzle != BI_SWIZZLE_D0);
#endif

        default:
                return false;
        }
}

bool
bi_reads_t(bi_instr *ins, unsigned src)
{
        /* Branch offset cannot come from passthrough */
        if (bi_opcode_props[ins->op].branch)
                return src != 2;

        /* Table can never read passthrough */
        if (bi_opcode_props[ins->op].table)
                return false;

        /* Staging register reads may happen before the succeeding register
         * block encodes a write, so effectively there is no passthrough */
        if (bi_is_staging_src(ins, src))
                return false;

        /* Bifrost cores newer than Mali G71 have restrictions on swizzles on
         * same-cycle temporaries. Check the list for these hazards. */
        if (bi_impacted_t_modifiers(ins, src))
                return false;

        /* Descriptor must not come from a passthrough */
        switch (ins->op) {
        case BI_OPCODE_LD_CVT:
        case BI_OPCODE_LD_TILE:
        case BI_OPCODE_ST_CVT:
        case BI_OPCODE_ST_TILE:
        case BI_OPCODE_TEXC:
                return src != 2;
        case BI_OPCODE_BLEND:
                return src != 2 && src != 3;

        /* +JUMP can't read the offset from T */
        case BI_OPCODE_JUMP:
                return false;

        /* Else, just check if we can read any temps */
        default:
                return bi_reads_temps(ins, src);
        }
}

/* Counts the number of 64-bit constants required by a clause. TODO: We
 * might want to account for merging, right now we overestimate, but
 * that's probably fine most of the time */

static unsigned
bi_nconstants(struct bi_clause_state *clause)
{
        unsigned count_32 = 0;

        for (unsigned i = 0; i < ARRAY_SIZE(clause->consts); ++i)
                count_32 += clause->consts[i].constant_count;

        return DIV_ROUND_UP(count_32, 2);
}

/* Would there be space for constants if we added one tuple? */

static bool
bi_space_for_more_constants(struct bi_clause_state *clause)
{
        return (bi_nconstants(clause) < 13 - (clause->tuple_count + 1));
}

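/* Worked example of the bound (reading the 13 as the clause encoding budget):
 * with 4 tuples already scheduled, adding one more leaves 13 - 5 = 8 slots, so
 * a clause that already needs 8 pairs of 64-bit constants cannot accept
 * another tuple requiring a new constant. */
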
/* Updates the FAU assignment for a tuple. A valid FAU assignment must be
 * possible (as a precondition), though not necessarily on the selected unit;
 * this is guaranteed per-instruction by bi_lower_fau and per-tuple by
 * bi_instr_schedulable */

static bool
bi_update_fau(struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                bi_instr *instr, bool fma, bool destructive)
{
        /* Maintain our own constants, for nondestructive mode */
        uint32_t copied_constants[2], copied_count;
        unsigned *constant_count = &tuple->constant_count;
        uint32_t *constants = tuple->constants;
        enum bir_fau fau = tuple->fau;

        if (!destructive) {
                memcpy(copied_constants, tuple->constants,
                                (*constant_count) * sizeof(constants[0]));
                copied_count = tuple->constant_count;

                constant_count = &copied_count;
                constants = copied_constants;
        }

        bi_foreach_src(instr, s) {
                bi_index src = instr->src[s];

                if (src.type == BI_INDEX_FAU) {
                        bool no_constants = *constant_count == 0;
                        bool no_other_fau = (fau == src.value) || !fau;
                        bool mergable = no_constants && no_other_fau;

                        if (destructive) {
                                assert(mergable);
                                tuple->fau = src.value;
                        } else if (!mergable) {
                                return false;
                        }

                        fau = src.value;
                } else if (src.type == BI_INDEX_CONSTANT) {
                        /* No need to reserve space if we have a fast 0 */
                        if (src.value == 0 && fma && bi_reads_zero(instr))
                                continue;

                        /* If there is a branch target, #0 by convention is the
                         * PC-relative offset to the target */
                        bool pcrel = instr->branch_target && src.value == 0;
                        bool found = false;

                        for (unsigned i = 0; i < *constant_count; ++i) {
                                found |= (constants[i] == src.value) &&
                                        (i != tuple->pcrel_idx);
                        }

                        /* pcrel constants are unique, so don't match */
                        if (found && !pcrel)
                                continue;

                        bool no_fau = (*constant_count > 0) || !fau;
                        bool mergable = no_fau && ((*constant_count) < 2);

                        if (destructive) {
                                assert(mergable);

                                if (pcrel)
                                        tuple->pcrel_idx = *constant_count;
                        } else if (!mergable)
                                return false;

                        constants[(*constant_count)++] = src.value;
                }
        }

        /* Constants per clause may be limited by tuple count */
        bool room_for_constants = (*constant_count == 0) ||
                bi_space_for_more_constants(clause);

        if (destructive)
                assert(room_for_constants);
        else if (!room_for_constants)
                return false;

        return true;
}

/* Given an in-progress tuple, a candidate new instruction to add to the tuple,
 * and a source (index) from that candidate, determine whether this source is
 * "new", in the sense of requiring an additional read slot. That is, checks
 * whether the specified source reads from the register file via a read slot
 * (determined by its type and placement) and whether the source was already
 * specified by a prior read slot (to avoid double counting) */

static bool
bi_tuple_is_new_src(bi_instr *instr, struct bi_reg_state *reg, unsigned src_idx)
{
        bi_index src = instr->src[src_idx];

        /* Only consider sources which come from the register file */
        if (!(src.type == BI_INDEX_NORMAL || src.type == BI_INDEX_REGISTER))
                return false;

        /* Staging register reads bypass the usual register file mechanism */
        if (bi_is_staging_src(instr, src_idx))
                return false;

        /* If a source is already read in the tuple, it is already counted */
        for (unsigned t = 0; t < reg->nr_reads; ++t)
                if (bi_is_word_equiv(src, reg->reads[t]))
                        return false;

        /* If a source is read in _this instruction_, it is already counted */
        for (unsigned t = 0; t < src_idx; ++t)
                if (bi_is_word_equiv(src, instr->src[t]))
                        return false;

        return true;
}

/* Given two tuples in source order, count the number of register reads of the
 * successor, determined as the number of unique words accessed that aren't
 * written by the predecessor (since those are tempable).
 */

static unsigned
bi_count_succ_reads(bi_index t0, bi_index t1,
                bi_index *succ_reads, unsigned nr_succ_reads)
{
        unsigned reads = 0;

        for (unsigned i = 0; i < nr_succ_reads; ++i) {
                bool unique = true;

                for (unsigned j = 0; j < i; ++j)
                        if (bi_is_word_equiv(succ_reads[i], succ_reads[j]))
                                unique = false;

                if (!unique)
                        continue;

                if (bi_is_word_equiv(succ_reads[i], t0))
                        continue;

                if (bi_is_word_equiv(succ_reads[i], t1))
                        continue;

                reads++;
        }

        return reads;
}

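/* Example (hypothetical registers): with succ_reads = { r0, r0, r2, r4 } and
 * the predecessor tuple writing t0 = r2 and t1 = r4, only the unique word r0
 * counts, so bi_count_succ_reads returns 1 -- r2 and r4 can ride the
 * intertuple passthroughs instead of consuming read slots. */
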
/* Not all instructions can read from the staging passthrough (as determined by
 * reads_t), check if a given pair of instructions has such a restriction. Note
 * we also use this mechanism to prevent data races around staging register
 * reads, so we allow the input source to potentially be vector-valued */

static bool
bi_has_staging_passthrough_hazard(bi_index fma, bi_instr *add)
{
        bi_foreach_src(add, s) {
                bi_index src = add->src[s];

                if (src.type != BI_INDEX_REGISTER)
                        continue;

                unsigned count = bi_count_read_registers(add, s);
                bool read = false;

                for (unsigned d = 0; d < count; ++d)
                        read |= bi_is_equiv(fma, bi_register(src.value + d));

                if (read && !bi_reads_t(add, s))
                        return true;
        }

        return false;
}

/* Likewise for cross-tuple passthrough (reads_temps) */

static bool
bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
{
        bi_foreach_instr_in_tuple(succ, pins) {
                bi_foreach_src(pins, s) {
                        if (bi_is_word_equiv(ins->dest[0], pins->src[s]) &&
                                        !bi_reads_temps(pins, s))
                                return true;
                }
        }

        return false;
}

/* Is a register written other than the staging mechanism? ATEST is special,
 * writing to both a staging register and a regular register (fixed packing).
 * BLEND is special since it has to write r48 the normal way even if it never
 * gets read. This depends on liveness analysis, as a register is not needed
 * for a write that will be discarded after one tuple. */

static unsigned
bi_write_count(bi_instr *instr, uint64_t live_after_temp)
{
        if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
                return 1;

        unsigned count = 0;

        bi_foreach_dest(instr, d) {
                if (d == 0 && bi_opcode_props[instr->op].sr_write)
                        continue;

                if (bi_is_null(instr->dest[d]))
                        continue;

                assert(instr->dest[0].type == BI_INDEX_REGISTER);
                if (live_after_temp & BITFIELD64_BIT(instr->dest[0].value))
                        count++;
        }

        return count;
}

/*
 * Test if an instruction requires flush-to-zero mode. Currently only supported
 * for f16<-->f32 conversions to implement fquantize16.
 */
static bool
bi_needs_ftz(bi_instr *I)
{
        return (I->op == BI_OPCODE_F16_TO_F32 ||
                I->op == BI_OPCODE_V2F32_TO_V2F16) && I->ftz;
}

/*
 * Test if an instruction would be numerically incompatible with the clause. At
 * present we only consider flush-to-zero modes.
 */
static bool
bi_numerically_incompatible(struct bi_clause_state *clause, bi_instr *instr)
{
        return (clause->ftz != BI_FTZ_STATE_NONE) &&
               ((clause->ftz == BI_FTZ_STATE_ENABLE) != bi_needs_ftz(instr));
}

/* Instruction placement entails two questions: what subset of instructions in
 * the block can legally be scheduled? and of those which is the best? That is,
 * we seek to minimize a cost function on a subset of the worklist satisfying a
 * particular predicate. The necessary predicate is determined entirely by
 * Bifrost's architectural limitations and is described in the accompanying
 * whitepaper. The cost function is a heuristic. */

static bool
bi_instr_schedulable(bi_instr *instr,
                struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                uint64_t live_after_temp,
                bool fma)
{
        /* The units must match */
        if ((fma && !bi_can_fma(instr)) || (!fma && !bi_can_add(instr)))
                return false;

        /* There can only be one message-passing instruction per clause */
        if (bi_must_message(instr) && clause->message)
                return false;

        /* Some instructions have placement requirements */
        if (bi_opcode_props[instr->op].last && !tuple->last)
                return false;

        if (bi_must_not_last(instr) && tuple->last)
                return false;

        /* Numerical properties must be compatible with the clause */
        if (bi_numerically_incompatible(clause, instr))
                return false;

        /* Message-passing instructions are not guaranteed to write within the
         * same clause (most likely they will not), so if a later instruction
         * in the clause accesses the destination, the message-passing
         * instruction can't be scheduled */
        if (bi_opcode_props[instr->op].sr_write) {
                bi_foreach_dest(instr, d) {
                        if (bi_is_null(instr->dest[d]))
                                continue;

                        unsigned nr = bi_count_write_registers(instr, d);
                        assert(instr->dest[d].type == BI_INDEX_REGISTER);
                        unsigned reg = instr->dest[d].value;

                        for (unsigned i = 0; i < clause->access_count; ++i) {
                                bi_index idx = clause->accesses[i];
                                for (unsigned d = 0; d < nr; ++d) {
                                        if (bi_is_equiv(bi_register(reg + d), idx))
                                                return false;
                                }
                        }
                }
        }

        if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) {
                unsigned nr = bi_count_read_registers(instr, 0);
                assert(instr->src[0].type == BI_INDEX_REGISTER);
                unsigned reg = instr->src[0].value;

                for (unsigned i = 0; i < clause->access_count; ++i) {
                        bi_index idx = clause->accesses[i];
                        for (unsigned d = 0; d < nr; ++d) {
                                if (bi_is_equiv(bi_register(reg + d), idx))
                                        return false;
                        }
                }
        }

        /* If FAU is already assigned, we may not disrupt that. Do a
         * non-disruptive test update */
        if (!bi_update_fau(clause, tuple, instr, fma, false))
                return false;

        /* If this choice of FMA would force a staging passthrough, the ADD
         * instruction must support such a passthrough */
        if (tuple->add && bi_has_staging_passthrough_hazard(instr->dest[0], tuple->add))
                return false;

        /* If this choice of destination would force a cross-tuple passthrough,
         * the next tuple must support that */
        if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr))
                return false;

        /* Register file writes are limited */
        unsigned total_writes = tuple->reg.nr_writes;
        total_writes += bi_write_count(instr, live_after_temp);

        /* Last tuple in a clause can only write a single value */
        if (tuple->last && total_writes > 1)
                return false;

        /* Register file reads are limited, so count unique */

        unsigned unique_new_srcs = 0;

        bi_foreach_src(instr, s) {
                if (bi_tuple_is_new_src(instr, &tuple->reg, s))
                        unique_new_srcs++;
        }

        unsigned total_srcs = tuple->reg.nr_reads + unique_new_srcs;

        bool can_spill_to_moves = (!tuple->add);
        can_spill_to_moves &= (bi_nconstants(clause) < 13 - (clause->tuple_count + 2));
        can_spill_to_moves &= (clause->tuple_count < 7);

        /* However, we can get an extra 1 or 2 sources by inserting moves */
        if (total_srcs > (can_spill_to_moves ? 4 : 3))
                return false;

        /* Count effective reads for the successor */
        unsigned succ_reads = bi_count_succ_reads(instr->dest[0],
                        tuple->add ? tuple->add->dest[0] : bi_null(),
                        tuple->prev_reads, tuple->nr_prev_reads);

        /* Successor must satisfy R+W <= 4, so we require W <= 4-R */
        if ((signed) total_writes > (4 - (signed) succ_reads))
                return false;

        return true;
}

static signed
bi_instr_cost(bi_instr *instr, struct bi_tuple_state *tuple)
{
        signed cost = 0;

        /* Instructions that can schedule to either FMA or to ADD should be
         * deprioritized since they're easier to reschedule elsewhere */
        if (bi_can_fma(instr) && bi_can_add(instr))
                cost++;

        /* Message-passing instructions impose constraints on the registers
         * later in the clause, so schedule them as late within a clause as
         * possible (<==> prioritize them since we're backwards <==> decrease
         * cost) */
        if (bi_must_message(instr))
                cost--;

        /* Last instructions are big constraints (XXX: no effect on shader-db) */
        if (bi_opcode_props[instr->op].last)
                cost -= 2;

        return cost;
}

static unsigned
bi_choose_index(struct bi_worklist st,
                struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                uint64_t live_after_temp,
                bool fma)
{
        unsigned i, best_idx = ~0;
        signed best_cost = INT_MAX;

        BITSET_FOREACH_SET(i, st.worklist, st.count) {
                bi_instr *instr = st.instructions[i];

                if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma))
                        continue;

                signed cost = bi_instr_cost(instr, tuple);

                /* Tie break in favour of later instructions, under the
                 * assumption this promotes temporary usage (reducing pressure
                 * on the register file). This is a side effect of a prepass
                 * scheduling for pressure. */

                if (cost <= best_cost) {
                        best_idx = i;
                        best_cost = cost;
                }
        }

        return best_idx;
}

static void
bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
                bi_instr *instr, uint64_t live_after_temp, bool fma)
{
        bi_update_fau(clause, tuple, instr, fma, true);

        /* TODO: maybe opt a bit? or maybe doesn't matter */
        assert(clause->access_count + BI_MAX_SRCS + BI_MAX_DESTS <= ARRAY_SIZE(clause->accesses));
        memcpy(clause->accesses + clause->access_count, instr->src, sizeof(instr->src));
        clause->access_count += BI_MAX_SRCS;
        memcpy(clause->accesses + clause->access_count, instr->dest, sizeof(instr->dest));
        clause->access_count += BI_MAX_DESTS;
        tuple->reg.nr_writes += bi_write_count(instr, live_after_temp);

        bi_foreach_src(instr, s) {
                if (bi_tuple_is_new_src(instr, &tuple->reg, s))
                        tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s];
        }

        /* This could be optimized to allow pairing integer instructions with
         * special flush-to-zero instructions, but punting on this until we
         * have a workload that cares.
         */
        clause->ftz = bi_needs_ftz(instr) ? BI_FTZ_STATE_ENABLE :
                BI_FTZ_STATE_DISABLE;
}

/* Choose the best instruction and pop it off the worklist. Returns NULL if no
 * instruction is available. This function is destructive. */

static bi_instr *
bi_take_instr(bi_context *ctx, struct bi_worklist st,
                struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                uint64_t live_after_temp,
                bool fma)
{
        if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE)
                return bi_lower_cubeface(ctx, clause, tuple);
        else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM_RETURN_I32)
                return bi_lower_atom_c(ctx, clause, tuple);
        else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM1_RETURN_I32)
                return bi_lower_atom_c1(ctx, clause, tuple);
        else if (tuple->add && tuple->add->op == BI_OPCODE_SEG_ADD_I64)
                return bi_lower_seg_add(ctx, clause, tuple);
        else if (tuple->add && tuple->add->table)
                return bi_lower_dtsel(ctx, clause, tuple);

        /* TODO: Optimize these moves */
        if (!fma && tuple->nr_prev_reads > 3) {
                /* Only spill by one source for now */
                assert(tuple->nr_prev_reads == 4);

                /* Pick a source to spill */
                bi_index src = tuple->prev_reads[0];

                /* Schedule the spill */
                bi_builder b = bi_init_builder(ctx, bi_before_tuple(tuple->prev));
                bi_instr *mov = bi_mov_i32_to(&b, src, src);
                bi_pop_instr(clause, tuple, mov, live_after_temp, fma);
                return mov;
        }

#ifndef NDEBUG
        /* Don't pair instructions if debugging */
        if ((bifrost_debug & BIFROST_DBG_NOSCHED) && tuple->add)
                return NULL;
#endif

        unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);

        if (idx >= st.count)
                return NULL;

        /* Update state to reflect taking the instruction */
        bi_instr *instr = st.instructions[idx];

        BITSET_CLEAR(st.worklist, idx);
        bi_update_worklist(st, idx);
        bi_pop_instr(clause, tuple, instr, live_after_temp, fma);

        /* Fixups */
        if (instr->op == BI_OPCODE_IADD_U32 && fma) {
                assert(bi_can_iaddc(instr));
                instr->op = BI_OPCODE_IADDC_I32;
                instr->src[2] = bi_zero();
        } else if (fma && bi_can_replace_with_csel(instr)) {
                bi_replace_mux_with_csel(instr, false);
        }

        return instr;
}

/* Variant of bi_rewrite_index_src_single that uses word-equivalence, rewriting
 * to a passthrough register. If except_sr is true, the staging sources are
 * skipped, so staging register reads are not accidentally encoded as
 * passthrough (which is impossible) */

static void
bi_use_passthrough(bi_instr *ins, bi_index old,
                enum bifrost_packed_src new,
                bool except_sr)
{
        /* Optional for convenience */
        if (!ins || bi_is_null(old))
                return;

        bi_foreach_src(ins, i) {
                if ((i == 0 || i == 4) && except_sr)
                        continue;

                if (bi_is_word_equiv(ins->src[i], old)) {
                        ins->src[i].type = BI_INDEX_PASS;
                        ins->src[i].value = new;
                        ins->src[i].reg = false;
                        ins->src[i].offset = 0;
                }
        }
}

/* Rewrites an adjacent pair of tuples _prec_eding and _succ_eding to use
 * intertuple passthroughs where necessary. Passthroughs are allowed as a
 * post-condition of scheduling. Note we rewrite ADD first, FMA second --
 * opposite the order of execution. This is deliberate -- if both FMA and ADD
 * write to the same logical register, the next executed tuple will get the
 * latter result. There's no interference issue under the assumption of correct
 * register allocation. */

static void
bi_rewrite_passthrough(bi_tuple prec, bi_tuple succ)
{
        bool sr_read = succ.add ? bi_opcode_props[succ.add->op].sr_read : false;

        if (prec.add) {
                bi_use_passthrough(succ.fma, prec.add->dest[0], BIFROST_SRC_PASS_ADD, false);
                bi_use_passthrough(succ.add, prec.add->dest[0], BIFROST_SRC_PASS_ADD, sr_read);
        }

        if (prec.fma) {
                bi_use_passthrough(succ.fma, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, false);
                bi_use_passthrough(succ.add, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, sr_read);
        }
}

static void
bi_rewrite_fau_to_pass(bi_tuple *tuple)
{
        bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
                if (ins->src[s].type != BI_INDEX_FAU) continue;

                bi_index pass = bi_passthrough(ins->src[s].offset ?
                                BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO);

                ins->src[s] = bi_replace_index(ins->src[s], pass);
        }
}

static void
bi_rewrite_zero(bi_instr *ins, bool fma)
{
        bi_index zero = bi_passthrough(fma ? BIFROST_SRC_STAGE : BIFROST_SRC_FAU_LO);

        bi_foreach_src(ins, s) {
                bi_index src = ins->src[s];

                if (src.type == BI_INDEX_CONSTANT && src.value == 0)
                        ins->src[s] = bi_replace_index(src, zero);
        }
}

/* Assumes #0 to {T, FAU} rewrite has already occurred */

static void
bi_rewrite_constants_to_pass(bi_tuple *tuple, uint64_t constant, bool pcrel)
{
        bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
                if (ins->src[s].type != BI_INDEX_CONSTANT) continue;

                uint32_t cons = ins->src[s].value;

                ASSERTED bool lo = (cons == (constant & 0xffffffff));
                bool hi = (cons == (constant >> 32ull));

                /* PC offsets always live in the upper half, set to zero by
                 * convention before pack time. (This is safe, since if you
                 * wanted to compare against zero, you would use a BRANCHZ
                 * instruction instead.) */
                if (cons == 0 && ins->branch_target != NULL) {
                        assert(pcrel);
                        hi = true;
                        lo = false;
                } else if (pcrel) {
                        hi = false;
                }

                assert(lo || hi);

                ins->src[s] = bi_replace_index(ins->src[s],
                                bi_passthrough(hi ? BIFROST_SRC_FAU_HI :
                                        BIFROST_SRC_FAU_LO));
        }
}

/* Constructs a constant state given a tuple state. This has the
 * postcondition that pcrel applies to the first constant by convention,
 * and PC-relative constants will be #0 by convention here, so swap to
 * match if needed */

static struct bi_const_state
bi_get_const_state(struct bi_tuple_state *tuple)
{
        struct bi_const_state consts = {
                .constant_count = tuple->constant_count,
                .constants[0] = tuple->constants[0],
                .constants[1] = tuple->constants[1],
                .pcrel = tuple->add && tuple->add->branch_target,
        };

        /* pcrel applies to the first constant by convention, and
         * PC-relative constants will be #0 by convention here, so swap
         * to match if needed */
        if (consts.pcrel && consts.constants[0]) {
                assert(consts.constant_count == 2);
                assert(consts.constants[1] == 0);

                consts.constants[1] = consts.constants[0];
                consts.constants[0] = 0;
        }

        return consts;
}

/* Merges constants in a clause, satisfying the following rules, assuming no
 * more than one tuple has pcrel:
 *
 * 1. If a tuple has two constants, they must be packed together. If one is
 * pcrel, it must be the high constant to use the M1=4 modification [sx64(E0) +
 * (PC << 32)]. Otherwise choose an arbitrary order.
 *
 * 2. If a tuple has one constant, it may be shared with an existing
 * pair that already contains that constant, or it may be combined with another
 * (distinct) tuple of a single constant.
 *
 * This guarantees a packing is possible. The next routine handles modification
 * related swapping, to satisfy format 12 and the lack of modification for
 * tuple count 5/8 in EC0.
 */

static uint64_t
bi_merge_u32(uint32_t c0, uint32_t c1, bool pcrel)
{
        /* At this point in the constant merge algorithm, pcrel constants are
         * treated as zero, so pcrel implies at least one constant is zero */
        assert(!pcrel || (c0 == 0 || c1 == 0));

        /* Order: pcrel, maximum non-pcrel, minimum non-pcrel */
        uint32_t hi = pcrel ? 0 : MAX2(c0, c1);
        uint32_t lo = (c0 == hi) ? c1 : c0;

        /* Merge in the selected order */
        return lo | (((uint64_t) hi) << 32ull);
}

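/* Worked example: bi_merge_u32(0x10, 0xFF, false) picks hi = 0xFF, lo = 0x10
 * and returns 0x000000FF00000010; with pcrel = true, the zero (PC-relative)
 * constant is forced into the high word, as required for the M1=4
 * modification. */
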
static unsigned
bi_merge_pairs(struct bi_const_state *consts, unsigned tuple_count,
                uint64_t *merged, unsigned *pcrel_pair)
{
        unsigned merge_count = 0;

        for (unsigned t = 0; t < tuple_count; ++t) {
                if (consts[t].constant_count != 2) continue;

                unsigned idx = ~0;
                uint64_t val = bi_merge_u32(consts[t].constants[0],
                                consts[t].constants[1], consts[t].pcrel);

                /* Skip the pcrel pair if assigned, because if one is assigned,
                 * this one is not pcrel by uniqueness so it's a mismatch */
                for (unsigned s = 0; s < merge_count; ++s) {
                        if (merged[s] == val && (*pcrel_pair) != s) {
                                idx = s;
                                break;
                        }
                }

                if (idx == ~0) {
                        idx = merge_count++;
                        merged[idx] = val;

                        if (consts[t].pcrel)
                                (*pcrel_pair) = idx;
                }

                consts[t].word_idx = idx;
        }

        return merge_count;
}

static unsigned
bi_merge_singles(struct bi_const_state *consts, unsigned tuple_count,
                uint64_t *pairs, unsigned pair_count, unsigned *pcrel_pair)
{
        bool pending = false, pending_pcrel = false;
        uint32_t pending_single = 0;

        for (unsigned t = 0; t < tuple_count; ++t) {
                if (consts[t].constant_count != 1) continue;

                uint32_t val = consts[t].constants[0];
                unsigned idx = ~0;

                /* Try to match, but don't match pcrel with non-pcrel, even
                 * though we can merge a pcrel with a non-pcrel single */
                for (unsigned i = 0; i < pair_count; ++i) {
                        bool lo = ((pairs[i] & 0xffffffff) == val);
                        bool hi = ((pairs[i] >> 32) == val);
                        bool match = (lo || hi);
                        match &= ((*pcrel_pair) != i);
                        if (match && !consts[t].pcrel) {
                                idx = i;
                                break;
                        }
                }

                if (idx == ~0) {
                        idx = pair_count;

                        if (pending && pending_single != val) {
                                assert(!(pending_pcrel && consts[t].pcrel));
                                bool pcrel = pending_pcrel || consts[t].pcrel;

                                if (pcrel)
                                        *pcrel_pair = idx;

                                pairs[pair_count++] = bi_merge_u32(pending_single, val, pcrel);

                                pending = pending_pcrel = false;
                        } else {
                                pending = true;
                                pending_pcrel = consts[t].pcrel;
                                pending_single = val;
                        }
                }

                consts[t].word_idx = idx;
        }

        /* Shift so it works whether pending_pcrel is set or not */
        if (pending) {
                if (pending_pcrel)
                        *pcrel_pair = pair_count;

                pairs[pair_count++] = ((uint64_t) pending_single) << 32ull;
        }

        return pair_count;
}

static unsigned
bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned *pcrel_idx)
{
        unsigned pair_count = bi_merge_pairs(consts, 8, pairs, pcrel_idx);
        return bi_merge_singles(consts, 8, pairs, pair_count, pcrel_idx);
}

/* Swap two constants at word i and i+1 by swapping their actual positions and
 * swapping all references so the meaning of the clause is preserved */

static void
bi_swap_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned i)
{
        uint64_t tmp_pair = pairs[i + 0];
        pairs[i + 0] = pairs[i + 1];
        pairs[i + 1] = tmp_pair;

        for (unsigned t = 0; t < 8; ++t) {
                if (consts[t].word_idx == i)
                        consts[t].word_idx = (i + 1);
                else if (consts[t].word_idx == (i + 1))
                        consts[t].word_idx = i;
        }
}

/* Given merged constants, one of which might be PC-relative, fix up the M
 * values so the PC-relative constant (if it exists) has the M1=4 modification
 * and other constants are used as-is (which might require swapping) */

static unsigned
bi_apply_constant_modifiers(struct bi_const_state *consts,
                uint64_t *pairs, unsigned *pcrel_idx,
                unsigned tuple_count, unsigned constant_count)
{
        unsigned start = bi_ec0_packed(tuple_count) ? 1 : 0;

        /* Clauses with these tuple counts lack an M field for the packed EC0,
         * so EC0 cannot be PC-relative, which might require swapping (and
         * possibly adding an unused constant) to fit */

        if (*pcrel_idx == 0 && (tuple_count == 5 || tuple_count == 8)) {
                constant_count = MAX2(constant_count, 2);
                *pcrel_idx = 1;
                bi_swap_constants(consts, pairs, 0);
        }

        /* EC0 might be packed free, after that constants are packed in pairs
         * (with clause format 12), with M1 values computed from the pair */

        for (unsigned i = start; i < constant_count; i += 2) {
                bool swap = false;
                bool last = (i + 1) == constant_count;

                unsigned A1 = (pairs[i] >> 60);
                unsigned B1 = (pairs[i + 1] >> 60);

                if (*pcrel_idx == i || *pcrel_idx == (i + 1)) {
                        /* PC-relative constant must be E0, not E1 */
                        swap = (*pcrel_idx == (i + 1));

                        /* Set M1 = 4 by noting (A - B) mod 16 = 4 is
                         * equivalent to A = (B + 4) mod 16 and that we can
                         * control A */
                        unsigned B = swap ? A1 : B1;
                        unsigned A = (B + 4) & 0xF;
                        pairs[*pcrel_idx] |= ((uint64_t) A) << 60;

                        /* Swapped if swap set, identity if swap not set */
                        *pcrel_idx = i;
                } else {
                        /* Compute M1 value if we don't swap */
                        unsigned M1 = (16 + A1 - B1) & 0xF;

                        /* For M1 = 0 or M1 >= 8, the constants are unchanged;
                         * otherwise we have 0 < (A1 - B1) % 16 < 8, which
                         * implies (B1 - A1) % 16 >= 8, so swapping will let
                         * them be used unchanged */
                        swap = (M1 != 0) && (M1 < 8);

                        /* However, we can't swap the last constant, so we
                         * force M1 = 0 instead for this case */
                        if (last && swap) {
                                pairs[i + 1] |= pairs[i] & (0xfull << 60);
                                swap = false;
                        }
                }

                if (swap) {
                        assert(!last);
                        bi_swap_constants(consts, pairs, i);
                }
        }

        return constant_count;
}

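/* Worked example of the M1 arithmetic above (hypothetical top nibbles): for
 * A1 = 0x3, B1 = 0x9 we get M1 = (16 + 3 - 9) & 0xF = 10 >= 8, so the pair is
 * used unchanged; for A1 = 0x9, B1 = 0x3, M1 = 6 falls in the illegal 1..7
 * range and the constants are swapped (the final pair cannot be swapped, so
 * M1 is forced to 0 there instead). */
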
|
|
|
|
/* Schedule a single clause. If no instructions remain, return NULL. */
|
|
|
|
static bi_clause *
|
|
bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live)
|
|
{
|
|
struct bi_clause_state clause_state = { 0 };
|
|
bi_clause *clause = rzalloc(ctx, bi_clause);
|
|
bi_tuple *tuple = NULL;
|
|
|
|
const unsigned max_tuples = ARRAY_SIZE(clause->tuples);
|
|
|
|
/* TODO: Decide flow control better */
|
|
clause->flow_control = BIFROST_FLOW_NBTB;
|
|
|
|
/* The last clause can only write one instruction, so initialize that */
|
|
struct bi_reg_state reg_state = {};
|
|
bi_index prev_reads[5] = { bi_null() };
|
|
unsigned nr_prev_reads = 0;
|
|
|
|
/* We need to track future liveness. The main *live set tracks what is
|
|
* live at the current point int he program we are scheduling, but to
|
|
* determine temp eligibility, we instead want what will be live after
|
|
* the next tuple in the program. If you scheduled forwards, you'd need
|
|
* a crystall ball for this. Luckily we schedule backwards, so we just
|
|
* delay updates to the live_after_temp by an extra tuple. */
|
|
uint64_t live_after_temp = *live;
|
|
uint64_t live_next_tuple = live_after_temp;
|
|
|
|
do {
|
|
struct bi_tuple_state tuple_state = {
|
|
.last = (clause->tuple_count == 0),
|
|
.reg = reg_state,
|
|
.nr_prev_reads = nr_prev_reads,
|
|
.prev = tuple,
|
|
.pcrel_idx = ~0,
|
|
};
|
|
|
|
assert(nr_prev_reads < ARRAY_SIZE(prev_reads));
|
|
memcpy(tuple_state.prev_reads, prev_reads, sizeof(prev_reads));
|
|
|
|
unsigned idx = max_tuples - clause->tuple_count - 1;
|
|
|
|
tuple = &clause->tuples[idx];
|
|
|
|
if (clause->message && bi_opcode_props[clause->message->op].sr_read && !bi_is_null(clause->message->src[0])) {
|
|
unsigned nr = bi_count_read_registers(clause->message, 0);
|
|
live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value);
|
|
}
|
|
|
|
/* Since we schedule backwards, we schedule ADD first */
|
|
tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false);
|
|
tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true);
|
|
tuple->add = tuple_state.add;
|
|
|
|
/* Update liveness from the new instructions */
|
|
if (tuple->add)
|
|
*live = bi_postra_liveness_ins(*live, tuple->add);
|
|
|
|
if (tuple->fma)
|
|
*live = bi_postra_liveness_ins(*live, tuple->fma);
|
|
|
|
/* Rotate in the new per-tuple liveness */
|
|
live_after_temp = live_next_tuple;
|
|
live_next_tuple = *live;
|
|
|
|
/* We may have a message, but only one per clause */
|
|
if (tuple->add && bi_must_message(tuple->add)) {
|
|
assert(!clause_state.message);
|
|
clause_state.message = true;
|
|
|
|
clause->message_type =
|
|
bi_message_type_for_instr(tuple->add);
|
|
clause->message = tuple->add;
|
|
|
|
/* We don't need to set dependencies for blend shaders
|
|
* because the BLEND instruction in the fragment
|
|
* shader should have already done the wait */
|
|
if (!ctx->inputs->is_blend) {
|
|
switch (tuple->add->op) {
|
|
case BI_OPCODE_ATEST:
|
|
clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
|
|
break;
|
|
case BI_OPCODE_LD_TILE:
|
|
case BI_OPCODE_ST_TILE:
|
|
clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
|
|
break;
|
|
case BI_OPCODE_BLEND:
|
|
clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
|
|
clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
clause_state.consts[idx] = bi_get_const_state(&tuple_state);
|
|
|
|
/* Before merging constants, eliminate zeroes, otherwise the
|
|
* merging will fight over the #0 that never gets read (and is
|
|
* never marked as read by update_fau) */
|
|
if (tuple->fma && bi_reads_zero(tuple->fma))
|
|
bi_rewrite_zero(tuple->fma, true);
|
|
|
|
/* Rewrite away FAU, constant write is deferred */
|
|
if (!tuple_state.constant_count) {
|
|
tuple->fau_idx = tuple_state.fau;
|
|
bi_rewrite_fau_to_pass(tuple);
|
|
}
|
|
|
|
/* Use passthrough register for cross-stage accesses. Since
|
|
* there are just FMA and ADD stages, that means we rewrite to
|
|
* passthrough the sources of the ADD that read from the
|
|
* destination of the FMA */
|
|
|
|
if (tuple->fma) {
|
|
bi_use_passthrough(tuple->add, tuple->fma->dest[0],
|
|
BIFROST_SRC_STAGE, false);
|
|
}
|
|
|
|
/* Don't add an empty tuple, unless the worklist has nothing
|
|
* but a (pseudo)instruction failing to schedule due to a "not
|
|
* last instruction" constraint */
|
|
|
|
int some_instruction = __bitset_ffs(st.worklist, BITSET_WORDS(st.count));
|
|
bool not_last = (some_instruction > 0) &&
|
|
bi_must_not_last(st.instructions[some_instruction - 1]);
|
|
|
|
bool insert_empty = tuple_state.last && not_last;
|
|
|
|
if (!(tuple->fma || tuple->add || insert_empty))
|
|
break;
|
|
|
|
clause->tuple_count++;
|
|
|
|
/* Adding enough tuple might overflow constants */
|
|
if (!bi_space_for_more_constants(&clause_state))
|
|
break;
|
|
|
|
#ifndef NDEBUG
|
|
/* Don't schedule more than 1 tuple if debugging */
|
|
if ((bifrost_debug & BIFROST_DBG_NOSCHED) && !insert_empty)
|
|
break;
|
|
#endif
|
|
|
|
/* Link through the register state */
|
|
STATIC_ASSERT(sizeof(prev_reads) == sizeof(tuple_state.reg.reads));
|
|
memcpy(prev_reads, tuple_state.reg.reads, sizeof(prev_reads));
|
|
nr_prev_reads = tuple_state.reg.nr_reads;
|
|
clause_state.tuple_count++;
|
|
} while(clause->tuple_count < 8);
|
|
|
|

        /* Don't schedule an empty clause */
        if (!clause->tuple_count)
                return NULL;

        /* Before merging, rewrite away any tuples that read only zero */
        for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
                bi_tuple *tuple = &clause->tuples[i];
                struct bi_const_state *st = &clause_state.consts[i];

                if (st->constant_count == 0 || st->constants[0] || st->constants[1] || st->pcrel)
                        continue;

                bi_foreach_instr_in_tuple(tuple, ins)
                        bi_rewrite_zero(ins, false);

                /* Constant has been demoted to FAU, so don't pack it separately */
                st->constant_count = 0;

                /* The FAU index should still be at its default, the zero slot */
                assert(tuple->fau_idx == BIR_FAU_ZERO);
        }
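
        /* Merge the per-tuple constants into the clause's 64-bit constant
         * pairs, tracking which pair, if any, holds the PC-relative constant
         * used for branch targets. */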

        uint64_t constant_pairs[8] = { 0 };
        unsigned pcrel_idx = ~0;
        unsigned constant_words =
                bi_merge_constants(clause_state.consts, constant_pairs, &pcrel_idx);

        constant_words = bi_apply_constant_modifiers(clause_state.consts,
                        constant_pairs, &pcrel_idx, clause->tuple_count,
                        constant_words);

        clause->pcrel_idx = pcrel_idx;
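
        /* Point each constant-reading tuple at its merged pair: the FAU index
         * encodes which pair to read plus the low four bits of the constant
         * actually referenced. */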

        for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
                bi_tuple *tuple = &clause->tuples[i];

                /* If no constants, leave FAU as it is, possibly defaulting to 0 */
                if (clause_state.consts[i].constant_count == 0)
                        continue;

                /* FAU is already handled */
                assert(!tuple->fau_idx);

                unsigned word_idx = clause_state.consts[i].word_idx;
                assert(word_idx <= 8);

                /* We could try to merge regardless of bottom bits as well, but
                 * that's probably diminishing returns */
                uint64_t pair = constant_pairs[word_idx];
                unsigned lo = pair & 0xF;

                tuple->fau_idx = bi_constant_field(word_idx) | lo;
                bi_rewrite_constants_to_pass(tuple, pair, word_idx == pcrel_idx);
        }

        clause->constant_count = constant_words;
        memcpy(clause->constants, constant_pairs, sizeof(constant_pairs));
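
        /* Prefetching the next sequential clause is presumably wasted after
         * an unconditional jump, so it is disabled when the clause ends in
         * JUMP. */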

        /* Branches must be last, so this can be factored out */
        bi_instr *last = clause->tuples[max_tuples - 1].add;
        clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
        clause->block = block;

        clause->ftz = (clause_state.ftz == BI_FTZ_STATE_ENABLE);

        /* We emitted in reverse, into the back of the tuples array, so move
         * the tuples up front for easy indexing */
        memmove(clause->tuples,
                        clause->tuples + (max_tuples - clause->tuple_count),
                        clause->tuple_count * sizeof(clause->tuples[0]));

        /* Use passthrough registers for cross-tuple accesses. Note this is
         * after the memmove, so we iterate forwards. Skip the first tuple
         * since there is nothing before it to pass through from */
        for (unsigned t = 1; t < clause->tuple_count; ++t)
                bi_rewrite_passthrough(clause->tuples[t - 1], clause->tuples[t]);

        return clause;
}
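
/* Schedule a single block: build clauses from the block's worklist until
 * every instruction is placed, then rewrite the block's instruction list to
 * match the clause order. */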

static void
bi_schedule_block(bi_context *ctx, bi_block *block)
{
        list_inithead(&block->clauses);

        /* Copy list to dynamic array */
        struct bi_worklist st = bi_initialize_worklist(block,
                        bifrost_debug & BIFROST_DBG_INORDER,
                        ctx->inputs->is_blend);

        if (!st.count) {
                bi_free_worklist(st);
                return;
        }

        /* We need to track liveness during scheduling in order to determine
         * whether we can use temporary (passthrough) registers */
        uint64_t live = block->reg_live_out;
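
        /* Clauses come back bottom-up (the clause holding the block's last
         * instructions is built first), so list_add prepends each one to
         * restore forward program order. */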

        /* Schedule as many clauses as needed to fill the block */
        bi_clause *u = NULL;
        while ((u = bi_schedule_clause(ctx, block, st, &live)))
                list_add(&u->link, &block->clauses);

        /* The back-to-back bit affects only the last clause of a block; for
         * the rest it is implicitly true */
        if (!list_is_empty(&block->clauses)) {
                bi_clause *last_clause = list_last_entry(&block->clauses, bi_clause, link);
                if (bi_reconverge_branches(block))
                        last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
        }

        /* Reorder instructions to match the new schedule. First remove
         * existing instructions and then recreate the list */
        bi_foreach_instr_in_block_safe(block, ins) {
                list_del(&ins->link);
        }

        bi_foreach_clause_in_block(block, clause) {
                for (unsigned i = 0; i < clause->tuple_count; ++i) {
                        bi_foreach_instr_in_tuple(&clause->tuples[i], ins) {
                                list_addtail(&ins->link, &block->instructions);
                        }
                }
        }

        block->scheduled = true;

#ifndef NDEBUG
        unsigned i;
        bool incomplete = false;

        BITSET_FOREACH_SET(i, st.worklist, st.count) {
                bi_print_instr(st.instructions[i], stderr);
                incomplete = true;
        }

        if (incomplete)
                unreachable("The above instructions failed to schedule.");
#endif

        bi_free_worklist(st);
}
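
/* Check whether source s of ins can be encoded alongside the constants and
 * FAU slot gathered from the instruction's other sources, recording any new
 * constant or FAU slot required. Returns false if the source must instead be
 * lowered to a register. */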

static bool
bi_check_fau_src(bi_instr *ins, unsigned s, uint32_t *constants, unsigned *cwords, bi_index *fau)
{
        bi_index src = ins->src[s];

        /* Staging registers can't have FAU accesses */
        if (bi_is_staging_src(ins, s))
                return (src.type != BI_INDEX_CONSTANT) && (src.type != BI_INDEX_FAU);

        if (src.type == BI_INDEX_CONSTANT) {
                /* Allow fast zero */
                if (src.value == 0 && bi_opcode_props[ins->op].fma && bi_reads_zero(ins))
                        return true;

                if (!bi_is_null(*fau))
                        return false;

                /* Else, try to inline a constant */
                for (unsigned i = 0; i < *cwords; ++i) {
                        if (src.value == constants[i])
                                return true;
                }

                if (*cwords >= 2)
                        return false;

                constants[(*cwords)++] = src.value;
        } else if (src.type == BI_INDEX_FAU) {
                if (*cwords != 0)
                        return false;

                /* Can only read from one pair of FAU words */
                if (!bi_is_null(*fau) && (src.value != fau->value))
                        return false;

                /* If there is a target, we'll need a PC-relative constant */
                if (ins->branch_target)
                        return false;

                *fau = src;
        }

        return true;
}
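
/* Lower sources that exceed the per-instruction constant/FAU budget by
 * copying them into registers with a MOV, so every source that remains is
 * encodable. */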

void
bi_lower_fau(bi_context *ctx)
{
        bi_foreach_instr_global_safe(ctx, ins) {
                bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));

                uint32_t constants[2];
                unsigned cwords = 0;
                bi_index fau = bi_null();

                /* ATEST must have the ATEST datum encoded, not any other
                 * uniform. See to it this is the case. */
                if (ins->op == BI_OPCODE_ATEST)
                        fau = ins->src[2];

                bi_foreach_src(ins, s) {
                        if (bi_check_fau_src(ins, s, constants, &cwords, &fau))
                                continue;

                        bi_index copy = bi_mov_i32(&b, ins->src[s]);
                        ins->src[s] = bi_replace_index(ins->src[s], copy);
                }
        }
}

/* Only v7 allows specifying a dependency on the tilebuffer for the first
 * clause of a shader. v6 requires adding a NOP clause with the dependency. */

static void
bi_add_nop_for_atest(bi_context *ctx)
{
        /* Only needed on v6 */
        if (ctx->arch >= 7)
                return;

        if (list_is_empty(&ctx->blocks))
                return;

        /* Fetch the first clause of the shader */
        bi_block *block = list_first_entry(&ctx->blocks, bi_block, link);
        bi_clause *clause = bi_next_clause(ctx, block, NULL);

        if (!clause || !(clause->dependencies & ((1 << BIFROST_SLOT_ELDEST_DEPTH) |
                                        (1 << BIFROST_SLOT_ELDEST_COLOUR))))
                return;

        /* Add a NOP so we can wait for the dependencies required by the first
         * clause */
        bi_instr *I = rzalloc(ctx, bi_instr);
        I->op = BI_OPCODE_NOP;
        I->dest[0] = bi_null();

        bi_clause *new_clause = ralloc(ctx, bi_clause);
        *new_clause = (bi_clause) {
                .flow_control = BIFROST_FLOW_NBTB,
                .next_clause_prefetch = true,
                .block = clause->block,

                .tuple_count = 1,
                .tuples[0] = { .fma = I, },
        };

        list_add(&new_clause->link, &clause->block->clauses);
}
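
/* Schedule a shader: pack each block's instructions into clauses, then run
 * post-RA dead code elimination and the v6 ATEST NOP workaround. */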

void
bi_schedule(bi_context *ctx)
{
        /* Liveness is fed into both scheduling and DCE */
        bi_postra_liveness(ctx);

        bi_foreach_block(ctx, block) {
                bi_schedule_block(ctx, block);
        }

        bi_opt_dce_post_ra(ctx);
        bi_add_nop_for_atest(ctx);
}