310 lines
11 KiB
C
310 lines
11 KiB
C
/*
|
|
* Copyright (C) 2020 Collabora Ltd.
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice (including the next
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
* Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
* SOFTWARE.
|
|
*
|
|
* Authors (Collabora):
|
|
* Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
|
|
*/
|
|
|
|
#include "compiler.h"
|
|
|
|
/* Assign dependency slots to each clause and calculate dependencies, This pass
|
|
* must be run after scheduling.
|
|
*
|
|
* 1. A clause that does not produce a message must use the sentinel slot #0
|
|
* 2a. A clause that depends on the results of a previous message-passing
|
|
* instruction must depend on that instruction's dependency slot, unless all
|
|
* reaching code paths already depended on it.
|
|
* 2b. More generally, any dependencies must be encoded. This includes
|
|
* Write-After-Write and Write-After-Read hazards with LOAD/STORE to memory.
|
|
* 3. The shader must wait on slot #6 before running BLEND, ATEST
|
|
* 4. The shader must wait on slot #7 before running BLEND, ST_TILE
|
|
* 5. ATEST, ZS_EMIT must be issued with slot #0
|
|
* 6. BARRIER must be issued with slot #7 and wait on every active slot.
|
|
* 7. Only slots #0 through #5 may be used for clauses not otherwise specified.
|
|
* 8. If a clause writes to a read staging register of an unresolved
|
|
* dependency, it must set a staging barrier.
|
|
*
|
|
* Note it _is_ legal to reuse slots for multiple message passing instructions
|
|
* with overlapping liveness, albeit with a slight performance penalty. As such
|
|
* the problem is significantly easier than register allocation, rather than
|
|
* spilling we may simply reuse slots. (TODO: does this have an optimal
|
|
* linear-time solution).
|
|
*
|
|
* Within these constraints we are free to assign slots as we like. This pass
|
|
* attempts to minimize stalls (TODO).
|
|
*/
|
|
|
|
#define BI_NUM_GENERAL_SLOTS 6
|
|
#define BI_NUM_SLOTS 8
|
|
#define BI_NUM_REGISTERS 64
|
|
#define BI_SLOT_SERIAL 0 /* arbitrary */
|
|
|
|
/*
|
|
* Due to the crude scoreboarding we do, we need to serialize varying loads and
|
|
* memory access. Identify these instructions here.
|
|
*/
|
|
static bool
|
|
bi_should_serialize(bi_instr *I)
|
|
{
|
|
/* For debug, serialize everything to disable scoreboard opts */
|
|
if (bifrost_debug & BIFROST_DBG_NOSB)
|
|
return true;
|
|
|
|
/* Although nominally on the attribute unit, image loads have the same
|
|
* coherency requirements as general memory loads. Serialize them for
|
|
* now until we can do something more clever.
|
|
*/
|
|
if (I->op == BI_OPCODE_LD_ATTR_TEX)
|
|
return true;
|
|
|
|
switch (bi_opcode_props[I->op].message) {
|
|
case BIFROST_MESSAGE_VARYING:
|
|
case BIFROST_MESSAGE_LOAD:
|
|
case BIFROST_MESSAGE_STORE:
|
|
case BIFROST_MESSAGE_ATOMIC:
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/* Given a scoreboard model, choose a slot for a clause wrapping a given
|
|
* message passing instruction. No side effects. */
|
|
|
|
static unsigned
|
|
bi_choose_scoreboard_slot(bi_instr *message)
|
|
{
|
|
/* ATEST, ZS_EMIT must be issued with slot #0 */
|
|
if (message->op == BI_OPCODE_ATEST || message->op == BI_OPCODE_ZS_EMIT)
|
|
return 0;
|
|
|
|
/* BARRIER must be issued with slot #7 */
|
|
if (message->op == BI_OPCODE_BARRIER)
|
|
return 7;
|
|
|
|
/* For now, make serialization is easy */
|
|
if (bi_should_serialize(message))
|
|
return BI_SLOT_SERIAL;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static uint64_t
|
|
bi_read_mask(bi_instr *I, bool staging_only)
|
|
{
|
|
uint64_t mask = 0;
|
|
|
|
if (staging_only && !bi_opcode_props[I->op].sr_read)
|
|
return mask;
|
|
|
|
bi_foreach_src(I, s) {
|
|
if (I->src[s].type == BI_INDEX_REGISTER) {
|
|
unsigned reg = I->src[s].value;
|
|
unsigned count = bi_count_read_registers(I, s);
|
|
|
|
mask |= (BITFIELD64_MASK(count) << reg);
|
|
}
|
|
|
|
if (staging_only)
|
|
break;
|
|
}
|
|
|
|
return mask;
|
|
}
|
|
|
|
static uint64_t
|
|
bi_write_mask(bi_instr *I)
|
|
{
|
|
uint64_t mask = 0;
|
|
|
|
bi_foreach_dest(I, d) {
|
|
if (bi_is_null(I->dest[d])) continue;
|
|
|
|
assert(I->dest[d].type == BI_INDEX_REGISTER);
|
|
|
|
unsigned reg = I->dest[d].value;
|
|
unsigned count = bi_count_write_registers(I, d);
|
|
|
|
mask |= (BITFIELD64_MASK(count) << reg);
|
|
}
|
|
|
|
/* Instructions like AXCHG.i32 unconditionally both read and write
|
|
* staging registers. Even if we discard the result, the write still
|
|
* happens logically and needs to be included in our calculations.
|
|
* Obscurely, ATOM_CX is sr_write but can ignore the staging register in
|
|
* certain circumstances; this does not require consideration.
|
|
*/
|
|
if (bi_opcode_props[I->op].sr_write && bi_is_null(I->dest[0]) &&
|
|
!bi_is_null(I->src[0])) {
|
|
|
|
unsigned reg = I->src[0].value;
|
|
unsigned count = bi_count_write_registers(I, 0);
|
|
|
|
mask |= (BITFIELD64_MASK(count) << reg);
|
|
}
|
|
|
|
return mask;
|
|
}
|
|
|
|
/* Update the scoreboard model to assign an instruction to a given slot */
|
|
|
|
static void
|
|
bi_push_clause(struct bi_scoreboard_state *st, bi_clause *clause)
|
|
{
|
|
bi_instr *I = clause->message;
|
|
unsigned slot = clause->scoreboard_id;
|
|
|
|
if (!I)
|
|
return;
|
|
|
|
st->read[slot] |= bi_read_mask(I, true);
|
|
|
|
if (bi_opcode_props[I->op].sr_write)
|
|
st->write[slot] |= bi_write_mask(I);
|
|
}
|
|
|
|
/* Adds a dependency on each slot writing any specified register */
|
|
|
|
static void
|
|
bi_depend_on_writers(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask)
|
|
{
|
|
for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) {
|
|
if (!(st->write[slot] & regmask))
|
|
continue;
|
|
|
|
st->write[slot] = 0;
|
|
st->read[slot] = 0;
|
|
|
|
clause->dependencies |= BITFIELD_BIT(slot);
|
|
}
|
|
}
|
|
|
|
static void
|
|
bi_set_staging_barrier(bi_clause *clause, struct bi_scoreboard_state *st, uint64_t regmask)
|
|
{
|
|
for (unsigned slot = 0; slot < ARRAY_SIZE(st->read); ++slot) {
|
|
if (!(st->read[slot] & regmask))
|
|
continue;
|
|
|
|
st->read[slot] = 0;
|
|
clause->staging_barrier = true;
|
|
}
|
|
}
|
|
|
|
/* Sets the dependencies for a given clause, updating the model */
|
|
|
|
static void
|
|
bi_set_dependencies(bi_block *block, bi_clause *clause, struct bi_scoreboard_state *st)
|
|
{
|
|
bi_foreach_instr_in_clause(block, clause, I) {
|
|
uint64_t read = bi_read_mask(I, false);
|
|
uint64_t written = bi_write_mask(I);
|
|
|
|
/* Read-after-write; write-after-write */
|
|
bi_depend_on_writers(clause, st, read | written);
|
|
|
|
/* Write-after-read */
|
|
bi_set_staging_barrier(clause, st, written);
|
|
}
|
|
|
|
/* LD_VAR instructions must be serialized per-quad. Just always depend
|
|
* on any LD_VAR instructions. This isn't optimal, but doing better
|
|
* requires divergence-aware data flow analysis.
|
|
*
|
|
* Similarly, memory loads/stores need to be synchronized. For now,
|
|
* force them to be serialized. This is not optimal.
|
|
*/
|
|
if (clause->message && bi_should_serialize(clause->message))
|
|
clause->dependencies |= BITFIELD_BIT(BI_SLOT_SERIAL);
|
|
|
|
/* Barriers must wait on all slots to flush existing work. It might be
|
|
* possible to skip this with more information about the barrier. For
|
|
* now, be conservative.
|
|
*/
|
|
if (clause->message && clause->message->op == BI_OPCODE_BARRIER)
|
|
clause->dependencies |= BITFIELD_MASK(BI_NUM_GENERAL_SLOTS);
|
|
}
|
|
|
|
static bool
|
|
scoreboard_block_update(bi_block *blk)
|
|
{
|
|
bool progress = false;
|
|
|
|
/* pending_in[s] = sum { p in pred[s] } ( pending_out[p] ) */
|
|
bi_foreach_predecessor(blk, pred) {
|
|
for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) {
|
|
blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i];
|
|
blk->scoreboard_in.write[i] |= (*pred)->scoreboard_out.write[i];
|
|
}
|
|
}
|
|
|
|
struct bi_scoreboard_state state = blk->scoreboard_in;
|
|
|
|
/* Assign locally */
|
|
|
|
bi_foreach_clause_in_block(blk, clause) {
|
|
bi_set_dependencies(blk, clause, &state);
|
|
bi_push_clause(&state, clause);
|
|
}
|
|
|
|
/* To figure out progress, diff scoreboard_out */
|
|
|
|
for (unsigned i = 0; i < BI_NUM_SLOTS; ++i)
|
|
progress |= !!memcmp(&state, &blk->scoreboard_out, sizeof(state));
|
|
|
|
blk->scoreboard_out = state;
|
|
|
|
return progress;
|
|
}
|
|
|
|
void
|
|
bi_assign_scoreboard(bi_context *ctx)
|
|
{
|
|
u_worklist worklist;
|
|
bi_worklist_init(ctx, &worklist);
|
|
|
|
/* First, assign slots. */
|
|
bi_foreach_block(ctx, block) {
|
|
bi_foreach_clause_in_block(block, clause) {
|
|
if (clause->message) {
|
|
unsigned slot = bi_choose_scoreboard_slot(clause->message);
|
|
clause->scoreboard_id = slot;
|
|
}
|
|
}
|
|
|
|
bi_worklist_push_tail(&worklist, block);
|
|
}
|
|
|
|
/* Next, perform forward data flow analysis to calculate dependencies */
|
|
while (!u_worklist_is_empty(&worklist)) {
|
|
/* Pop from the front for forward analysis */
|
|
bi_block *blk = bi_worklist_pop_head(&worklist);
|
|
|
|
if (scoreboard_block_update(blk)) {
|
|
bi_foreach_successor(blk, succ)
|
|
bi_worklist_push_tail(&worklist, succ);
|
|
}
|
|
}
|
|
|
|
u_worklist_fini(&worklist);
|
|
}
|