338 lines
12 KiB
C
338 lines
12 KiB
C
/*
|
|
* Copyright (C) 2019 Google, Inc.
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice (including the next
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
* Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
* SOFTWARE.
|
|
*
|
|
* Authors:
|
|
* Rob Clark <robclark@freedesktop.org>
|
|
*/
|
|
|
|
#include "ir3.h"
|
|
|
|
/* The maximum number of nop's we may need to insert between two instructions.
|
|
*/
|
|
#define MAX_NOPS 6
|
|
|
|
/*
|
|
* Helpers to figure out the necessary delay slots between instructions. Used
|
|
* both in scheduling pass(es) and the final pass to insert any required nop's
|
|
* so that the shader program is valid.
|
|
*
|
|
* Note that this needs to work both pre and post RA, so we can't assume ssa
|
|
* src iterators work.
|
|
*/
|
|
|
|
/* calculate required # of delay slots between the instruction that
|
|
* assigns a value and the one that consumes
|
|
*/
|
|
int
|
|
ir3_delayslots(struct ir3_instruction *assigner,
|
|
struct ir3_instruction *consumer, unsigned n, bool soft)
|
|
{
|
|
/* generally don't count false dependencies, since this can just be
|
|
* something like a barrier, or SSBO store.
|
|
*/
|
|
if (__is_false_dep(consumer, n))
|
|
return 0;
|
|
|
|
/* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
|
|
* alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
|
|
* handled with sync bits
|
|
*/
|
|
|
|
if (is_meta(assigner) || is_meta(consumer))
|
|
return 0;
|
|
|
|
if (writes_addr0(assigner) || writes_addr1(assigner))
|
|
return 6;
|
|
|
|
if (soft && is_ss_producer(assigner))
|
|
return soft_ss_delay(assigner);
|
|
|
|
/* handled via sync flags: */
|
|
if (is_ss_producer(assigner) || is_sy_producer(assigner))
|
|
return 0;
|
|
|
|
/* As far as we know, shader outputs don't need any delay. */
|
|
if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
|
|
return 0;
|
|
|
|
/* assigner must be alu: */
|
|
if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
|
|
is_mem(consumer)) {
|
|
return 6;
|
|
} else {
|
|
/* In mergedregs mode, there is an extra 2-cycle penalty when half of
|
|
* a full-reg is read as a half-reg or when a half-reg is read as a
|
|
* full-reg.
|
|
*/
|
|
bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
|
|
(consumer->srcs[n]->flags & IR3_REG_HALF);
|
|
unsigned penalty = mismatched_half ? 3 : 0;
|
|
if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 2)) {
|
|
/* special case, 3rd src to cat3 not required on first cycle */
|
|
return 1 + penalty;
|
|
} else {
|
|
return 3 + penalty;
|
|
}
|
|
}
|
|
}
|
|
|
|
static bool
|
|
count_instruction(struct ir3_instruction *n)
|
|
{
|
|
/* NOTE: don't count branch/jump since we don't know yet if they will
|
|
* be eliminated later in resolve_jumps().. really should do that
|
|
* earlier so we don't have this constraint.
|
|
*/
|
|
return is_alu(n) ||
|
|
(is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
|
|
}
|
|
|
|
/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
|
|
* and have to handle relative accesses specially.
|
|
*/
|
|
|
|
static unsigned
|
|
post_ra_reg_elems(struct ir3_register *reg)
|
|
{
|
|
if (reg->flags & IR3_REG_RELATIV)
|
|
return reg->size;
|
|
return reg_elems(reg);
|
|
}
|
|
|
|
static unsigned
|
|
post_ra_reg_num(struct ir3_register *reg)
|
|
{
|
|
if (reg->flags & IR3_REG_RELATIV)
|
|
return reg->array.base;
|
|
return reg->num;
|
|
}
|
|
|
|
unsigned
|
|
ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
|
|
struct ir3_instruction *consumer,
|
|
unsigned assigner_n, unsigned consumer_n)
|
|
{
|
|
unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, false);
|
|
|
|
struct ir3_register *src = consumer->srcs[consumer_n];
|
|
struct ir3_register *dst = assigner->dsts[assigner_n];
|
|
|
|
if (assigner->repeat == 0 && consumer->repeat == 0)
|
|
return delay;
|
|
|
|
unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
|
|
unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
|
|
|
|
/* If either side is a relative access, we can't really apply most of the
|
|
* reasoning below because we don't know which component aliases which.
|
|
* Just bail in this case.
|
|
*/
|
|
if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
|
|
return delay;
|
|
|
|
/* MOVMSK seems to require that all users wait until the entire
|
|
* instruction is finished, so just bail here.
|
|
*/
|
|
if (assigner->opc == OPC_MOVMSK)
|
|
return delay;
|
|
|
|
bool mismatched_half =
|
|
(src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
|
|
|
|
/* TODO: Handle the combination of (rpt) and different component sizes
|
|
* better like below. This complicates things significantly because the
|
|
* components don't line up.
|
|
*/
|
|
if (mismatched_half)
|
|
return delay;
|
|
|
|
/* If an instruction has a (rpt), then it acts as a sequence of
|
|
* instructions, reading its non-(r) sources at each cycle. First, get the
|
|
* register num for the first instruction where they interfere:
|
|
*/
|
|
|
|
unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);
|
|
|
|
/* Now, for that first conflicting half/full register, figure out the
|
|
* sub-instruction within assigner/consumer it corresponds to. For (r)
|
|
* sources, this should already return the correct answer of 0. However we
|
|
* have to special-case the multi-mov instructions, where the
|
|
* sub-instructions sometimes come from the src/dst indices instead.
|
|
*/
|
|
unsigned first_src_instr;
|
|
if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
|
|
first_src_instr = consumer_n;
|
|
else
|
|
first_src_instr = first_num - src->num;
|
|
|
|
unsigned first_dst_instr;
|
|
if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
|
|
first_dst_instr = assigner_n;
|
|
else
|
|
first_dst_instr = first_num - dst->num;
|
|
|
|
/* The delay we return is relative to the *end* of assigner and the
|
|
* *beginning* of consumer, because it's the number of nops (or other
|
|
* things) needed between them. Any instructions after first_dst_instr
|
|
* subtract from the delay, and so do any instructions before
|
|
* first_src_instr. Calculate an offset to subtract from the non-rpt-aware
|
|
* delay to account for that.
|
|
*
|
|
* Now, a priori, we need to go through this process for every
|
|
* conflicting regnum and take the minimum of the offsets to make sure
|
|
* that the appropriate number of nop's is inserted for every conflicting
|
|
* pair of sub-instructions. However, as we go to the next conflicting
|
|
* regnum (if any), the number of instructions after first_dst_instr
|
|
* decreases by 1 and the number of source instructions before
|
|
* first_src_instr correspondingly increases by 1, so the offset stays the
|
|
* same for all conflicting registers.
|
|
*/
|
|
unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
|
|
return offset > delay ? 0 : delay - offset;
|
|
}
|
|
|
|
static unsigned
|
|
delay_calc_srcn(struct ir3_instruction *assigner,
|
|
struct ir3_instruction *consumer, unsigned assigner_n,
|
|
unsigned consumer_n, bool mergedregs)
|
|
{
|
|
struct ir3_register *src = consumer->srcs[consumer_n];
|
|
struct ir3_register *dst = assigner->dsts[assigner_n];
|
|
bool mismatched_half =
|
|
(src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
|
|
|
|
/* In the mergedregs case or when the register is a special register,
|
|
* half-registers do not alias with full registers.
|
|
*/
|
|
if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
|
|
mismatched_half)
|
|
return 0;
|
|
|
|
unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
|
|
unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
|
|
unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
|
|
unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
|
|
|
|
if (dst_start >= src_end || src_start >= dst_end)
|
|
return 0;
|
|
|
|
return ir3_delayslots_with_repeat(assigner, consumer, assigner_n, consumer_n);
|
|
}
|
|
|
|
static unsigned
|
|
delay_calc(struct ir3_block *block, struct ir3_instruction *start,
|
|
struct ir3_instruction *consumer, unsigned distance,
|
|
regmask_t *in_mask, bool mergedregs)
|
|
{
|
|
regmask_t mask;
|
|
memcpy(&mask, in_mask, sizeof(mask));
|
|
|
|
unsigned delay = 0;
|
|
/* Search backwards starting at the instruction before start, unless it's
|
|
* NULL then search backwards from the block end.
|
|
*/
|
|
struct list_head *start_list =
|
|
start ? start->node.prev : block->instr_list.prev;
|
|
list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list,
|
|
&block->instr_list, node) {
|
|
if (count_instruction(assigner))
|
|
distance += assigner->nop;
|
|
|
|
if (distance + delay >= MAX_NOPS)
|
|
return delay;
|
|
|
|
if (is_meta(assigner))
|
|
continue;
|
|
|
|
unsigned new_delay = 0;
|
|
|
|
foreach_dst_n (dst, dst_n, assigner) {
|
|
if (dst->wrmask == 0)
|
|
continue;
|
|
if (!regmask_get(&mask, dst))
|
|
continue;
|
|
foreach_src_n (src, src_n, consumer) {
|
|
if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
|
|
continue;
|
|
|
|
unsigned src_delay = delay_calc_srcn(
|
|
assigner, consumer, dst_n, src_n, mergedregs);
|
|
new_delay = MAX2(new_delay, src_delay);
|
|
}
|
|
regmask_clear(&mask, dst);
|
|
}
|
|
|
|
new_delay = new_delay > distance ? new_delay - distance : 0;
|
|
delay = MAX2(delay, new_delay);
|
|
|
|
if (count_instruction(assigner))
|
|
distance += 1 + assigner->repeat;
|
|
}
|
|
|
|
/* Note: this allows recursion into "block" if it has already been
|
|
* visited, but *not* recursion into its predecessors. We may have to
|
|
* visit the original block twice, for the loop case where we have to
|
|
* consider definititons in an earlier iterations of the same loop:
|
|
*
|
|
* while (...) {
|
|
* mov.u32u32 ..., r0.x
|
|
* ...
|
|
* mov.u32u32 r0.x, ...
|
|
* }
|
|
*
|
|
* However any other recursion would be unnecessary.
|
|
*/
|
|
|
|
if (block->data != block) {
|
|
block->data = block;
|
|
|
|
for (unsigned i = 0; i < block->predecessors_count; i++) {
|
|
struct ir3_block *pred = block->predecessors[i];
|
|
unsigned pred_delay = delay_calc(pred, NULL, consumer, distance,
|
|
&mask, mergedregs);
|
|
delay = MAX2(delay, pred_delay);
|
|
}
|
|
|
|
block->data = NULL;
|
|
}
|
|
|
|
return delay;
|
|
}
|
|
|
|
/**
|
|
* Calculate delay for nop insertion. This must exactly match hardware
|
|
* requirements, including recursing into predecessor blocks.
|
|
*/
|
|
unsigned
|
|
ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
|
|
bool mergedregs)
|
|
{
|
|
regmask_t mask;
|
|
regmask_init(&mask, mergedregs);
|
|
foreach_src (src, instr) {
|
|
if (!(src->flags & (IR3_REG_IMMED | IR3_REG_CONST)))
|
|
regmask_set(&mask, src);
|
|
}
|
|
|
|
return delay_calc(block, NULL, instr, 0, &mask, mergedregs);
|
|
}
|