/*
 * Copyright (C) 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3_compiler.h"
|
|
#include "ir3_ra.h"
|
|
#include "ralloc.h"
|
|
|
|
/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch
 * of parallelcopy's to trivially turn the program into CSSA form. Then we
 * try to "merge" SSA def's into "merge sets" which could be allocated to a
 * single register in order to eliminate copies. First we merge phi nodes,
 * which should always succeed because of the parallelcopy's we inserted, and
 * then we try to coalesce the copies we introduced.
 *
 * The merged registers are used for three purposes:
 *
 * 1. We always use the same pvtmem slot for spilling all SSA defs in each
 * merge set. This prevents us from having to insert memory-to-memory copies
 * in the spiller and makes sure we don't insert unnecessary copies.
 * 2. When two values are live at the same time, part of the same merge
 * set, and they overlap each other in the merge set, they always occupy
 * overlapping physical registers in RA. This reduces register pressure and
 * copies in several important scenarios:
 *    - When sources of a collect are used later by something else, we don't
 *      have to introduce copies.
 *    - We can handle sequences of extracts that "explode" a vector into its
 *      components without any additional copying.
 * 3. We use the merge sets for affinities in register allocation: That is, we
 * try to allocate all the definitions in the same merge set to the
 * same/compatible registers. This helps us e.g. allocate sources of a collect
 * to contiguous registers without too much special code in RA.
 *
 * In a "normal" register allocator, or when spilling, we'd just merge
 * registers in the same merge set to the same register, but with SSA-based
 * register allocation we may have to split the live interval.
 *
 * The implementation is based on "Revisiting Out-of-SSA Translation for
 * Correctness, Code Quality, and Efficiency," and is broadly similar to the
 * implementation in nir_from_ssa, with the twist that we also try to coalesce
 * META_SPLIT and META_COLLECT. This makes this pass more complicated but
 * prevents us from needing to handle these specially in RA and the spiller,
 * which are already complicated enough. This also forces us to implement the
 * value-comparison optimization the paper describes, as without it we
 * wouldn't be able to coalesce META_SPLIT even in the simplest of cases.
 */

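/* As a concrete illustration of purpose 2 above (a hypothetical IR fragment,
 * not taken from any real shader): given
 *
 *    ssa_1 = ...
 *    ssa_2 = ...
 *    ssa_3 = collect ssa_1, ssa_2
 *    ...   = foo ssa_1
 *
 * putting ssa_1 at offset 0 and ssa_2 at the next element offset of ssa_3's
 * merge set lets RA assign ssa_1 and ssa_2 to the two components of the
 * register eventually given to ssa_3, so neither the collect nor the later
 * use of ssa_1 requires a copy.
 */
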
/* In order to dynamically reconstruct the dominance forest, we need the
 * instructions ordered by a preorder traversal of the dominance tree:
 */

static unsigned
index_instrs(struct ir3_block *block, unsigned index)
{
   foreach_instr (instr, &block->instr_list)
      instr->ip = index++;

   for (unsigned i = 0; i < block->dom_children_count; i++)
      index = index_instrs(block->dom_children[i], index);

   return index;
}

/* Definitions within a merge set are ordered by instr->ip as set above: */

static bool
def_after(struct ir3_register *a, struct ir3_register *b)
{
   return a->instr->ip > b->instr->ip;
}

static bool
def_dominates(struct ir3_register *a, struct ir3_register *b)
{
   if (def_after(a, b)) {
      return false;
   } else if (a->instr->block == b->instr->block) {
      return def_after(b, a);
   } else {
      return ir3_block_dominates(a->instr->block, b->instr->block);
   }
}

/* This represents a region inside a register. The offset is relative to the
 * start of the register, and offset + size <= size(reg).
 */
struct def_value {
   struct ir3_register *reg;
   unsigned offset, size;
};

/* Chase any copies to get the source of a region inside a register. This is
 * Value(a) in the paper.
 */
static struct def_value
chase_copies(struct def_value value)
{
   while (true) {
      struct ir3_instruction *instr = value.reg->instr;
      if (instr->opc == OPC_META_SPLIT) {
         value.offset += instr->split.off * reg_elem_size(value.reg);
         value.reg = instr->srcs[0]->def;
      } else if (instr->opc == OPC_META_COLLECT) {
         if (value.offset % reg_elem_size(value.reg) != 0 ||
             value.size > reg_elem_size(value.reg) ||
             value.offset + value.size > reg_size(value.reg))
            break;
         struct ir3_register *src =
            instr->srcs[value.offset / reg_elem_size(value.reg)];
         if (!src->def)
            break;
         value.offset = 0;
         value.reg = src->def;
      } else {
         /* TODO: parallelcopy */
         break;
      }
   }

   return value;
}

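/* For example (hypothetical IR): with ssa_3 = collect ssa_1, ssa_2 and
 * ssa_4 = split ssa_3, off=1, chasing the single-element region at offset 0
 * of ssa_4 first adds the split offset (yielding element 1 of ssa_3) and
 * then steps through the collect to that element's source, so
 * Value(ssa_4) = Value(ssa_2). This is what lets can_skip_interference()
 * below prove that ssa_4 and ssa_2 hold the same value.
 */
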
/* This represents an entry in the merge set, and consists of a register +
 * offset from the merge set base.
 */
struct merge_def {
   struct ir3_register *reg;
   unsigned offset;
};

static bool
can_skip_interference(const struct merge_def *a, const struct merge_def *b)
{
   unsigned a_start = a->offset;
   unsigned b_start = b->offset;
   unsigned a_end = a_start + reg_size(a->reg);
   unsigned b_end = b_start + reg_size(b->reg);

   /* Registers that don't overlap never interfere */
   if (a_end <= b_start || b_end <= a_start)
      return true;

   /* Disallow skipping interference unless one definition contains the
    * other. This restriction is important for register allocation, because
    * it means that at any given point in the program, the live values in a
    * given merge set will form a tree. If they didn't, then one live value
    * would partially overlap another, and they would have overlapping live
    * ranges because they're live at the same point. This simplifies register
    * allocation and spilling.
    */
   if (!((a_start <= b_start && a_end >= b_end) ||
         (b_start <= a_start && b_end >= a_end)))
      return false;

   /* For each register, chase the intersection of a and b to find the
    * ultimate source.
    */
   unsigned start = MAX2(a_start, b_start);
   unsigned end = MIN2(a_end, b_end);
   struct def_value a_value = chase_copies((struct def_value){
      .reg = a->reg,
      .offset = start - a_start,
      .size = end - start,
   });
   struct def_value b_value = chase_copies((struct def_value){
      .reg = b->reg,
      .offset = start - b_start,
      .size = end - start,
   });
   return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
}

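/* To see why the containment rule above matters, consider (hypothetically)
 * two vec2 defs placed at offsets 0 and 1 of the same merge set: they would
 * partially overlap, the live values would no longer nest as a tree, and a
 * single interval per def couldn't describe both allocations. Full
 * containment (e.g. a scalar component inside the vec2 it was split from)
 * is fine, which is exactly the split/collect case we care about.
 */
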
static struct ir3_merge_set *
get_merge_set(struct ir3_register *def)
{
   if (def->merge_set)
      return def->merge_set;

   struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
   set->preferred_reg = ~0;
   set->interval_start = ~0;
   set->spill_slot = ~0;
   set->size = reg_size(def);
   set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
   set->regs_count = 1;
   set->regs = ralloc(set, struct ir3_register *);
   set->regs[0] = def;

   return set;
}

/* Merges b into a */
static struct ir3_merge_set *
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_merge_sets(b, a, -b_offset);

   struct ir3_register **new_regs =
      rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);

   unsigned a_index = 0, b_index = 0, new_index = 0;
   for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
      if (b_index < b->regs_count &&
          (a_index == a->regs_count ||
           def_after(a->regs[a_index], b->regs[b_index]))) {
         new_regs[new_index] = b->regs[b_index++];
         new_regs[new_index]->merge_set_offset += b_offset;
      } else {
         new_regs[new_index] = a->regs[a_index++];
      }
      new_regs[new_index]->merge_set = a;
   }

   assert(new_index == a->regs_count + b->regs_count);

   /* Technically this should be the lcm, but since the alignment is only
    * ever 1 or 2 so far, the max is equivalent.
    */
   a->alignment = MAX2(a->alignment, b->alignment);
   a->regs_count += b->regs_count;
   ralloc_free(a->regs);
   a->regs = new_regs;
   a->size = MAX2(a->size, b->size + b_offset);

   return a;
}

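/* Note that the loop above is just a merge of two arrays already sorted by
 * instr->ip, so the combined regs array stays in dominance-tree preorder,
 * which merge_sets_interfere() below relies on. For example
 * (hypothetically), merging a set with defs at ip 3 and ip 7 into a set
 * with a def at ip 5 yields the order {3, 5, 7}.
 */
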
static bool
merge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a,
                     struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_sets_interfere(live, b, a, -b_offset);

   struct merge_def dom[a->regs_count + b->regs_count];
   unsigned a_index = 0, b_index = 0;
   int dom_index = -1;

   /* Reject trying to merge the sets if the alignment doesn't work out */
   if (b_offset % a->alignment != 0)
      return true;

   while (a_index < a->regs_count || b_index < b->regs_count) {
      struct merge_def current;
      if (a_index == a->regs_count) {
         current.reg = b->regs[b_index];
         current.offset = current.reg->merge_set_offset + b_offset;
         b_index++;
      } else if (b_index == b->regs_count) {
         current.reg = a->regs[a_index];
         current.offset = current.reg->merge_set_offset;
         a_index++;
      } else {
         if (def_after(b->regs[b_index], a->regs[a_index])) {
            current.reg = a->regs[a_index];
            current.offset = current.reg->merge_set_offset;
            a_index++;
         } else {
            current.reg = b->regs[b_index];
            current.offset = current.reg->merge_set_offset + b_offset;
            b_index++;
         }
      }

      while (dom_index >= 0 &&
             !def_dominates(dom[dom_index].reg, current.reg)) {
         dom_index--;
      }

      /* TODO: in the original paper, just dom[dom_index] needs to be
       * checked for interference. We implement the value-chasing extension
       * as well as support for sub-registers, which complicates this
       * significantly because it's no longer the case that if a dominates b
       * dominates c and a and b don't interfere then we only need to check
       * interference between b and c to be sure a and c don't interfere --
       * this means we may have to check for interference against values
       * higher in the stack than dom[dom_index]. In the paper there's a
       * description of a way to do fewer interference tests with the
       * value-chasing extension, but we'd have to come up with something
       * ourselves for handling the similar problems that come up with
       * allowing values to contain subregisters. For now we just test
       * everything in the stack.
       */
      for (int i = 0; i <= dom_index; i++) {
         if (can_skip_interference(&current, &dom[i]))
            continue;

         /* Ok, now we actually have to check interference. Since we know
          * that dom[i] dominates current, this boils down to checking
          * whether dom[i] is live after current.
          */
         if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
            return true;
      }

      dom[++dom_index] = current;
   }

   return false;
}

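/* The loop above implements the dominance-forest scan from the paper: since
 * both regs arrays are in dominance-tree preorder, popping stack entries
 * that don't dominate `current` leaves dom[0..dom_index] holding a chain of
 * defs that all dominate `current`, so interference tests are confined to
 * (current, dominator) pairs rather than all pairs of defs.
 */
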
static void
try_merge_defs(struct ir3_liveness *live, struct ir3_register *a,
               struct ir3_register *b, unsigned b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set) {
      /* Note: even in this case we may not be able to coalesce the copy
       * if the offsets don't line up, but either way there is nothing
       * more we can do here.
       */
      return;
   }

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;

   if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
      merge_merge_sets(a_set, b_set, b_set_offset);
}

void
ir3_force_merge(struct ir3_register *a, struct ir3_register *b, int b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set)
      return;

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
   merge_merge_sets(a_set, b_set, b_set_offset);
}

static void
coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi)
{
   for (unsigned i = 0; i < phi->srcs_count; i++) {
      if (phi->srcs[i]->def)
         try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_parallel_copy(struct ir3_liveness *live,
                                  struct ir3_instruction *pcopy)
{
   for (unsigned i = 0; i < pcopy->dsts_count; i++) {
      if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_split(struct ir3_liveness *live,
                          struct ir3_instruction *split)
{
   try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
                  split->split.off * reg_elem_size(split->dsts[0]));
}

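/* E.g. (hypothetical IR) for ssa_4 = split ssa_3, off=1, this asks to place
 * ssa_4 at offset split.off * reg_elem_size() within ssa_3's merge set,
 * mirroring how chase_copies() computes Value() for splits above, so the
 * split can become a no-op if the sets don't interfere.
 */
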
static void
aggressive_coalesce_collect(struct ir3_liveness *live,
                            struct ir3_instruction *collect)
{
   for (unsigned i = 0, offset = 0; i < collect->srcs_count;
        offset += reg_elem_size(collect->srcs[i]), i++) {
      if (!(collect->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
   }
}

static void
create_parallel_copy(struct ir3_block *block)
{
   for (unsigned i = 0; i < 2; i++) {
      if (!block->successors[i])
         continue;

      struct ir3_block *succ = block->successors[i];

      unsigned pred_idx = ir3_block_get_pred_index(succ, block);

      unsigned phi_count = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;

         /* Avoid undef */
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;

         /* We don't support critical edges. If we were to support them,
          * we'd need to insert parallel copies after the phi node to solve
          * the lost-copy problem.
          */
         assert(i == 0 && !block->successors[1]);
         phi_count++;
      }

      if (phi_count == 0)
         continue;

      struct ir3_register *src[phi_count];
      unsigned j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         src[j++] = phi->srcs[pred_idx];
      }
      assert(j == phi_count);

      struct ir3_instruction *pcopy =
         ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count);

      for (j = 0; j < phi_count; j++) {
         struct ir3_register *reg = __ssa_dst(pcopy);
         reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
         reg->size = src[j]->size;
         reg->wrmask = src[j]->wrmask;
      }

      for (j = 0; j < phi_count; j++) {
         pcopy->srcs[pcopy->srcs_count++] =
            ir3_reg_clone(block->shader, src[j]);
      }

      j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         phi->srcs[pred_idx]->def = pcopy->dsts[j];
         phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
         j++;
      }
      assert(j == phi_count);
   }
}

void
ir3_create_parallel_copies(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      create_parallel_copy(block);
   }
}

static void
index_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
{
   unsigned offset = 0;
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            unsigned dst_offset;
            struct ir3_merge_set *merge_set = dst->merge_set;
            unsigned size = reg_size(dst);
            if (merge_set) {
               if (merge_set->interval_start == ~0) {
                  merge_set->interval_start = offset;
                  offset += merge_set->size;
               }
               dst_offset = merge_set->interval_start + dst->merge_set_offset;
            } else {
               dst_offset = offset;
               offset += size;
            }

            dst->interval_start = dst_offset;
            dst->interval_end = dst_offset + size;
         }
      }
   }

   live->interval_offset = offset;
}

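/* For instance (hypothetical numbers): if the first def encountered belongs
 * to a merge set of size 4, the set claims interval [0, 4) and each member
 * gets interval_start = 0 plus its merge_set_offset; a following unmerged
 * scalar def would then get [4, 5). Overlapping members of one set thus get
 * overlapping intervals, matching how they may share registers in RA.
 */
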
#define RESET "\x1b[0m"
#define BLUE "\x1b[0;34m"
#define SYN_SSA(x) BLUE x RESET

static void
dump_merge_sets(struct ir3 *ir)
{
   d("merge sets:");
   struct set *merge_sets = _mesa_pointer_set_create(NULL);
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            struct ir3_merge_set *merge_set = dst->merge_set;
            if (!merge_set || _mesa_set_search(merge_sets, merge_set))
               continue;

            d("merge set, size %u, align %u:", merge_set->size,
              merge_set->alignment);
            for (unsigned j = 0; j < merge_set->regs_count; j++) {
               struct ir3_register *reg = merge_set->regs[j];
               d("\t" SYN_SSA("ssa_%u") ":%u, offset %u",
                 reg->instr->serialno, reg->name, reg->merge_set_offset);
            }

            _mesa_set_add(merge_sets, merge_set);
         }
      }
   }

   ralloc_free(merge_sets);
}

void
ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir)
{
   index_instrs(ir3_start_block(ir), 0);

   /* First pass: coalesce phis, which must end up in the same merge set. */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (instr->opc != OPC_META_PHI)
            break;

         coalesce_phi(live, instr);
      }
   }

   /* Second pass: aggressively coalesce parallelcopy, split, collect */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         switch (instr->opc) {
         case OPC_META_SPLIT:
            aggressive_coalesce_split(live, instr);
            break;
         case OPC_META_COLLECT:
            aggressive_coalesce_collect(live, instr);
            break;
         case OPC_META_PARALLEL_COPY:
            aggressive_coalesce_parallel_copy(live, instr);
            break;
         default:
            break;
         }
      }
   }

   index_merge_sets(live, ir);

   if (ir3_shader_debug & IR3_DBG_RAMSGS)
      dump_merge_sets(ir);
}