539 lines
18 KiB
C++
539 lines
18 KiB
C++
/*
|
|
* Copyright © 2019 Valve Corporation
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice (including the next
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
* Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
* IN THE SOFTWARE.
|
|
*
|
|
*/
|
|
|
|
#include "aco_builder.h"
|
|
#include "aco_ir.h"
|
|
|
|
#include <algorithm>
|
|
#include <map>
|
|
#include <unordered_map>
|
|
#include <vector>
|
|
|
|
/*
|
|
* Implements an algorithm to lower to Concentional SSA Form (CSSA).
|
|
* After "Revisiting Out-of-SSA Translation for Correctness, CodeQuality, and Efficiency"
|
|
* by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon,
|
|
*
|
|
* By lowering the IR to CSSA, the insertion of parallelcopies is separated from
|
|
* the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling.
|
|
* The algorithm coalesces non-interfering phi-resources while taking value-equality
|
|
* into account. Re-indexes the SSA-defs.
|
|
*/
|
|
|
|
namespace aco {
|
|
namespace {
|
|
|
|
typedef std::vector<Temp> merge_set;
|
|
|
|
struct copy {
|
|
Definition def;
|
|
Operand op;
|
|
};
|
|
|
|
struct merge_node {
|
|
Operand value = Operand(); /* original value: can be an SSA-def or constant value */
|
|
uint32_t index = -1u; /* index into the vector of merge sets */
|
|
uint32_t defined_at = -1u; /* defining block */
|
|
|
|
/* we also remember two dominating defs with the same value: */
|
|
Temp equal_anc_in = Temp(); /* within the same merge set */
|
|
Temp equal_anc_out = Temp(); /* from a different set */
|
|
};
|
|
|
|
struct cssa_ctx {
|
|
Program* program;
|
|
std::vector<IDSet>& live_out; /* live-out sets per block */
|
|
std::vector<std::vector<copy>> parallelcopies; /* copies per block */
|
|
std::vector<merge_set> merge_sets; /* each vector is one (ordered) merge set */
|
|
std::unordered_map<uint32_t, merge_node> merge_node_table; /* tempid -> merge node */
|
|
};
|
|
|
|
/* create (virtual) parallelcopies for each phi instruction and
|
|
* already merge copy-definitions with phi-defs into merge sets */
|
|
void
|
|
collect_parallelcopies(cssa_ctx& ctx)
|
|
{
|
|
ctx.parallelcopies.resize(ctx.program->blocks.size());
|
|
Builder bld(ctx.program);
|
|
for (Block& block : ctx.program->blocks) {
|
|
for (aco_ptr<Instruction>& phi : block.instructions) {
|
|
if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
|
|
break;
|
|
|
|
const Definition& def = phi->definitions[0];
|
|
|
|
/* if the definition is not temp, it is the exec mask.
|
|
* We can reload the exec mask directly from the spill slot.
|
|
*/
|
|
if (!def.isTemp())
|
|
continue;
|
|
|
|
std::vector<unsigned>& preds =
|
|
phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
|
|
uint32_t index = ctx.merge_sets.size();
|
|
merge_set set;
|
|
|
|
bool has_preheader_copy = false;
|
|
for (unsigned i = 0; i < phi->operands.size(); i++) {
|
|
Operand op = phi->operands[i];
|
|
if (op.isUndefined())
|
|
continue;
|
|
|
|
if (def.regClass().type() == RegType::sgpr && !op.isTemp()) {
|
|
/* SGPR inline constants and literals on GFX10+ can be spilled
|
|
* and reloaded directly (without intermediate register) */
|
|
if (op.isConstant()) {
|
|
if (ctx.program->gfx_level >= GFX10)
|
|
continue;
|
|
if (op.size() == 1 && !op.isLiteral())
|
|
continue;
|
|
} else {
|
|
assert(op.isFixed() && op.physReg() == exec);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
/* create new temporary and rename operands */
|
|
Temp tmp = bld.tmp(def.regClass());
|
|
ctx.parallelcopies[preds[i]].emplace_back(copy{Definition(tmp), op});
|
|
phi->operands[i] = Operand(tmp);
|
|
|
|
/* place the new operands in the same merge set */
|
|
set.emplace_back(tmp);
|
|
ctx.merge_node_table[tmp.id()] = {op, index, preds[i]};
|
|
|
|
/* update the liveness information */
|
|
if (op.isKill())
|
|
ctx.live_out[preds[i]].erase(op.tempId());
|
|
ctx.live_out[preds[i]].insert(tmp.id());
|
|
|
|
has_preheader_copy |= i == 0 && block.kind & block_kind_loop_header;
|
|
}
|
|
|
|
if (set.empty())
|
|
continue;
|
|
|
|
/* place the definition in dominance-order */
|
|
if (def.isTemp()) {
|
|
if (has_preheader_copy)
|
|
set.emplace(std::next(set.begin()), def.getTemp());
|
|
else if (block.kind & block_kind_loop_header)
|
|
set.emplace(set.begin(), def.getTemp());
|
|
else
|
|
set.emplace_back(def.getTemp());
|
|
ctx.merge_node_table[def.tempId()] = {Operand(def.getTemp()), index, block.index};
|
|
}
|
|
ctx.merge_sets.emplace_back(set);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* check whether the definition of a comes after b. */
|
|
inline bool
|
|
defined_after(cssa_ctx& ctx, Temp a, Temp b)
|
|
{
|
|
merge_node& node_a = ctx.merge_node_table[a.id()];
|
|
merge_node& node_b = ctx.merge_node_table[b.id()];
|
|
if (node_a.defined_at == node_b.defined_at)
|
|
return a.id() > b.id();
|
|
|
|
return node_a.defined_at > node_b.defined_at;
|
|
}
|
|
|
|
/* check whether a dominates b where b is defined after a */
|
|
inline bool
|
|
dominates(cssa_ctx& ctx, Temp a, Temp b)
|
|
{
|
|
assert(defined_after(ctx, b, a));
|
|
merge_node& node_a = ctx.merge_node_table[a.id()];
|
|
merge_node& node_b = ctx.merge_node_table[b.id()];
|
|
unsigned idom = node_b.defined_at;
|
|
while (idom > node_a.defined_at)
|
|
idom = b.regClass().type() == RegType::vgpr ? ctx.program->blocks[idom].logical_idom
|
|
: ctx.program->blocks[idom].linear_idom;
|
|
|
|
return idom == node_a.defined_at;
|
|
}
|
|
|
|
/* check intersection between var and parent:
|
|
* We already know that parent dominates var. */
|
|
inline bool
|
|
intersects(cssa_ctx& ctx, Temp var, Temp parent)
|
|
{
|
|
merge_node& node_var = ctx.merge_node_table[var.id()];
|
|
merge_node& node_parent = ctx.merge_node_table[parent.id()];
|
|
assert(node_var.index != node_parent.index);
|
|
uint32_t block_idx = node_var.defined_at;
|
|
|
|
/* if the parent is live-out at the definition block of var, they intersect */
|
|
bool parent_live = ctx.live_out[block_idx].count(parent.id());
|
|
if (parent_live)
|
|
return true;
|
|
|
|
/* parent is defined in a different block than var */
|
|
if (node_parent.defined_at < node_var.defined_at) {
|
|
/* if the parent is not live-in, they don't interfere */
|
|
std::vector<uint32_t>& preds = var.type() == RegType::vgpr
|
|
? ctx.program->blocks[block_idx].logical_preds
|
|
: ctx.program->blocks[block_idx].linear_preds;
|
|
for (uint32_t pred : preds) {
|
|
if (!ctx.live_out[pred].count(parent.id()))
|
|
return false;
|
|
}
|
|
}
|
|
|
|
for (const copy& cp : ctx.parallelcopies[block_idx]) {
|
|
/* if var is defined at the edge, they don't intersect */
|
|
if (cp.def.getTemp() == var)
|
|
return false;
|
|
if (cp.op.isTemp() && cp.op.getTemp() == parent)
|
|
parent_live = true;
|
|
}
|
|
/* if the parent is live at the edge, they intersect */
|
|
if (parent_live)
|
|
return true;
|
|
|
|
/* both, parent and var, are present in the same block */
|
|
const Block& block = ctx.program->blocks[block_idx];
|
|
for (auto it = block.instructions.crbegin(); it != block.instructions.crend(); ++it) {
|
|
/* if the parent was not encountered yet, it can only be used by a phi */
|
|
if (is_phi(it->get()))
|
|
break;
|
|
|
|
for (const Definition& def : (*it)->definitions) {
|
|
if (!def.isTemp())
|
|
continue;
|
|
/* if parent was not found yet, they don't intersect */
|
|
if (def.getTemp() == var)
|
|
return false;
|
|
}
|
|
|
|
for (const Operand& op : (*it)->operands) {
|
|
if (!op.isTemp())
|
|
continue;
|
|
/* if the var was defined before this point, they intersect */
|
|
if (op.getTemp() == parent)
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/* check interference between var and parent:
|
|
* i.e. they have different values and intersect.
|
|
* If parent and var share the same value, also updates the equal ancestor. */
|
|
inline bool
|
|
interference(cssa_ctx& ctx, Temp var, Temp parent)
|
|
{
|
|
assert(var != parent);
|
|
merge_node& node_var = ctx.merge_node_table[var.id()];
|
|
node_var.equal_anc_out = Temp();
|
|
|
|
if (node_var.index == ctx.merge_node_table[parent.id()].index) {
|
|
/* check/update in other set */
|
|
parent = ctx.merge_node_table[parent.id()].equal_anc_out;
|
|
}
|
|
|
|
Temp tmp = parent;
|
|
/* check if var intersects with parent or any equal-valued ancestor */
|
|
while (tmp != Temp() && !intersects(ctx, var, tmp)) {
|
|
merge_node& node_tmp = ctx.merge_node_table[tmp.id()];
|
|
tmp = node_tmp.equal_anc_in;
|
|
}
|
|
|
|
/* no intersection found */
|
|
if (tmp == Temp())
|
|
return false;
|
|
|
|
/* var and parent, same value, but in different sets */
|
|
if (node_var.value == ctx.merge_node_table[parent.id()].value) {
|
|
node_var.equal_anc_out = tmp;
|
|
return false;
|
|
}
|
|
|
|
/* var and parent, different values and intersect */
|
|
return true;
|
|
}
|
|
|
|
/* tries to merge set_b into set_a of given temporary and
|
|
* drops that temporary as it is being coalesced */
|
|
bool
|
|
try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)
|
|
{
|
|
auto def_node_it = ctx.merge_node_table.find(dst.id());
|
|
uint32_t index = def_node_it->second.index;
|
|
merge_set& set_a = ctx.merge_sets[index];
|
|
std::vector<Temp> dom; /* stack of the traversal */
|
|
merge_set union_set; /* the new merged merge-set */
|
|
uint32_t i_a = 0;
|
|
uint32_t i_b = 0;
|
|
|
|
while (i_a < set_a.size() || i_b < set_b.size()) {
|
|
Temp current;
|
|
if (i_a == set_a.size())
|
|
current = set_b[i_b++];
|
|
else if (i_b == set_b.size())
|
|
current = set_a[i_a++];
|
|
/* else pick the one defined first */
|
|
else if (defined_after(ctx, set_a[i_a], set_b[i_b]))
|
|
current = set_b[i_b++];
|
|
else
|
|
current = set_a[i_a++];
|
|
|
|
while (!dom.empty() && !dominates(ctx, dom.back(), current))
|
|
dom.pop_back(); /* not the desired parent, remove */
|
|
|
|
if (!dom.empty() && interference(ctx, current, dom.back()))
|
|
return false; /* intersection detected */
|
|
|
|
dom.emplace_back(current); /* otherwise, keep checking */
|
|
if (current != dst)
|
|
union_set.emplace_back(current); /* maintain the new merge-set sorted */
|
|
}
|
|
|
|
/* update hashmap */
|
|
for (Temp t : union_set) {
|
|
merge_node& node = ctx.merge_node_table[t.id()];
|
|
/* update the equal ancestors:
|
|
* i.e. the 'closest' dominating def with the same value */
|
|
Temp in = node.equal_anc_in;
|
|
Temp out = node.equal_anc_out;
|
|
if (in == Temp() || (out != Temp() && defined_after(ctx, out, in)))
|
|
node.equal_anc_in = out;
|
|
node.equal_anc_out = Temp();
|
|
/* update merge-set index */
|
|
node.index = index;
|
|
}
|
|
set_b = merge_set(); /* free the old set_b */
|
|
ctx.merge_sets[index] = union_set;
|
|
ctx.merge_node_table.erase(dst.id()); /* remove the temporary */
|
|
|
|
return true;
|
|
}
|
|
|
|
/* returns true if the copy can safely be omitted */
|
|
bool
|
|
try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)
|
|
{
|
|
/* we can only coalesce temporaries */
|
|
if (!copy.op.isTemp())
|
|
return false;
|
|
|
|
/* try emplace a merge_node for the copy operand */
|
|
merge_node& op_node = ctx.merge_node_table[copy.op.tempId()];
|
|
if (op_node.defined_at == -1u) {
|
|
/* find defining block of operand */
|
|
uint32_t pred = block_idx;
|
|
do {
|
|
block_idx = pred;
|
|
pred = copy.op.regClass().type() == RegType::vgpr ? ctx.program->blocks[pred].logical_idom
|
|
: ctx.program->blocks[pred].linear_idom;
|
|
} while (block_idx != pred && ctx.live_out[pred].count(copy.op.tempId()));
|
|
op_node.defined_at = block_idx;
|
|
op_node.value = copy.op;
|
|
}
|
|
|
|
/* we can only coalesce copies of the same register class */
|
|
if (copy.op.regClass() != copy.def.regClass())
|
|
return false;
|
|
|
|
/* check if this operand has not yet been coalesced */
|
|
if (op_node.index == -1u) {
|
|
merge_set op_set = merge_set{copy.op.getTemp()};
|
|
return try_merge_merge_set(ctx, copy.def.getTemp(), op_set);
|
|
}
|
|
|
|
/* check if this operand has been coalesced into the same set */
|
|
assert(ctx.merge_node_table.count(copy.def.tempId()));
|
|
if (op_node.index == ctx.merge_node_table[copy.def.tempId()].index)
|
|
return true;
|
|
|
|
/* otherwise, try to coalesce both merge sets */
|
|
return try_merge_merge_set(ctx, copy.def.getTemp(), ctx.merge_sets[op_node.index]);
|
|
}
|
|
|
|
/* node in the location-transfer-graph */
|
|
struct ltg_node {
|
|
copy cp;
|
|
uint32_t read_idx;
|
|
uint32_t num_uses = 0;
|
|
};
|
|
|
|
/* emit the copies in an order that does not
|
|
* create interferences within a merge-set */
|
|
void
|
|
emit_copies_block(Builder& bld, std::map<uint32_t, ltg_node>& ltg, RegType type)
|
|
{
|
|
auto&& it = ltg.begin();
|
|
while (it != ltg.end()) {
|
|
const copy& cp = it->second.cp;
|
|
/* wrong regclass or still needed as operand */
|
|
if (cp.def.regClass().type() != type || it->second.num_uses > 0) {
|
|
++it;
|
|
continue;
|
|
}
|
|
|
|
/* emit the copy */
|
|
bld.copy(cp.def, it->second.cp.op);
|
|
|
|
/* update the location transfer graph */
|
|
if (it->second.read_idx != -1u) {
|
|
auto&& other = ltg.find(it->second.read_idx);
|
|
if (other != ltg.end())
|
|
other->second.num_uses--;
|
|
}
|
|
ltg.erase(it);
|
|
it = ltg.begin();
|
|
}
|
|
|
|
/* count the number of remaining circular dependencies */
|
|
unsigned num = std::count_if(ltg.begin(), ltg.end(),
|
|
[&](auto& n) { return n.second.cp.def.regClass().type() == type; });
|
|
|
|
/* if there are circular dependencies, we just emit them as single parallelcopy */
|
|
if (num) {
|
|
// TODO: this should be restricted to a feasible number of registers
|
|
// and otherwise use a temporary to avoid having to reload more (spilled)
|
|
// variables than we have registers.
|
|
aco_ptr<Pseudo_instruction> copy{create_instruction<Pseudo_instruction>(
|
|
aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)};
|
|
it = ltg.begin();
|
|
for (unsigned i = 0; i < num; i++) {
|
|
while (it->second.cp.def.regClass().type() != type)
|
|
++it;
|
|
|
|
copy->definitions[i] = it->second.cp.def;
|
|
copy->operands[i] = it->second.cp.op;
|
|
it = ltg.erase(it);
|
|
}
|
|
bld.insert(std::move(copy));
|
|
}
|
|
}
|
|
|
|
/* either emits or coalesces all parallelcopies and
|
|
* renames the phi-operands accordingly. */
|
|
void
|
|
emit_parallelcopies(cssa_ctx& ctx)
|
|
{
|
|
std::unordered_map<uint32_t, Operand> renames;
|
|
|
|
/* we iterate backwards to prioritize coalescing in else-blocks */
|
|
for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {
|
|
if (ctx.parallelcopies[i].empty())
|
|
continue;
|
|
|
|
std::map<uint32_t, ltg_node> ltg;
|
|
bool has_vgpr_copy = false;
|
|
bool has_sgpr_copy = false;
|
|
|
|
/* first, try to coalesce all parallelcopies */
|
|
for (const copy& cp : ctx.parallelcopies[i]) {
|
|
if (try_coalesce_copy(ctx, cp, i)) {
|
|
renames.emplace(cp.def.tempId(), cp.op);
|
|
/* update liveness info */
|
|
ctx.live_out[i].erase(cp.def.tempId());
|
|
ctx.live_out[i].insert(cp.op.tempId());
|
|
} else {
|
|
uint32_t read_idx = -1u;
|
|
if (cp.op.isTemp())
|
|
read_idx = ctx.merge_node_table[cp.op.tempId()].index;
|
|
uint32_t write_idx = ctx.merge_node_table[cp.def.tempId()].index;
|
|
assert(write_idx != -1u);
|
|
ltg[write_idx] = {cp, read_idx};
|
|
|
|
bool is_vgpr = cp.def.regClass().type() == RegType::vgpr;
|
|
has_vgpr_copy |= is_vgpr;
|
|
has_sgpr_copy |= !is_vgpr;
|
|
}
|
|
}
|
|
|
|
/* build location-transfer-graph */
|
|
for (auto& pair : ltg) {
|
|
if (pair.second.read_idx == -1u)
|
|
continue;
|
|
auto&& it = ltg.find(pair.second.read_idx);
|
|
if (it != ltg.end())
|
|
it->second.num_uses++;
|
|
}
|
|
|
|
/* emit parallelcopies ordered */
|
|
Builder bld(ctx.program);
|
|
Block& block = ctx.program->blocks[i];
|
|
|
|
if (has_vgpr_copy) {
|
|
/* emit VGPR copies */
|
|
auto IsLogicalEnd = [](const aco_ptr<Instruction>& inst) -> bool
|
|
{ return inst->opcode == aco_opcode::p_logical_end; };
|
|
auto it =
|
|
std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd);
|
|
bld.reset(&block.instructions, std::prev(it.base()));
|
|
emit_copies_block(bld, ltg, RegType::vgpr);
|
|
}
|
|
|
|
if (has_sgpr_copy) {
|
|
/* emit SGPR copies */
|
|
aco_ptr<Instruction> branch = std::move(block.instructions.back());
|
|
block.instructions.pop_back();
|
|
bld.reset(&block.instructions);
|
|
emit_copies_block(bld, ltg, RegType::sgpr);
|
|
bld.insert(std::move(branch));
|
|
}
|
|
}
|
|
|
|
/* finally, rename coalesced phi operands */
|
|
for (Block& block : ctx.program->blocks) {
|
|
for (aco_ptr<Instruction>& phi : block.instructions) {
|
|
if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
|
|
break;
|
|
|
|
for (Operand& op : phi->operands) {
|
|
if (!op.isTemp())
|
|
continue;
|
|
auto&& it = renames.find(op.tempId());
|
|
if (it != renames.end()) {
|
|
op = it->second;
|
|
renames.erase(it);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
assert(renames.empty());
|
|
}
|
|
|
|
} /* end namespace */
|
|
|
|
void
|
|
lower_to_cssa(Program* program, live& live_vars)
|
|
{
|
|
reindex_ssa(program, live_vars.live_out);
|
|
cssa_ctx ctx = {program, live_vars.live_out};
|
|
collect_parallelcopies(ctx);
|
|
emit_parallelcopies(ctx);
|
|
|
|
/* update live variable information */
|
|
live_vars = live_var_analysis(program);
|
|
}
|
|
} // namespace aco
|