aco: Format.

Manually adjusted some comments for more intuitive line breaks.

Reviewed-by: Tony Wasserka <tony.wasserka@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11258>
Daniel Schürmann 2021-06-09 10:14:54 +02:00 committed by Marge Bot
parent 97ec360dc4
commit 1e2639026f
32 changed files with 7231 additions and 6574 deletions
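For context: this commit applies the new ACO clang-format style across the compiler. The configuration file itself is not part of this excerpt; below is a minimal sketch, inferred from the changes visible in the diff, of clang-format options that would produce this style. The concrete values and the invocation at the end are assumptions for illustration, not the MR's actual .clang-format.

    # Sketch only -- inferred from the diff, not the actual
    # config added by this MR.
    BasedOnStyle: LLVM
    ColumnLimit: 100                   # long conditions wrap near 100 columns
    IndentWidth: 3                     # ACO's 3-space indentation
    PointerAlignment: Left             # "Program* program", not "Program *program"
    AlwaysBreakAfterReturnType: TopLevelDefinitions  # return type on its own line
    AllowShortCaseLabelsOnASingleLine: true          # "case ...: inv = ...; break;"
    BreakBeforeTernaryOperators: true  # "? x" / ": y" continuation lines
    SpaceAfterCStyleCast: false        # "(uint16_t)sopp.imm"
    BreakBeforeBraces: Custom
    BraceWrapping:
      AfterFunction: true              # "{" of a function body on a new line

With such a config in the tree, the bulk of this diff corresponds to something like "clang-format -i src/amd/compiler/aco_*.cpp" (hypothetical invocation), with comment line breaks then adjusted by hand, as the commit message notes.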

src/amd/compiler/aco_assembler.cpp

@ -41,14 +41,15 @@ struct constaddr_info {
};
struct asm_context {
Program *program;
Program* program;
enum chip_class chip_class;
std::vector<std::pair<int, SOPP_instruction*>> branches;
std::map<unsigned, constaddr_info> constaddrs;
const int16_t* opcode;
// TODO: keep track of branch instructions referring blocks
// and, when emitting the block, correct the offset in instr
asm_context(Program* program_) : program(program_), chip_class(program->chip_class) {
asm_context(Program* program_) : program(program_), chip_class(program->chip_class)
{
if (chip_class <= GFX7)
opcode = &instr_info.opcode_gfx7[0];
else if (chip_class <= GFX9)
@ -60,7 +61,8 @@ struct asm_context {
int subvector_begin_pos = -1;
};
static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg)
static uint32_t
get_sdwa_sel(unsigned sel, PhysReg reg)
{
if (sel & sdwa_isra) {
unsigned size = sdwa_rasize & sel;
@ -72,7 +74,9 @@ static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg)
return sel & sdwa_asuint;
}
unsigned get_mimg_nsa_dwords(const Instruction *instr) {
unsigned
get_mimg_nsa_dwords(const Instruction* instr)
{
unsigned addr_dwords = instr->operands.size() - 3;
for (unsigned i = 1; i < addr_dwords; i++) {
if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4))
@ -81,7 +85,8 @@ unsigned get_mimg_nsa_dwords(const Instruction *instr) {
return 0;
}
void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)
void
emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)
{
/* lower remaining pseudo-instructions */
if (instr->opcode == aco_opcode::p_constaddr_getpc) {
@ -99,11 +104,11 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
uint32_t opcode = ctx.opcode[(int)instr->opcode];
if (opcode == (uint32_t)-1) {
char *outmem;
char* outmem;
size_t outsize;
struct u_memstream mem;
u_memstream_open(&mem, &outmem, &outsize);
FILE *const memf = u_memstream_get(&mem);
FILE* const memf = u_memstream_get(&mem);
fprintf(memf, "Unsupported opcode: ");
aco_print_instr(instr, memf);
@ -144,11 +149,11 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
uint32_t encoding = (0b1011 << 28);
encoding |= opcode << 23;
encoding |=
!instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) ?
instr->definitions[0].physReg() << 16 :
!instr->operands.empty() && instr->operands[0].physReg() <= 127 ?
instr->operands[0].physReg() << 16 : 0;
encoding |= !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc)
? instr->definitions[0].physReg() << 16
: !instr->operands.empty() && instr->operands[0].physReg() <= 127
? instr->operands[0].physReg() << 16
: 0;
encoding |= sopk.imm;
out.push_back(encoding);
break;
@ -177,7 +182,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
SOPP_instruction& sopp = instr->sopp();
uint32_t encoding = (0b101111111 << 23);
encoding |= opcode << 16;
encoding |= (uint16_t) sopp.imm;
encoding |= (uint16_t)sopp.imm;
if (sopp.block != -1) {
sopp.pass_flags = 0;
ctx.branches.emplace_back(out.size(), &sopp);
@ -208,7 +213,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
}
out.push_back(encoding);
/* SMRD instructions can take a literal on GFX7 */
if (instr->operands.size() >= 2 && instr->operands[1].isConstant() && instr->operands[1].constantValue() >= 1024)
if (instr->operands.size() >= 2 && instr->operands[1].isConstant() &&
instr->operands[1].constantValue() >= 1024)
out.push_back(instr->operands[1].constantValue() >> 2);
return;
}
@ -235,7 +241,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
}
if (is_load || instr->operands.size() >= 3) { /* SDATA */
encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg()) << 6;
encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg())
<< 6;
}
if (instr->operands.size() >= 1) { /* SBASE */
encoding |= instr->operands[0].physReg() >> 1;
@ -246,14 +253,16 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
int32_t offset = 0;
uint32_t soffset = ctx.chip_class >= GFX10
? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */
: 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on GFX8 and below) */
? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */
: 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on
GFX8 and below) */
if (instr->operands.size() >= 2) {
const Operand &op_off1 = instr->operands[1];
const Operand& op_off1 = instr->operands[1];
if (ctx.chip_class <= GFX9) {
offset = op_off1.isConstant() ? op_off1.constantValue() : op_off1.physReg();
} else {
/* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an SGPR */
/* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an
* SGPR */
if (op_off1.isConstant()) {
offset = op_off1.constantValue();
} else {
@ -263,8 +272,9 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
}
if (soe) {
const Operand &op_off2 = instr->operands.back();
assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant and an SGPR at the same time */
const Operand& op_off2 = instr->operands.back();
assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant
and an SGPR at the same time */
assert(!op_off2.isConstant());
soffset = op_off2.physReg();
}
@ -368,9 +378,13 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
encoding = 0;
unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0;
encoding |= (0xFF & reg) << 24;
reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0) ? instr->operands[2].physReg() : 0;
reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0)
? instr->operands[2].physReg()
: 0;
encoding |= (0xFF & reg) << 16;
reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0) ? instr->operands[1].physReg() : 0;
reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0)
? instr->operands[1].physReg()
: 0;
encoding |= (0xFF & reg) << 8;
encoding |= (0xFF & instr->operands[0].physReg());
out.push_back(encoding);
@ -402,7 +416,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
encoding |= instr->operands[2].physReg() << 24;
encoding |= (mubuf.tfe ? 1 : 0) << 23;
encoding |= (instr->operands[0].physReg() >> 2) << 16;
unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg();
unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg()
: instr->definitions[0].physReg();
encoding |= (0xFF & reg) << 8;
encoding |= (0xFF & instr->operands[1].physReg());
out.push_back(encoding);
@ -435,7 +450,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
encoding |= (mtbuf.tfe ? 1 : 0) << 23;
encoding |= (mtbuf.slc ? 1 : 0) << 22;
encoding |= (instr->operands[0].physReg() >> 2) << 16;
unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg();
unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg()
: instr->definitions[0].physReg();
encoding |= (0xFF & reg) << 8;
encoding |= (0xFF & instr->operands[1].physReg());
@ -465,7 +481,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
encoding |= mimg.a16 ? 1 << 15 : 0;
encoding |= mimg.da ? 1 << 14 : 0;
} else {
encoding |= mimg.r128 ? 1 << 15 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
encoding |= mimg.r128 ? 1 << 15
: 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
encoding |= nsa_dwords << 1;
encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */
encoding |= mimg.dlc ? 1 << 7 : 0;
@ -485,7 +502,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
assert(!mimg.d16 || ctx.chip_class >= GFX9);
encoding |= mimg.d16 ? 1 << 31 : 0;
if (ctx.chip_class >= GFX10) {
encoding |= mimg.a16 ? 1 << 30 : 0; /* GFX10: A16 still exists, but is in a different place */
/* GFX10: A16 still exists, but is in a different place */
encoding |= mimg.a16 ? 1 << 30 : 0;
}
out.push_back(encoding);
@ -539,7 +557,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
assert(ctx.chip_class >= GFX10 || instr->operands[1].physReg() != 0x7F);
assert(instr->format != Format::FLAT);
encoding |= instr->operands[1].physReg() << 16;
} else if (instr->format != Format::FLAT || ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */
} else if (instr->format != Format::FLAT ||
ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */
if (ctx.chip_class <= GFX9)
encoding |= 0x7F << 16;
else
@ -611,7 +630,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
}
encoding |= vop3.opsel << 11;
for (unsigned i = 0; i < 3; i++)
encoding |= vop3.abs[i] << (8+i);
encoding |= vop3.abs[i] << (8 + i);
if (instr->definitions.size() == 2)
encoding |= instr->definitions[1].physReg() << 8;
encoding |= (0xFF & instr->definitions[0].physReg());
@ -625,7 +644,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
}
encoding |= vop3.omod << 27;
for (unsigned i = 0; i < 3; i++)
encoding |= vop3.neg[i] << (29+i);
encoding |= vop3.neg[i] << (29 + i);
out.push_back(encoding);
} else if (instr->isVOP3P()) {
@ -645,7 +664,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
encoding |= vop3.opsel_lo << 11;
encoding |= ((vop3.opsel_hi & 0x4) ? 1 : 0) << 14;
for (unsigned i = 0; i < 3; i++)
encoding |= vop3.neg_hi[i] << (8+i);
encoding |= vop3.neg_hi[i] << (8 + i);
encoding |= (0xFF & instr->definitions[0].physReg());
out.push_back(encoding);
encoding = 0;
@ -653,17 +672,17 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
encoding |= instr->operands[i].physReg() << (i * 9);
encoding |= (vop3.opsel_hi & 0x3) << 27;
for (unsigned i = 0; i < 3; i++)
encoding |= vop3.neg_lo[i] << (29+i);
encoding |= vop3.neg_lo[i] << (29 + i);
out.push_back(encoding);
} else if (instr->isDPP()){
} else if (instr->isDPP()) {
assert(ctx.chip_class >= GFX8);
DPP_instruction& dpp = instr->dpp();
/* first emit the instruction without the DPP operand */
Operand dpp_op = instr->operands[0];
instr->operands[0] = Operand(PhysReg{250}, v1);
instr->format = (Format) ((uint16_t) instr->format & ~(uint16_t)Format::DPP);
instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP);
emit_instruction(ctx, out, instr);
uint32_t encoding = (0xF & dpp.row_mask) << 28;
encoding |= (0xF & dpp.bank_mask) << 24;
@ -684,7 +703,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
/* first emit the instruction without the SDWA operand */
Operand sdwa_op = instr->operands[0];
instr->operands[0] = Operand(PhysReg{249}, v1);
instr->format = (Format) ((uint16_t) instr->format & ~(uint16_t)Format::SDWA);
instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::SDWA);
emit_instruction(ctx, out, instr);
uint32_t encoding = 0;
@ -737,7 +756,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
}
}
void emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)
void
emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)
{
for (aco_ptr<Instruction>& instr : block.instructions) {
#if 0
@ -754,15 +774,15 @@ void emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)
}
}
void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
void
fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
{
bool exported = false;
for (Block& block : program->blocks) {
if (!(block.kind & block_kind_export_end))
continue;
std::vector<aco_ptr<Instruction>>::reverse_iterator it = block.instructions.rbegin();
while ( it != block.instructions.rend())
{
while (it != block.instructions.rend()) {
if ((*it)->isEXP()) {
Export_instruction& exp = (*it)->exp();
if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG) {
@ -785,15 +805,18 @@ void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
if (!exported) {
/* Abort in order to avoid a GPU hang. */
bool is_vertex_or_ngg = (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG);
aco_err(program, "Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment");
bool is_vertex_or_ngg =
(program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG);
aco_err(program,
"Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment");
aco_print_program(program, stderr);
abort();
}
}
static void insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned insert_before,
unsigned insert_count, const uint32_t *insert_data)
static void
insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned insert_before,
unsigned insert_count, const uint32_t* insert_data)
{
out.insert(out.begin() + insert_before, insert_data, insert_data + insert_count);
@ -804,9 +827,9 @@ static void insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned i
}
/* Find first branch after the inserted code */
auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), [insert_before](const auto &branch) -> bool {
return (unsigned)branch.first >= insert_before;
});
auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(),
[insert_before](const auto& branch) -> bool
{ return (unsigned)branch.first >= insert_before; });
/* Update the locations of branches */
for (; branch_it != ctx.branches.end(); ++branch_it)
@ -822,15 +845,21 @@ static void insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned i
}
}
static void fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
static void
fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
{
/* Branches with an offset of 0x3f are buggy on GFX10, we workaround by inserting NOPs if needed. */
/* Branches with an offset of 0x3f are buggy on GFX10,
* we workaround by inserting NOPs if needed.
*/
bool gfx10_3f_bug = false;
do {
auto buggy_branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), [&ctx](const auto &branch) -> bool {
return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) == 0x3f;
});
auto buggy_branch_it = std::find_if(
ctx.branches.begin(), ctx.branches.end(),
[&ctx](const auto& branch) -> bool {
return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) ==
0x3f;
});
gfx10_3f_bug = buggy_branch_it != ctx.branches.end();
@ -842,7 +871,9 @@ static void fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
} while (gfx10_3f_bug);
}
void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards, std::vector<uint32_t>& out)
void
emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards,
std::vector<uint32_t>& out)
{
Builder bld(ctx.program);
@ -857,26 +888,13 @@ void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards,
/* for conditional branches, skip the long jump if the condition is false */
aco_opcode inv;
switch (branch->opcode) {
case aco_opcode::s_cbranch_scc0:
inv = aco_opcode::s_cbranch_scc1;
break;
case aco_opcode::s_cbranch_scc1:
inv = aco_opcode::s_cbranch_scc0;
break;
case aco_opcode::s_cbranch_vccz:
inv = aco_opcode::s_cbranch_vccnz;
break;
case aco_opcode::s_cbranch_vccnz:
inv = aco_opcode::s_cbranch_vccz;
break;
case aco_opcode::s_cbranch_execz:
inv = aco_opcode::s_cbranch_execnz;
break;
case aco_opcode::s_cbranch_execnz:
inv = aco_opcode::s_cbranch_execz;
break;
default:
unreachable("Unhandled long jump.");
case aco_opcode::s_cbranch_scc0: inv = aco_opcode::s_cbranch_scc1; break;
case aco_opcode::s_cbranch_scc1: inv = aco_opcode::s_cbranch_scc0; break;
case aco_opcode::s_cbranch_vccz: inv = aco_opcode::s_cbranch_vccnz; break;
case aco_opcode::s_cbranch_vccnz: inv = aco_opcode::s_cbranch_vccz; break;
case aco_opcode::s_cbranch_execz: inv = aco_opcode::s_cbranch_execnz; break;
case aco_opcode::s_cbranch_execnz: inv = aco_opcode::s_cbranch_execz; break;
default: unreachable("Unhandled long jump.");
}
instr.reset(bld.sopp(inv, -1, 7));
emit_instruction(ctx, out, instr.get());
@ -891,7 +909,9 @@ void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards,
emit_instruction(ctx, out, instr.get());
branch->pass_flags = out.size();
instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi, Operand(backwards ? UINT32_MAX : 0u)).instr);
instr.reset(
bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi, Operand(backwards ? UINT32_MAX : 0u))
.instr);
emit_instruction(ctx, out, instr.get());
/* restore SCC and clear the LSB of the new PC */
@ -901,11 +921,13 @@ void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards,
emit_instruction(ctx, out, instr.get());
/* create the s_setpc_b64 to jump */
instr.reset(bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr);
instr.reset(
bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr);
emit_instruction(ctx, out, instr.get());
}
void fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
void
fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
{
bool repeat = false;
do {
@ -914,11 +936,12 @@ void fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
if (ctx.chip_class == GFX10)
fix_branches_gfx10(ctx, out);
for (std::pair<int, SOPP_instruction*> &branch : ctx.branches) {
for (std::pair<int, SOPP_instruction*>& branch : ctx.branches) {
int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1;
if ((offset < INT16_MIN || offset > INT16_MAX) && !branch.second->pass_flags) {
std::vector<uint32_t> long_jump;
bool backwards = ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first;
bool backwards =
ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first;
emit_long_jump(ctx, branch.second, backwards, long_jump);
out[branch.first] = long_jump[0];
@ -934,13 +957,14 @@ void fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
out[branch.first + branch.second->pass_flags - 1] = offset * 4;
} else {
out[branch.first] &= 0xffff0000u;
out[branch.first] |= (uint16_t) offset;
out[branch.first] |= (uint16_t)offset;
}
}
} while (repeat);
}
void fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
void
fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
{
for (auto& constaddr : ctx.constaddrs) {
constaddr_info& info = constaddr.second;
@ -948,13 +972,12 @@ void fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
}
}
unsigned emit_program(Program* program,
std::vector<uint32_t>& code)
unsigned
emit_program(Program* program, std::vector<uint32_t>& code)
{
asm_context ctx(program);
if (program->stage.hw == HWStage::VS ||
program->stage.hw == HWStage::FS ||
if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS ||
program->stage.hw == HWStage::NGG)
fix_exports(ctx, code, program);
@ -986,4 +1009,4 @@ unsigned emit_program(Program* program,
return exec_size;
}
}
} // namespace aco

src/amd/compiler/aco_dead_code_analysis.cpp

@ -40,7 +40,8 @@ struct dce_ctx {
std::vector<uint16_t> uses;
std::vector<std::vector<bool>> live;
dce_ctx(Program* program) : current_block(program->blocks.size() - 1), uses(program->peekAllocationId())
dce_ctx(Program* program)
: current_block(program->blocks.size() - 1), uses(program->peekAllocationId())
{
live.reserve(program->blocks.size());
for (Block& block : program->blocks)
@ -48,7 +49,8 @@ struct dce_ctx {
}
};
void process_block(dce_ctx& ctx, Block& block)
void
process_block(dce_ctx& ctx, Block& block)
{
std::vector<bool>& live = ctx.live[block.index];
assert(live.size() == block.instructions.size());
@ -72,23 +74,26 @@ void process_block(dce_ctx& ctx, Block& block)
if (process_predecessors) {
for (unsigned pred_idx : block.linear_preds)
ctx.current_block = std::max(ctx.current_block, (int) pred_idx);
ctx.current_block = std::max(ctx.current_block, (int)pred_idx);
}
}
} /* end namespace */
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr)
bool
is_dead(const std::vector<uint16_t>& uses, Instruction* instr)
{
if (instr->definitions.empty() || instr->isBranch())
return false;
if (std::any_of(instr->definitions.begin(), instr->definitions.end(),
[&uses] (const Definition& def) { return !def.isTemp() || uses[def.tempId()];}))
[&uses](const Definition& def) { return !def.isTemp() || uses[def.tempId()]; }))
return false;
return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel));
}
std::vector<uint16_t> dead_code_analysis(Program *program) {
std::vector<uint16_t>
dead_code_analysis(Program* program)
{
dce_ctx ctx(program);
@ -105,5 +110,4 @@ std::vector<uint16_t> dead_code_analysis(Program *program) {
return ctx.uses;
}
}
} // namespace aco

src/amd/compiler/aco_dominance.cpp

@ -38,7 +38,8 @@
namespace aco {
void dominator_tree(Program* program)
void
dominator_tree(Program* program)
{
program->blocks[0].logical_idom = 0;
program->blocks[0].linear_idom = 0;
@ -48,7 +49,7 @@ void dominator_tree(Program* program)
int new_logical_idom = -1;
int new_linear_idom = -1;
for (unsigned pred_idx : block.logical_preds) {
if ((int) program->blocks[pred_idx].logical_idom == -1)
if ((int)program->blocks[pred_idx].logical_idom == -1)
continue;
if (new_logical_idom == -1) {
@ -56,16 +57,16 @@ void dominator_tree(Program* program)
continue;
}
while ((int) pred_idx != new_logical_idom) {
if ((int) pred_idx > new_logical_idom)
while ((int)pred_idx != new_logical_idom) {
if ((int)pred_idx > new_logical_idom)
pred_idx = program->blocks[pred_idx].logical_idom;
if ((int) pred_idx < new_logical_idom)
if ((int)pred_idx < new_logical_idom)
new_logical_idom = program->blocks[new_logical_idom].logical_idom;
}
}
for (unsigned pred_idx : block.linear_preds) {
if ((int) program->blocks[pred_idx].linear_idom == -1)
if ((int)program->blocks[pred_idx].linear_idom == -1)
continue;
if (new_linear_idom == -1) {
@ -73,10 +74,10 @@ void dominator_tree(Program* program)
continue;
}
while ((int) pred_idx != new_linear_idom) {
if ((int) pred_idx > new_linear_idom)
while ((int)pred_idx != new_linear_idom) {
if ((int)pred_idx > new_linear_idom)
pred_idx = program->blocks[pred_idx].linear_idom;
if ((int) pred_idx < new_linear_idom)
if ((int)pred_idx < new_linear_idom)
new_linear_idom = program->blocks[new_linear_idom].linear_idom;
}
}
@ -86,5 +87,5 @@ void dominator_tree(Program* program)
}
}
}
} // namespace aco
#endif

src/amd/compiler/aco_form_hard_clauses.cpp

@ -31,15 +31,15 @@ namespace aco {
namespace {
/* there can also be LDS and VALU clauses, but I don't see how those are interesting */
enum clause_type
{
enum clause_type {
clause_vmem,
clause_flat,
clause_smem,
clause_other,
};
void emit_clause(Builder& bld, unsigned num_instrs, aco_ptr<Instruction> *instrs)
void
emit_clause(Builder& bld, unsigned num_instrs, aco_ptr<Instruction>* instrs)
{
unsigned start = 0;
@ -61,7 +61,8 @@ void emit_clause(Builder& bld, unsigned num_instrs, aco_ptr<Instruction> *instrs
} /* end namespace */
void form_hard_clauses(Program *program)
void
form_hard_clauses(Program* program)
{
for (Block& block : program->blocks) {
unsigned num_instrs = 0;
@ -77,7 +78,8 @@ void form_hard_clauses(Program *program)
clause_type type = clause_other;
if (instr->isVMEM() && !instr->operands.empty()) {
if (program->chip_class == GFX10 && instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0)
if (program->chip_class == GFX10 && instr->isMIMG() &&
get_mimg_nsa_dwords(instr.get()) > 0)
type = clause_other;
else
type = clause_vmem;
@ -109,4 +111,4 @@ void form_hard_clauses(Program *program)
block.instructions = std::move(new_instructions);
}
}
}
} // namespace aco

src/amd/compiler/aco_insert_NOPs.cpp

@ -34,12 +34,15 @@ namespace aco {
namespace {
struct NOP_ctx_gfx6 {
void join(const NOP_ctx_gfx6 &other) {
set_vskip_mode_then_vector = MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector);
void join(const NOP_ctx_gfx6& other)
{
set_vskip_mode_then_vector =
MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector);
valu_wr_vcc_then_vccz = MAX2(valu_wr_vcc_then_vccz, other.valu_wr_vcc_then_vccz);
valu_wr_exec_then_execz = MAX2(valu_wr_exec_then_execz, other.valu_wr_exec_then_execz);
valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas);
salu_wr_m0_then_gds_msg_ttrace = MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace);
salu_wr_m0_then_gds_msg_ttrace =
MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace);
valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp);
salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds);
salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel);
@ -53,23 +56,21 @@ struct NOP_ctx_gfx6 {
}
}
bool operator==(const NOP_ctx_gfx6 &other)
bool operator==(const NOP_ctx_gfx6& other)
{
return
set_vskip_mode_then_vector == other.set_vskip_mode_then_vector &&
valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz &&
valu_wr_exec_then_execz == other.valu_wr_exec_then_execz &&
valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas &&
vmem_store_then_wr_data == other.vmem_store_then_wr_data &&
salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace &&
valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp &&
salu_wr_m0_then_lds == other.salu_wr_m0_then_lds &&
salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel &&
setreg_then_getsetreg == other.setreg_then_getsetreg &&
smem_clause == other.smem_clause &&
smem_write == other.smem_write &&
BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) &&
BITSET_EQUAL(smem_clause_write, other.smem_clause_write);
return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector &&
valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz &&
valu_wr_exec_then_execz == other.valu_wr_exec_then_execz &&
valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas &&
vmem_store_then_wr_data == other.vmem_store_then_wr_data &&
salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace &&
valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp &&
salu_wr_m0_then_lds == other.salu_wr_m0_then_lds &&
salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel &&
setreg_then_getsetreg == other.setreg_then_getsetreg &&
smem_clause == other.smem_clause && smem_write == other.smem_write &&
BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) &&
BITSET_EQUAL(smem_clause_write, other.smem_clause_write);
}
void add_wait_states(unsigned amount)
@ -154,7 +155,8 @@ struct NOP_ctx_gfx10 {
std::bitset<128> sgprs_read_by_VMEM;
std::bitset<128> sgprs_read_by_SMEM;
void join(const NOP_ctx_gfx10 &other) {
void join(const NOP_ctx_gfx10& other)
{
has_VOPC |= other.has_VOPC;
has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
has_VMEM |= other.has_VMEM;
@ -167,23 +169,19 @@ struct NOP_ctx_gfx10 {
sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
}
bool operator==(const NOP_ctx_gfx10 &other)
bool operator==(const NOP_ctx_gfx10& other)
{
return
has_VOPC == other.has_VOPC &&
has_nonVALU_exec_read == other.has_nonVALU_exec_read &&
has_VMEM == other.has_VMEM &&
has_branch_after_VMEM == other.has_branch_after_VMEM &&
has_DS == other.has_DS &&
has_branch_after_DS == other.has_branch_after_DS &&
has_NSA_MIMG == other.has_NSA_MIMG &&
has_writelane == other.has_writelane &&
sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
return has_VOPC == other.has_VOPC && has_nonVALU_exec_read == other.has_nonVALU_exec_read &&
has_VMEM == other.has_VMEM && has_branch_after_VMEM == other.has_branch_after_VMEM &&
has_DS == other.has_DS && has_branch_after_DS == other.has_branch_after_DS &&
has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane &&
sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
}
};
int get_wait_states(aco_ptr<Instruction>& instr)
int
get_wait_states(aco_ptr<Instruction>& instr)
{
if (instr->opcode == aco_opcode::s_nop)
return instr->sopp().imm + 1;
@ -193,16 +191,16 @@ int get_wait_states(aco_ptr<Instruction>& instr)
return 1;
}
bool regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
bool
regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
{
return a_reg > b_reg ?
(a_reg - b_reg < b_size) :
(b_reg - a_reg < a_size);
return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size);
}
template <bool Valu, bool Vintrp, bool Salu>
int handle_raw_hazard_internal(Program *program, Block *block,
int nops_needed, PhysReg reg, uint32_t mask)
int
handle_raw_hazard_internal(Program* program, Block* block, int nops_needed, PhysReg reg,
uint32_t mask)
{
unsigned mask_size = util_last_bit(mask);
for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) {
@ -217,10 +215,8 @@ int handle_raw_hazard_internal(Program *program, Block *block,
}
}
bool is_hazard = writemask != 0 &&
((pred->isVALU() && Valu) ||
(pred->isVINTRP() && Vintrp) ||
(pred->isSALU() && Salu));
bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) ||
(pred->isVINTRP() && Vintrp) || (pred->isSALU() && Salu));
if (is_hazard)
return nops_needed;
@ -238,17 +234,19 @@ int handle_raw_hazard_internal(Program *program, Block *block,
* huge value. */
for (unsigned lin_pred : block->linear_preds) {
res = std::max(res, handle_raw_hazard_internal<Valu, Vintrp, Salu>(
program, &program->blocks[lin_pred], nops_needed, reg, mask));
program, &program->blocks[lin_pred], nops_needed, reg, mask));
}
return res;
}
template <bool Valu, bool Vintrp, bool Salu>
void handle_raw_hazard(Program *program, Block *cur_block, int *NOPs, int min_states, Operand op)
void
handle_raw_hazard(Program* program, Block* cur_block, int* NOPs, int min_states, Operand op)
{
if (*NOPs >= min_states)
return;
int res = handle_raw_hazard_internal<Valu, Vintrp, Salu>(program, cur_block, min_states, op.physReg(), u_bit_consecutive(0, op.size()));
int res = handle_raw_hazard_internal<Valu, Vintrp, Salu>(
program, cur_block, min_states, op.physReg(), u_bit_consecutive(0, op.size()));
*NOPs = MAX2(*NOPs, res);
}
@ -256,7 +254,9 @@ static auto handle_valu_then_read_hazard = handle_raw_hazard<true, true, false>;
static auto handle_vintrp_then_read_hazard = handle_raw_hazard<false, true, false>;
static auto handle_valu_salu_then_read_hazard = handle_raw_hazard<true, true, true>;
void set_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
void
set_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
{
unsigned end = start + size - 1;
unsigned start_mod = start % BITSET_WORDBITS;
if (start_mod + size <= BITSET_WORDBITS) {
@ -268,7 +268,9 @@ void set_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
}
}
bool test_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
bool
test_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
{
unsigned end = start + size - 1;
unsigned start_mod = start % BITSET_WORDBITS;
if (start_mod + size <= BITSET_WORDBITS) {
@ -291,18 +293,21 @@ bool test_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
*
* SMEM clauses are only present on GFX8+, and only matter when XNACK is set.
*/
void handle_smem_clause_hazards(Program *program, NOP_ctx_gfx6 &ctx,
aco_ptr<Instruction>& instr, int *NOPs)
void
handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
int* NOPs)
{
/* break off from previous SMEM clause if needed */
if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) {
/* Don't allow clauses with store instructions since the clause's
* instructions may use the same address. */
if (ctx.smem_write || instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
if (ctx.smem_write || instr->definitions.empty() ||
instr_info.is_atomic[(unsigned)instr->opcode]) {
*NOPs = 1;
} else if (program->dev.xnack_enabled) {
for (Operand op : instr->operands) {
if (!op.isConstant() && test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
if (!op.isConstant() &&
test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
*NOPs = 1;
break;
}
@ -316,8 +321,10 @@ void handle_smem_clause_hazards(Program *program, NOP_ctx_gfx6 &ctx,
}
/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &ctx,
aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& new_instructions)
void
handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
aco_ptr<Instruction>& instr,
std::vector<aco_ptr<Instruction>>& new_instructions)
{
/* check hazards */
int NOPs = 0;
@ -343,14 +350,17 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
handle_smem_clause_hazards(program, ctx, instr, &NOPs);
} else if (instr->isSALU()) {
if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
if (instr->opcode == aco_opcode::s_setreg_b32 ||
instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
instr->opcode == aco_opcode::s_getreg_b32) {
NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
}
if (program->chip_class == GFX9) {
if (instr->opcode == aco_opcode::s_movrels_b32 || instr->opcode == aco_opcode::s_movrels_b64 ||
instr->opcode == aco_opcode::s_movreld_b32 || instr->opcode == aco_opcode::s_movreld_b64) {
if (instr->opcode == aco_opcode::s_movrels_b32 ||
instr->opcode == aco_opcode::s_movrels_b64 ||
instr->opcode == aco_opcode::s_movreld_b32 ||
instr->opcode == aco_opcode::s_movreld_b64) {
NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
}
}
@ -398,7 +408,8 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
handle_vintrp_then_read_hazard(program, cur_block, &NOPs, 1, instr->operands[0]);
}
if (instr->opcode == aco_opcode::v_div_fmas_f32 || instr->opcode == aco_opcode::v_div_fmas_f64)
if (instr->opcode == aco_opcode::v_div_fmas_f32 ||
instr->opcode == aco_opcode::v_div_fmas_f64)
NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);
} else if (instr->isVMEM() || instr->isFlatLike()) {
/* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
@ -412,13 +423,11 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);
if (program->chip_class == GFX9) {
bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) &&
instr->flatlike().lds;
if (instr->isVINTRP() ||
bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds;
if (instr->isVINTRP() || lds_scratch_global ||
instr->opcode == aco_opcode::ds_read_addtid_b32 ||
instr->opcode == aco_opcode::ds_write_addtid_b32 ||
instr->opcode == aco_opcode::buffer_store_lds_dword ||
lds_scratch_global) {
instr->opcode == aco_opcode::buffer_store_lds_dword) {
NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);
}
}
@ -428,7 +437,8 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
// TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
if (NOPs) {
/* create NOP */
aco_ptr<SOPP_instruction> nop{create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
aco_ptr<SOPP_instruction> nop{
create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
nop->imm = NOPs - 1;
nop->block = -1;
new_instructions.emplace_back(std::move(nop));
@ -485,7 +495,8 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
ctx.salu_wr_m0_then_lds = 1;
ctx.salu_wr_m0_then_moverel = 1;
}
} else if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32) {
} else if (instr->opcode == aco_opcode::s_setreg_b32 ||
instr->opcode == aco_opcode::s_setreg_imm32_b32) {
SOPK_instruction& sopk = instr->sopk();
unsigned offset = (sopk.imm >> 6) & 0x1f;
unsigned size = ((sopk.imm >> 11) & 0x1f) + 1;
@ -497,19 +508,16 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
}
} else if (instr->isVMEM() || instr->isFlatLike()) {
/* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) &&
instr->operands.size() == 4 &&
instr->operands[3].size() > 2 &&
instr->operands[2].physReg() >= 128;
/* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit store) */
bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 &&
instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128;
/* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit
* store) */
bool consider_mimg = instr->isMIMG() &&
instr->operands[1].regClass().type() == RegType::vgpr &&
instr->operands[1].size() > 2 &&
instr->operands[0].size() == 4;
instr->operands[1].size() > 2 && instr->operands[0].size() == 4;
/* FLAT/GLOBAL/SCRATCH store with >64-bit data */
bool consider_flat = instr->isFlatLike() &&
instr->operands.size() == 3 &&
instr->operands[2].size() > 2;
bool consider_flat =
instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2;
if (consider_buf || consider_mimg || consider_flat) {
PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg();
unsigned size = instr->operands[consider_flat ? 2 : 3].size();
@ -520,22 +528,26 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
}
template <std::size_t N>
bool check_written_regs(const aco_ptr<Instruction> &instr, const std::bitset<N> &check_regs)
bool
check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
{
return std::any_of(instr->definitions.begin(), instr->definitions.end(), [&check_regs](const Definition &def) -> bool {
bool writes_any = false;
for (unsigned i = 0; i < def.size(); i++) {
unsigned def_reg = def.physReg() + i;
writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
}
return writes_any;
});
return std::any_of(instr->definitions.begin(), instr->definitions.end(),
[&check_regs](const Definition& def) -> bool
{
bool writes_any = false;
for (unsigned i = 0; i < def.size(); i++) {
unsigned def_reg = def.physReg() + i;
writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
}
return writes_any;
});
}
template <std::size_t N>
void mark_read_regs(const aco_ptr<Instruction> &instr, std::bitset<N> &reg_reads)
void
mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
{
for (const Operand &op : instr->operands) {
for (const Operand& op : instr->operands) {
for (unsigned i = 0; i < op.size(); i++) {
unsigned reg = op.physReg() + i;
if (reg < reg_reads.size())
@ -544,7 +556,8 @@ void mark_read_regs(const aco_ptr<Instruction> &instr, std::bitset<N> &reg_reads
}
}
bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
bool
VALU_writes_sgpr(aco_ptr<Instruction>& instr)
{
if (instr->isVOPC())
return true;
@ -557,24 +570,26 @@ bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
return false;
}
bool instr_writes_exec(const aco_ptr<Instruction>& instr)
bool
instr_writes_exec(const aco_ptr<Instruction>& instr)
{
return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool {
return def.physReg() == exec_lo || def.physReg() == exec_hi;
});
return std::any_of(instr->definitions.begin(), instr->definitions.end(),
[](const Definition& def) -> bool
{ return def.physReg() == exec_lo || def.physReg() == exec_hi; });
}
bool instr_writes_sgpr(const aco_ptr<Instruction>& instr)
bool
instr_writes_sgpr(const aco_ptr<Instruction>& instr)
{
return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool {
return def.getTemp().type() == RegType::sgpr;
});
return std::any_of(instr->definitions.begin(), instr->definitions.end(),
[](const Definition& def) -> bool
{ return def.getTemp().type() == RegType::sgpr; });
}
inline bool instr_is_branch(const aco_ptr<Instruction>& instr)
inline bool
instr_is_branch(const aco_ptr<Instruction>& instr)
{
return instr->opcode == aco_opcode::s_branch ||
instr->opcode == aco_opcode::s_cbranch_scc0 ||
return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 ||
instr->opcode == aco_opcode::s_cbranch_scc1 ||
instr->opcode == aco_opcode::s_cbranch_vccz ||
instr->opcode == aco_opcode::s_cbranch_vccnz ||
@ -586,19 +601,20 @@ inline bool instr_is_branch(const aco_ptr<Instruction>& instr)
instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user ||
instr->opcode == aco_opcode::s_subvector_loop_begin ||
instr->opcode == aco_opcode::s_subvector_loop_end ||
instr->opcode == aco_opcode::s_setpc_b64 ||
instr->opcode == aco_opcode::s_swappc_b64 ||
instr->opcode == aco_opcode::s_getpc_b64 ||
instr->opcode == aco_opcode::s_call_b64;
instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 ||
instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64;
}
void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 &ctx,
aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& new_instructions)
void
handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx,
aco_ptr<Instruction>& instr,
std::vector<aco_ptr<Instruction>>& new_instructions)
{
//TODO: s_dcache_inv needs to be in it's own group on GFX10
// TODO: s_dcache_inv needs to be in it's own group on GFX10
/* VMEMtoScalarWriteHazard
* Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between.
* Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)"
* in-between.
*/
if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) {
/* Remember all SGPRs that are read by the VMEM instruction */
@ -624,7 +640,8 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
ctx.sgprs_read_by_VMEM.reset();
/* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
aco_ptr<SOPP_instruction> depctr{create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
aco_ptr<SOPP_instruction> depctr{
create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
depctr->imm = 0xffe3;
depctr->block = -1;
new_instructions.emplace_back(std::move(depctr));
@ -639,13 +656,13 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
*/
if (instr->isVOPC()) {
ctx.has_VOPC = true;
} else if (ctx.has_VOPC &&
(instr->opcode == aco_opcode::v_permlane16_b32 ||
instr->opcode == aco_opcode::v_permlanex16_b32)) {
} else if (ctx.has_VOPC && (instr->opcode == aco_opcode::v_permlane16_b32 ||
instr->opcode == aco_opcode::v_permlanex16_b32)) {
ctx.has_VOPC = false;
/* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
aco_ptr<VOP1_instruction> v_mov{create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)};
aco_ptr<VOP1_instruction> v_mov{
create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)};
v_mov->definitions[0] = Definition(instr->operands[0].physReg(), v1);
v_mov->operands[0] = Operand(instr->operands[0].physReg(), v1);
new_instructions.emplace_back(std::move(v_mov));
@ -663,7 +680,8 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
ctx.has_nonVALU_exec_read = false;
/* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
aco_ptr<SOPP_instruction> depctr{create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
aco_ptr<SOPP_instruction> depctr{
create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
depctr->imm = 0xfffe;
depctr->block = -1;
new_instructions.emplace_back(std::move(depctr));
@ -689,7 +707,8 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
ctx.sgprs_read_by_SMEM.reset();
/* Insert s_mov to mitigate the problem */
aco_ptr<SOP1_instruction> s_mov{create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)};
aco_ptr<SOP1_instruction> s_mov{
create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)};
s_mov->definitions[0] = Definition(sgpr_null, s1);
s_mov->operands[0] = Operand(0u);
new_instructions.emplace_back(std::move(s_mov));
@ -738,14 +757,16 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
/* Insert s_waitcnt_vscnt to mitigate the problem */
aco_ptr<SOPK_instruction> wait{create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)};
aco_ptr<SOPK_instruction> wait{
create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)};
wait->definitions[0] = Definition(sgpr_null, s1);
wait->imm = 0;
new_instructions.emplace_back(std::move(wait));
}
/* NSAToVMEMBug
* Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] != 0).
* Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] !=
* 0).
*/
if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) {
ctx.has_NSA_MIMG = true;
@ -772,11 +793,12 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
}
template <typename Ctx>
using HandleInstr = void (*)(Program *, Block *block, Ctx&, aco_ptr<Instruction>&,
using HandleInstr = void (*)(Program*, Block* block, Ctx&, aco_ptr<Instruction>&,
std::vector<aco_ptr<Instruction>>&);
template <typename Ctx, HandleInstr<Ctx> Handle>
void handle_block(Program *program, Ctx& ctx, Block& block)
void
handle_block(Program* program, Ctx& ctx, Block& block)
{
if (block.instructions.empty())
return;
@ -793,14 +815,15 @@ void handle_block(Program *program, Ctx& ctx, Block& block)
}
template <typename Ctx, HandleInstr<Ctx> Handle>
void mitigate_hazards(Program *program)
void
mitigate_hazards(Program* program)
{
std::vector<Ctx> all_ctx(program->blocks.size());
std::stack<unsigned> loop_header_indices;
for (unsigned i = 0; i < program->blocks.size(); i++) {
Block& block = program->blocks[i];
Ctx &ctx = all_ctx[i];
Ctx& ctx = all_ctx[i];
if (block.kind & block_kind_loop_header) {
loop_header_indices.push(i);
@ -832,7 +855,8 @@ void mitigate_hazards(Program *program)
} /* end namespace */
void insert_NOPs(Program* program)
void
insert_NOPs(Program* program)
{
if (program->chip_class >= GFX10_3)
; /* no hazards/bugs to mitigate */
@ -842,4 +866,4 @@ void insert_NOPs(Program* program)
mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6>(program);
}
}
} // namespace aco

src/amd/compiler/aco_insert_exec_mask.cpp

@ -24,6 +24,7 @@
#include "aco_builder.h"
#include "aco_ir.h"
#include "util/u_math.h"
#include <set>
@ -55,10 +56,9 @@ struct wqm_ctx {
std::vector<uint16_t> defined_in;
std::vector<bool> needs_wqm;
std::vector<bool> branch_wqm; /* true if the branch condition in this block should be in wqm */
wqm_ctx(Program* program_) : program(program_),
defined_in(program->peekAllocationId(), 0xFFFF),
needs_wqm(program->peekAllocationId()),
branch_wqm(program->blocks.size())
wqm_ctx(Program* program_)
: program(program_), defined_in(program->peekAllocationId(), 0xFFFF),
needs_wqm(program->peekAllocationId()), branch_wqm(program->blocks.size())
{
for (unsigned i = 0; i < program->blocks.size(); i++)
worklist.insert(i);
@ -72,13 +72,15 @@ struct loop_info {
bool has_divergent_break;
bool has_divergent_continue;
bool has_discard; /* has a discard or demote */
loop_info(Block* b, uint16_t num, uint8_t needs_, bool breaks, bool cont, bool discard) :
loop_header(b), num_exec_masks(num), needs(needs_), has_divergent_break(breaks),
has_divergent_continue(cont), has_discard(discard) {}
loop_info(Block* b, uint16_t num, uint8_t needs_, bool breaks, bool cont, bool discard)
: loop_header(b), num_exec_masks(num), needs(needs_), has_divergent_break(breaks),
has_divergent_continue(cont), has_discard(discard)
{}
};
struct block_info {
std::vector<std::pair<Operand, uint8_t>> exec; /* Vector of exec masks. Either a temporary or const -1. */
std::vector<std::pair<Operand, uint8_t>>
exec; /* Vector of exec masks. Either a temporary or const -1. */
std::vector<WQMState> instr_needs;
uint8_t block_needs;
uint8_t ever_again_needs;
@ -87,14 +89,16 @@ struct block_info {
};
struct exec_ctx {
Program *program;
Program* program;
std::vector<block_info> info;
std::vector<loop_info> loop;
bool handle_wqm = false;
exec_ctx(Program *program_) : program(program_), info(program->blocks.size()) {}
exec_ctx(Program* program_) : program(program_), info(program->blocks.size()) {}
};
bool needs_exact(aco_ptr<Instruction>& instr) {
bool
needs_exact(aco_ptr<Instruction>& instr)
{
if (instr->isMUBUF()) {
return instr->mubuf().disable_wqm;
} else if (instr->isMTBUF()) {
@ -108,7 +112,8 @@ bool needs_exact(aco_ptr<Instruction>& instr) {
}
}
void set_needs_wqm(wqm_ctx &ctx, Temp tmp)
void
set_needs_wqm(wqm_ctx& ctx, Temp tmp)
{
if (!ctx.needs_wqm[tmp.id()]) {
ctx.needs_wqm[tmp.id()] = true;
@ -117,7 +122,8 @@ void set_needs_wqm(wqm_ctx &ctx, Temp tmp)
}
}
void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx)
void
mark_block_wqm(wqm_ctx& ctx, unsigned block_idx)
{
if (ctx.branch_wqm[block_idx])
return;
@ -136,7 +142,8 @@ void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx)
mark_block_wqm(ctx, pred_idx);
}
void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
void
get_block_needs(wqm_ctx& ctx, exec_ctx& exec_ctx, Block* block)
{
block_info& info = exec_ctx.info[block->index];
@ -146,8 +153,8 @@ void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
aco_ptr<Instruction>& instr = block->instructions[i];
WQMState needs = needs_exact(instr) ? Exact : Unspecified;
bool propagate_wqm = instr->opcode == aco_opcode::p_wqm ||
instr->opcode == aco_opcode::p_as_uniform;
bool propagate_wqm =
instr->opcode == aco_opcode::p_wqm || instr->opcode == aco_opcode::p_as_uniform;
bool preserve_wqm = instr->opcode == aco_opcode::p_discard_if;
bool pred_by_exec = needs_exec_mask(instr.get());
for (const Definition& definition : instr->definitions) {
@ -214,7 +221,8 @@ void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
* breaks, which might benefit from being in exact) by adding Exact_Branch to a
* divergent branch surrounding the nested loop, if such a branch exists.
*/
void handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
void
handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
{
for (unsigned idx = preheader + 1; idx < exec_ctx.program->blocks.size(); idx++) {
Block& block = exec_ctx.program->blocks[idx];
@ -231,7 +239,8 @@ void handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
* ensure that the exact exec mask is not empty by adding Exact_Branch to
* the outer divergent branch.
*/
void handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
void
handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
{
assert(exec_ctx.program->blocks[preheader + 1].kind & block_kind_loop_header);
@ -265,7 +274,8 @@ void handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
}
}
void calculate_wqm_needs(exec_ctx& exec_ctx)
void
calculate_wqm_needs(exec_ctx& exec_ctx)
{
wqm_ctx ctx(exec_ctx.program);
@ -307,14 +317,12 @@ void calculate_wqm_needs(exec_ctx& exec_ctx)
exec_ctx.info[i].block_needs |= Exact;
/* if discard is used somewhere in nested CF, we need to preserve the WQM mask */
if ((block.kind & block_kind_discard ||
block.kind & block_kind_uses_discard_if) &&
if ((block.kind & block_kind_discard || block.kind & block_kind_uses_discard_if) &&
ever_again_needs & WQM)
exec_ctx.info[i].block_needs |= Preserve_WQM;
ever_again_needs |= exec_ctx.info[i].block_needs & ~Exact_Branch;
if (block.kind & block_kind_discard ||
block.kind & block_kind_uses_discard_if ||
if (block.kind & block_kind_discard || block.kind & block_kind_uses_discard_if ||
block.kind & block_kind_uses_demote)
ever_again_needs |= Exact;
@ -327,7 +335,8 @@ void calculate_wqm_needs(exec_ctx& exec_ctx)
exec_ctx.handle_wqm = true;
}
Operand get_exec_op(Operand t)
Operand
get_exec_op(Operand t)
{
if (t.isUndefined())
return Operand(exec, t.regClass());
@ -335,7 +344,8 @@ Operand get_exec_op(Operand t)
return t;
}
void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
void
transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
{
if (ctx.info[idx].exec.back().second & mask_type_wqm)
return;
@ -346,7 +356,8 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
ctx.info[idx].exec.back().first = exec_mask;
}
exec_mask = bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), get_exec_op(exec_mask));
exec_mask = bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc),
get_exec_op(exec_mask));
ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
return;
}
@ -355,11 +366,12 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
assert(ctx.info[idx].exec.back().second & mask_type_wqm);
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
assert(ctx.info[idx].exec.back().first.isTemp());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
ctx.info[idx].exec.back().first);
ctx.info[idx].exec.back().first = bld.pseudo(
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
}
void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
void
transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
{
if (ctx.info[idx].exec.back().second & mask_type_exact)
return;
@ -372,8 +384,8 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
assert(ctx.info[idx].exec.back().second & mask_type_exact);
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
assert(ctx.info[idx].exec.back().first.isTemp());
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
ctx.info[idx].exec.back().first);
ctx.info[idx].exec.back().first = bld.pseudo(
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
return;
}
/* otherwise, we create an exact mask and push to the stack */
@ -382,14 +394,15 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
Definition(exec, bld.lm), ctx.info[idx].exec[0].first, Operand(exec, bld.lm));
} else {
bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc), ctx.info[idx].exec[0].first, wqm);
bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc),
ctx.info[idx].exec[0].first, wqm);
}
ctx.info[idx].exec.back().first = Operand(wqm);
ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type_exact);
}
unsigned add_coupling_code(exec_ctx& ctx, Block* block,
std::vector<aco_ptr<Instruction>>& instructions)
unsigned
add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>& instructions)
{
unsigned idx = block->index;
Builder bld(ctx.program, &instructions);
@ -417,7 +430,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
} else {
uint8_t mask = mask_type_global;
if (ctx.program->needs_wqm) {
bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), Operand(exec, bld.lm));
bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc),
Operand(exec, bld.lm));
mask |= mask_type_wqm;
} else {
mask |= mask_type_exact;
@ -440,7 +454,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
if (info.has_discard) {
aco_ptr<Pseudo_instruction> phi;
for (int i = 0; i < info.num_exec_masks - 1; i++) {
phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi,
Format::PSEUDO, preds.size(), 1));
phi->definitions[0] = bld.def(bld.lm);
phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[i].first);
ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
@ -450,14 +465,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
/* create ssa name for restore mask */
if (info.has_divergent_break) {
/* this phi might be trivial but ensures a parallelcopy on the loop header */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
phi->definitions[0] = bld.def(bld.lm);
phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
}
/* create ssa name for loop active mask */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
if (info.has_divergent_continue)
phi->definitions[0] = bld.def(bld.lm);
else
@ -466,7 +483,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
Temp loop_active = bld.insert(std::move(phi));
if (info.has_divergent_break) {
uint8_t mask_type = (ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop;
uint8_t mask_type =
(ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop;
ctx.info[idx].exec.emplace_back(loop_active, mask_type);
} else {
ctx.info[idx].exec.back().first = Operand(loop_active);
@ -482,8 +500,10 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
}
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
ctx.info[idx].exec.back().first), mask_type);
ctx.info[idx].exec.emplace_back(
bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
ctx.info[idx].exec.back().first),
mask_type);
}
return i;
@ -514,14 +534,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
aco_ptr<Instruction>& phi = header->instructions[instr_idx++];
assert(phi->opcode == aco_opcode::p_linear_phi);
for (unsigned i = 1; i < phi->operands.size(); i++)
phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first);
phi->operands[i] =
get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first);
}
if (info.has_divergent_break) {
aco_ptr<Instruction>& phi = header->instructions[instr_idx];
assert(phi->opcode == aco_opcode::p_linear_phi);
for (unsigned i = 1; i < phi->operands.size(); i++)
phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
phi->operands[i] =
get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
}
assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2);
@ -541,7 +563,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
ctx.info[idx].exec.emplace_back(same, type);
} else {
/* create phi for loop footer */
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
phi->definitions[0] = bld.def(bld.lm);
if (exec_idx == info.num_exec_masks - 1u) {
phi->definitions[0] = Definition(exec, bld.lm);
@ -578,8 +601,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
if (get_exec_op(ctx.info[idx].exec.back().first).isTemp()) {
/* move current exec mask into exec register */
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
ctx.info[idx].exec.back().first);
ctx.info[idx].exec.back().first = bld.pseudo(
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
}
ctx.loop.pop_back();
@ -591,8 +614,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
} else {
assert(preds.size() == 2);
/* if one of the predecessors ends in exact mask, we pop it from stack */
unsigned num_exec_masks = std::min(ctx.info[preds[0]].exec.size(),
ctx.info[preds[1]].exec.size());
unsigned num_exec_masks =
std::min(ctx.info[preds[0]].exec.size(), ctx.info[preds[1]].exec.size());
if (block->kind & block_kind_merge)
num_exec_masks--;
@ -605,14 +628,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
if (ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) {
Operand t = ctx.info[preds[0]].exec[i].first;
/* discard/demote can change the state of the current exec mask */
assert(!t.isTemp() || ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second);
assert(!t.isTemp() ||
ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second);
uint8_t mask = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
ctx.info[idx].exec.emplace_back(t, mask);
continue;
}
bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge);
Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? Definition(exec, bld.lm) : bld.def(bld.lm),
Temp phi = bld.pseudo(aco_opcode::p_linear_phi,
in_exec ? Definition(exec, bld.lm) : bld.def(bld.lm),
get_exec_op(ctx.info[preds[0]].exec[i].first),
get_exec_op(ctx.info[preds[1]].exec[i].first));
uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
@ -654,9 +679,9 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
return i;
}
void process_instructions(exec_ctx& ctx, Block* block,
std::vector<aco_ptr<Instruction>>& instructions,
unsigned idx)
void
process_instructions(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>& instructions,
unsigned idx)
{
WQMState state;
if (ctx.info[block->index].exec.back().second & mask_type_wqm)
@ -667,17 +692,16 @@ void process_instructions(exec_ctx& ctx, Block* block,
}
/* if the block doesn't need both WQM and Exact, we can skip processing the instructions */
bool process = (ctx.handle_wqm &&
(ctx.info[block->index].block_needs & state) !=
(ctx.info[block->index].block_needs & (WQM | Exact))) ||
bool process = (ctx.handle_wqm && (ctx.info[block->index].block_needs & state) !=
(ctx.info[block->index].block_needs & (WQM | Exact))) ||
block->kind & block_kind_uses_discard_if ||
block->kind & block_kind_uses_demote ||
block->kind & block_kind_needs_lowering;
block->kind & block_kind_uses_demote || block->kind & block_kind_needs_lowering;
if (!process) {
std::vector<aco_ptr<Instruction>>::iterator it = std::next(block->instructions.begin(), idx);
instructions.insert(instructions.end(),
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(it),
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(block->instructions.end()));
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(
block->instructions.end()));
return;
}
@ -700,11 +724,13 @@ void process_instructions(exec_ctx& ctx, Block* block,
/* discard from current exec */
const Operand cond = instr->operands[0];
Temp exit_cond = bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc),
Operand(exec, bld.lm), cond).def(1).getTemp();
Operand(exec, bld.lm), cond)
.def(1)
.getTemp();
/* discard from inner to outer exec mask on stack */
for (int i = num - 2; i >= 0; i--) {
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
ctx.info[block->index].exec[i].first = Operand(andn2->definitions[0].getTemp());
exit_cond = andn2->definitions[1].getTemp();
@ -726,14 +752,16 @@ void process_instructions(exec_ctx& ctx, Block* block,
Definition dst = instr->definitions[0];
assert(dst.size() == bld.lm.size());
if (state == Exact) {
instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1));
instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov),
Format::SOP1, 1, 1));
instr->operands[0] = Operand(0u);
instr->definitions[0] = dst;
} else {
std::pair<Operand, uint8_t>& exact_mask = ctx.info[block->index].exec[0];
assert(exact_mask.second & mask_type_exact);
instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2));
instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2),
Format::SOP2, 2, 2));
instr->operands[0] = Operand(exec, bld.lm); /* current exec */
instr->operands[1] = Operand(exact_mask.first);
instr->definitions[0] = dst;
@ -741,7 +769,8 @@ void process_instructions(exec_ctx& ctx, Block* block,
}
} else if (instr->opcode == aco_opcode::p_demote_to_helper) {
/* turn demote into discard_if with only exact masks */
assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) == (mask_type_exact | mask_type_global));
assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) ==
(mask_type_exact | mask_type_global));
int num;
Temp cond, exit_cond;
@ -749,8 +778,9 @@ void process_instructions(exec_ctx& ctx, Block* block,
assert(instr->operands[0].constantValue() == -1u);
/* transition to exact and set exec to zero */
exit_cond = bld.tmp(s1);
cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)),
Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm));
cond =
bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)),
Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm));
num = ctx.info[block->index].exec.size() - 2;
if (!(ctx.info[block->index].exec.back().second & mask_type_exact)) {
@ -767,7 +797,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
for (int i = num; i >= 0; i--) {
if (ctx.info[block->index].exec[i].second & mask_type_exact) {
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
ctx.info[block->index].exec[i].first, cond);
if (i == (int)ctx.info[block->index].exec.size() - 1) {
andn2->operands[0] = Operand(exec, bld.lm);
@ -783,14 +813,14 @@ void process_instructions(exec_ctx& ctx, Block* block,
instr->opcode = aco_opcode::p_exit_early_if;
instr->operands[0] = bld.scc(exit_cond);
state = Exact;
}
bld.insert(std::move(instr));
}
}
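
The p_discard_if lowering above strips the discarded lanes not only from exec but from every mask saved on the stack, via a chain of s_andn2, and the final scc result is used to exit early. The same bookkeeping in miniature, with plain 64-bit lane masks standing in for ACO operands (container and names are mine):

#include <cstdint>
#include <cstdio>
#include <vector>

/* each entry mirrors one level of the exec-mask stack; discarded lanes
 * are removed from every level, innermost to outermost */
void
discard_from_stack(std::vector<uint64_t>& exec_stack, uint64_t discarded)
{
   for (int i = (int)exec_stack.size() - 1; i >= 0; i--)
      exec_stack[i] &= ~discarded; /* the s_andn2 step */
}

int
main()
{
   std::vector<uint64_t> stack = {~0ull /* global */, 0xffffffffull /* current */};
   discard_from_stack(stack, 0xff00ffull);
   printf("%016llx %016llx\n", (unsigned long long)stack[0],
          (unsigned long long)stack[1]); /* ffffffffff00ff00 00000000ff00ff00 */
}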
void add_branch_code(exec_ctx& ctx, Block* block)
void
add_branch_code(exec_ctx& ctx, Block* block)
{
unsigned idx = block->index;
Builder bld(ctx.program, block);
@ -806,8 +836,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
}
assert(ctx.info[idx].exec.size() <= 2);
if (ctx.info[idx].ever_again_needs == 0 ||
ctx.info[idx].ever_again_needs == Exact) {
if (ctx.info[idx].ever_again_needs == 0 || ctx.info[idx].ever_again_needs == Exact) {
/* transition to Exact */
aco_ptr<Instruction> branch = std::move(block->instructions.back());
block->instructions.pop_back();
@ -838,8 +867,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
Block& loop_block = ctx.program->blocks[i];
needs |= ctx.info[i].block_needs;
if (loop_block.kind & block_kind_uses_discard_if ||
loop_block.kind & block_kind_discard ||
if (loop_block.kind & block_kind_uses_discard_if || loop_block.kind & block_kind_discard ||
loop_block.kind & block_kind_uses_demote)
has_discard = true;
if (loop_block.loop_nest_depth != loop_nest_depth)
@ -871,12 +899,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
if (block->kind & block_kind_top_level)
num_exec_masks = std::min(num_exec_masks, 2u);
ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]],
num_exec_masks,
needs,
has_divergent_break,
has_divergent_continue,
has_discard);
ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], num_exec_masks, needs,
has_divergent_break, has_divergent_continue, has_discard);
}
/* For normal breaks, this is the exec mask. For discard+break, it's the
@ -903,7 +927,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm));
for (int i = num - 1; i >= 0; i--) {
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
get_exec_op(ctx.info[block->index].exec[i].first), cond);
if (i == (int)ctx.info[idx].exec.size() - 1)
andn2->definitions[0] = Definition(exec, bld.lm);
@ -919,8 +943,10 @@ void add_branch_code(exec_ctx& ctx, Block* block)
}
if (block->kind & block_kind_continue_or_break) {
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind & block_kind_loop_header);
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind & block_kind_loop_exit);
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind &
block_kind_loop_header);
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind &
block_kind_loop_exit);
assert(block->instructions.back()->opcode == aco_opcode::p_branch);
block->instructions.pop_back();
@ -931,8 +957,10 @@ void add_branch_code(exec_ctx& ctx, Block* block)
}
if (need_parallelcopy)
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]);
ctx.info[idx].exec.back().first = bld.pseudo(
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
block->linear_succs[1], block->linear_succs[0]);
return;
}
@ -949,8 +977,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
if (block->kind & block_kind_branch) {
if (ctx.handle_wqm &&
ctx.info[idx].exec.size() >= 2 &&
if (ctx.handle_wqm && ctx.info[idx].exec.size() >= 2 &&
ctx.info[idx].exec.back().second == mask_type_exact &&
!(ctx.info[idx].block_needs & Exact_Branch) &&
ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].second & mask_type_wqm) {
@ -972,7 +999,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), cond);
} else {
Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
Definition(exec, bld.lm), cond, Operand(exec, bld.lm));
Definition(exec, bld.lm), cond, Operand(exec, bld.lm));
ctx.info[idx].exec.back().first = Operand(old_exec);
}
@ -980,7 +1007,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
/* add next current exec to the stack */
ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type);
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]);
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
block->linear_succs[1], block->linear_succs[0]);
return;
}
@ -990,9 +1018,11 @@ void add_branch_code(exec_ctx& ctx, Block* block)
block->instructions.pop_back();
assert(ctx.info[idx].exec.size() >= 2);
Operand orig_exec = ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].first;
bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec, Operand(exec, bld.lm));
bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec,
Operand(exec, bld.lm));
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]);
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
block->linear_succs[1], block->linear_succs[0]);
return;
}
@ -1020,7 +1050,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
bld.copy(Definition(exec, bld.lm), Operand(0u, bld.lm == s2));
}
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond),
block->linear_succs[1], block->linear_succs[0]);
return;
}
@ -1048,12 +1079,14 @@ void add_branch_code(exec_ctx& ctx, Block* block)
bld.copy(Definition(exec, bld.lm), Operand(0u, bld.lm == s2));
}
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond),
block->linear_succs[1], block->linear_succs[0]);
return;
}
}
void process_block(exec_ctx& ctx, Block* block)
void
process_block(exec_ctx& ctx, Block* block)
{
std::vector<aco_ptr<Instruction>> instructions;
instructions.reserve(block->instructions.size());
@ -1072,8 +1105,8 @@ void process_block(exec_ctx& ctx, Block* block)
} /* end namespace */
void insert_exec_mask(Program *program)
void
insert_exec_mask(Program* program)
{
exec_ctx ctx(program);
@ -1082,8 +1115,6 @@ void insert_exec_mask(Program *program)
for (Block& block : program->blocks)
process_block(ctx, &block);
}
}
} // namespace aco


@ -23,6 +23,7 @@
*/
#include "aco_ir.h"
#include "common/sid.h"
#include <map>
@ -49,7 +50,8 @@ namespace {
* - or erase gprs with counters higher than the one to be waited for.
*/
// TODO: do a more clever insertion of wait_cnt (lgkm_cnt) when there is a load followed by a use of a previous load
// TODO: do a more clever insertion of wait_cnt (lgkm_cnt)
// when there is a load followed by a use of a previous load
/* Instructions of the same event will finish in-order except for smem
* and maybe flat. Instructions of different events may not finish in-order. */
@ -77,54 +79,50 @@ enum counter_type : uint8_t {
num_counters = 4,
};
static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
static const uint16_t exp_events =
event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
static const uint16_t vm_events = event_vmem | event_flat;
static const uint16_t vs_events = event_vmem_store;
uint8_t get_counters_for_event(wait_event ev)
uint8_t
get_counters_for_event(wait_event ev)
{
switch (ev) {
case event_smem:
case event_lds:
case event_gds:
case event_sendmsg:
return counter_lgkm;
case event_vmem:
return counter_vm;
case event_vmem_store:
return counter_vs;
case event_flat:
return counter_vm | counter_lgkm;
case event_sendmsg: return counter_lgkm;
case event_vmem: return counter_vm;
case event_vmem_store: return counter_vs;
case event_flat: return counter_vm | counter_lgkm;
case event_exp_pos:
case event_exp_param:
case event_exp_mrt_null:
case event_gds_gpr_lock:
case event_vmem_gpr_lock:
return counter_exp;
default:
return 0;
case event_vmem_gpr_lock: return counter_exp;
default: return 0;
}
}
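
For illustration, a compilable miniature of the event-to-counter mapping above. The enum bit values are invented, but the interesting case is real: a FLAT access may complete through either the VMEM or the LGKM path, so it has to be tracked on both counters:

#include <cassert>
#include <cstdint>

enum counter_type : uint8_t {
   counter_exp = 1 << 0,
   counter_lgkm = 1 << 1,
   counter_vm = 1 << 2,
   counter_vs = 1 << 3,
};

enum wait_event : uint16_t {
   event_smem = 1 << 0,
   event_vmem = 1 << 1,
   event_flat = 1 << 2,
};

uint8_t
counters_for(wait_event ev)
{
   switch (ev) {
   case event_smem: return counter_lgkm;
   case event_vmem: return counter_vm;
   case event_flat: return counter_vm | counter_lgkm; /* either path may complete it */
   default: return 0;
   }
}

int
main()
{
   assert(counters_for(event_flat) == (counter_vm | counter_lgkm));
   assert(counters_for(event_smem) == counter_lgkm);
}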
struct wait_entry {
wait_imm imm;
uint16_t events;  /* use wait_event notion */
uint8_t counters; /* use counter_type notion */
bool wait_on_read:1;
bool logical:1;
bool has_vmem_nosampler:1;
bool has_vmem_sampler:1;
bool wait_on_read : 1;
bool logical : 1;
bool has_vmem_nosampler : 1;
bool has_vmem_sampler : 1;
wait_entry(wait_event event_, wait_imm imm_, bool logical_, bool wait_on_read_)
: imm(imm_), events(event_), counters(get_counters_for_event(event_)),
wait_on_read(wait_on_read_), logical(logical_),
has_vmem_nosampler(false), has_vmem_sampler(false) {}
: imm(imm_), events(event_), counters(get_counters_for_event(event_)),
wait_on_read(wait_on_read_), logical(logical_), has_vmem_nosampler(false),
has_vmem_sampler(false)
{}
bool join(const wait_entry& other)
{
bool changed = (other.events & ~events) ||
(other.counters & ~counters) ||
bool changed = (other.events & ~events) || (other.counters & ~counters) ||
(other.wait_on_read && !wait_on_read) ||
(other.has_vmem_nosampler && !has_vmem_nosampler) ||
(other.has_vmem_sampler && !has_vmem_sampler);
@ -156,7 +154,8 @@ struct wait_entry {
if (counter == counter_exp) {
imm.exp = wait_imm::unset_counter;
events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock);
events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock |
event_vmem_gpr_lock);
}
if (counter == counter_vs) {
@ -170,7 +169,7 @@ struct wait_entry {
};
struct wait_ctx {
Program *program;
Program* program;
enum chip_class chip_class;
uint16_t max_vm_cnt;
uint16_t max_exp_cnt;
@ -189,24 +188,21 @@ struct wait_ctx {
wait_imm barrier_imm[storage_count];
uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */
std::map<PhysReg,wait_entry> gpr_map;
std::map<PhysReg, wait_entry> gpr_map;
wait_ctx() {}
wait_ctx(Program *program_)
: program(program_),
chip_class(program_->chip_class),
max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14),
max_exp_cnt(6),
max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14),
max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) {}
wait_ctx(Program* program_)
: program(program_), chip_class(program_->chip_class),
max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14), max_exp_cnt(6),
max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14),
max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0))
{}
bool join(const wait_ctx* other, bool logical)
{
bool changed = other->exp_cnt > exp_cnt ||
other->vm_cnt > vm_cnt ||
other->lgkm_cnt > lgkm_cnt ||
other->vs_cnt > vs_cnt ||
bool changed = other->exp_cnt > exp_cnt || other->vm_cnt > vm_cnt ||
other->lgkm_cnt > lgkm_cnt || other->vs_cnt > vs_cnt ||
(other->pending_flat_lgkm && !pending_flat_lgkm) ||
(other->pending_flat_vm && !pending_flat_vm);
@ -218,12 +214,11 @@ struct wait_ctx {
pending_flat_vm |= other->pending_flat_vm;
pending_s_buffer_store |= other->pending_s_buffer_store;
for (const auto& entry : other->gpr_map)
{
for (const auto& entry : other->gpr_map) {
if (entry.second.logical != logical)
continue;
using iterator = std::map<PhysReg,wait_entry>::iterator;
using iterator = std::map<PhysReg, wait_entry>::iterator;
const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
if (insert_pair.second) {
changed = true;
@ -241,12 +236,14 @@ struct wait_ctx {
return changed;
}
void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter) {
void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter)
{
entry.remove_counter(counter);
}
};
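
wait_ctx::join is deliberately a monotone merge: each counter moves toward the worst case and the return value reports whether anything grew, which is what lets the pass iterate predecessors to a fixpoint. A reduced model of that contract, with just two counters and invented names:

#include <algorithm>
#include <cassert>
#include <cstdint>

struct state_model {
   uint16_t vm_cnt = 0;
   uint16_t lgkm_cnt = 0;

   /* take the maximum of each counter; report whether anything changed */
   bool join(const state_model& other)
   {
      bool changed = other.vm_cnt > vm_cnt || other.lgkm_cnt > lgkm_cnt;
      vm_cnt = std::max(vm_cnt, other.vm_cnt);
      lgkm_cnt = std::max(lgkm_cnt, other.lgkm_cnt);
      return changed;
   }
};

int
main()
{
   state_model a{2, 0}, b{1, 3};
   assert(a.join(b));  /* lgkm_cnt grew: 0 -> 3 */
   assert(!a.join(b)); /* second join is a no-op: fixpoint reached */
   assert(a.vm_cnt == 2 && a.lgkm_cnt == 3);
}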
wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
wait_imm
check_instr(Instruction* instr, wait_ctx& ctx)
{
wait_imm wait;
@ -257,7 +254,7 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
/* check consecutively read gprs */
for (unsigned j = 0; j < op.size(); j++) {
PhysReg reg{op.physReg() + j};
std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg);
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
if (it == ctx.gpr_map.end() || !it->second.wait_on_read)
continue;
@ -267,22 +264,24 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
for (const Definition& def : instr->definitions) {
/* check consecutively written gprs */
for (unsigned j = 0; j < def.getTemp().size(); j++)
{
for (unsigned j = 0; j < def.getTemp().size(); j++) {
PhysReg reg{def.physReg() + j};
std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg);
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
if (it == ctx.gpr_map.end())
continue;
/* Vector Memory reads and writes return in the order they were issued */
bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4;
bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() &&
instr->operands[1].regClass() == s4;
if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem) &&
it->second.has_vmem_nosampler == !has_sampler && it->second.has_vmem_sampler == has_sampler)
it->second.has_vmem_nosampler == !has_sampler &&
it->second.has_vmem_sampler == has_sampler)
continue;
/* LDS reads and writes return in the order they were issued. same for GDS */
if (instr->isDS() && (it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds))
if (instr->isDS() &&
(it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds))
continue;
wait.combine(it->second.imm);
@ -292,7 +291,8 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
return wait;
}
wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr)
wait_imm
parse_wait_instr(wait_ctx& ctx, Instruction* instr)
{
if (instr->opcode == aco_opcode::s_waitcnt_vscnt &&
instr->definitions[0].physReg() == sgpr_null) {
@ -305,10 +305,12 @@ wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr)
return wait_imm();
}
wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics)
wait_imm
perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics)
{
wait_imm imm;
sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
sync_scope subgroup_scope =
ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
if ((sync.semantics & semantics) && sync.scope > subgroup_scope) {
unsigned storage = sync.storage;
while (storage) {
@ -321,7 +323,8 @@ wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantic
if (bar_scope_lds <= subgroup_scope)
events &= ~event_lds;
/* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations in-order for the same workgroup */
/* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
* in-order for the same workgroup */
if (!ctx.program->wgp_mode && sync.scope <= scope_workgroup)
events &= ~(event_vmem | event_vmem_store | event_smem);
@ -333,7 +336,8 @@ wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantic
return imm;
}
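
perform_barrier only emits waits for scopes wider than what the hardware already orders: when the whole workgroup runs inside one wave, workgroup-scope synchronization is free. A toy version of that test; the scope ordering is assumed here, not copied from aco_ir.h:

#include <cassert>

/* assumed to be ordered narrow to wide, as the > comparison requires */
enum sync_scope { scope_invocation, scope_subgroup, scope_workgroup, scope_device };

bool
scope_needs_wait(sync_scope scope, unsigned workgroup_size, unsigned wave_size)
{
   sync_scope free_up_to =
      workgroup_size <= wave_size ? scope_workgroup : scope_subgroup;
   return scope > free_up_to;
}

int
main()
{
   assert(!scope_needs_wait(scope_workgroup, 64, 64)); /* single-wave workgroup */
   assert(scope_needs_wait(scope_workgroup, 256, 64)); /* multi-wave: must wait */
   assert(scope_needs_wait(scope_device, 64, 64));     /* device scope always waits */
}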
void force_waitcnt(wait_ctx& ctx, wait_imm& imm)
void
force_waitcnt(wait_ctx& ctx, wait_imm& imm)
{
if (ctx.vm_cnt)
imm.vm = 0;
@ -348,7 +352,8 @@ void force_waitcnt(wait_ctx& ctx, wait_imm& imm)
}
}
wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
wait_imm
kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
{
wait_imm imm;
@ -364,7 +369,6 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
imm.combine(parse_wait_instr(ctx, instr));
/* It's required to wait for scalar stores before "writing back" data.
* It shouldn't cost anything anyway since we're about to do s_endpgm.
*/
@ -380,20 +384,19 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
*
* TODO: Refine this when we have proper alias analysis.
*/
if (ctx.pending_s_buffer_store &&
!instr->smem().definitions.empty() &&
if (ctx.pending_s_buffer_store && !instr->smem().definitions.empty() &&
!instr->smem().sync.can_reorder()) {
imm.lgkm = 0;
}
}
if (ctx.program->early_rast && instr->opcode == aco_opcode::exp) {
if (instr->exp().dest >= V_008DFC_SQ_EXP_POS &&
instr->exp().dest < V_008DFC_SQ_EXP_PRIM) {
if (instr->exp().dest >= V_008DFC_SQ_EXP_POS && instr->exp().dest < V_008DFC_SQ_EXP_PRIM) {
/* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos export.
* Wait for all stores (and atomics) to complete, so PS can read them.
* TODO: This only really applies to DONE pos exports. Consider setting the DONE bit earlier.
/* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos
* export. Wait for all stores (and atomics) to complete, so PS can read them.
* TODO: This only really applies to DONE pos exports.
* Consider setting the DONE bit earlier.
*/
if (ctx.vs_cnt > 0)
imm.vs = 0;
@ -444,9 +447,8 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
}
/* remove all gprs with higher counter from map */
std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.begin();
while (it != ctx.gpr_map.end())
{
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
while (it != ctx.gpr_map.end()) {
if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp);
if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
@ -472,13 +474,15 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
return imm;
}
void update_barrier_counter(uint8_t *ctr, unsigned max)
void
update_barrier_counter(uint8_t* ctr, unsigned max)
{
if (*ctr != wait_imm::unset_counter && *ctr < max)
(*ctr)++;
}
void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
void
update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
{
for (unsigned i = 0; i < storage_count; i++) {
wait_imm& bar = ctx.barrier_imm[i];
@ -506,7 +510,8 @@ void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memor
}
}
void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memory_sync_info())
void
update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_sync_info())
{
uint8_t counters = get_counters_for_event(event);
@ -529,7 +534,7 @@ void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memo
if (ctx.pending_flat_vm)
counters &= ~counter_vm;
for (std::pair<const PhysReg,wait_entry>& e : ctx.gpr_map) {
for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
wait_entry& entry = e.second;
if (entry.events & ctx.unordered_events)
@ -537,18 +542,23 @@ void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memo
assert(entry.events);
if ((counters & counter_exp) && (entry.events & exp_events) == event && entry.imm.exp < ctx.max_exp_cnt)
if ((counters & counter_exp) && (entry.events & exp_events) == event &&
entry.imm.exp < ctx.max_exp_cnt)
entry.imm.exp++;
if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event && entry.imm.lgkm < ctx.max_lgkm_cnt)
if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event &&
entry.imm.lgkm < ctx.max_lgkm_cnt)
entry.imm.lgkm++;
if ((counters & counter_vm) && (entry.events & vm_events) == event && entry.imm.vm < ctx.max_vm_cnt)
if ((counters & counter_vm) && (entry.events & vm_events) == event &&
entry.imm.vm < ctx.max_vm_cnt)
entry.imm.vm++;
if ((counters & counter_vs) && (entry.events & vs_events) == event && entry.imm.vs < ctx.max_vs_cnt)
if ((counters & counter_vs) && (entry.events & vs_events) == event &&
entry.imm.vs < ctx.max_vs_cnt)
entry.imm.vs++;
}
}
void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_sync_info())
void
update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info())
{
assert(ctx.chip_class < GFX10);
@ -559,8 +569,7 @@ void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_s
update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);
for (std::pair<PhysReg,wait_entry> e : ctx.gpr_map)
{
for (std::pair<PhysReg, wait_entry> e : ctx.gpr_map) {
if (e.second.counters & counter_vm)
e.second.imm.vm = 0;
if (e.second.counters & counter_lgkm)
@ -570,8 +579,9 @@ void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_s
ctx.pending_flat_vm = true;
}
void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
bool has_sampler=false)
void
insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
bool has_sampler = false)
{
uint16_t counters = get_counters_for_event(event);
wait_imm imm;
@ -589,24 +599,27 @@ void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event
new_entry.has_vmem_sampler = (event & event_vmem) && has_sampler;
for (unsigned i = 0; i < rc.size(); i++) {
auto it = ctx.gpr_map.emplace(PhysReg{reg.reg()+i}, new_entry);
auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry);
if (!it.second)
it.first->second.join(new_entry);
}
}
void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, bool has_sampler=false)
void
insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, bool has_sampler = false)
{
if (!op.isConstant() && !op.isUndefined())
insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, has_sampler);
}
void insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, bool has_sampler=false)
void
insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, bool has_sampler = false)
{
insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, has_sampler);
}
void gen(Instruction* instr, wait_ctx& ctx)
void
gen(Instruction* instr, wait_ctx& ctx)
{
switch (instr->format) {
case Format::EXP: {
@ -622,13 +635,11 @@ void gen(Instruction* instr, wait_ctx& ctx)
update_counters(ctx, ev);
/* insert new entries for exported vgprs */
for (unsigned i = 0; i < 4; i++)
{
for (unsigned i = 0; i < 4; i++) {
if (exp_instr.enabled_mask & (1 << i)) {
unsigned idx = exp_instr.compressed ? i >> 1 : i;
assert(idx < exp_instr.operands.size());
insert_wait_entry(ctx, exp_instr.operands[idx], ev);
}
}
insert_wait_entry(ctx, exec, s2, ev, false);
@ -651,8 +662,7 @@ void gen(Instruction* instr, wait_ctx& ctx)
if (!instr->definitions.empty())
insert_wait_entry(ctx, instr->definitions[0], event_smem);
else if (ctx.chip_class >= GFX10 &&
!smem.sync.can_reorder())
else if (ctx.chip_class >= GFX10 && !smem.sync.can_reorder())
ctx.pending_s_buffer_store = true;
break;
@ -677,23 +687,21 @@ void gen(Instruction* instr, wait_ctx& ctx)
case Format::MTBUF:
case Format::MIMG:
case Format::GLOBAL: {
wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
wait_event ev =
!instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
update_counters(ctx, ev, get_sync_info(instr));
bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4;
bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() &&
instr->operands[1].regClass() == s4;
if (!instr->definitions.empty())
insert_wait_entry(ctx, instr->definitions[0], ev, has_sampler);
if (ctx.chip_class == GFX6 &&
instr->format != Format::MIMG &&
instr->operands.size() == 4) {
if (ctx.chip_class == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
ctx.exp_cnt++;
update_counters(ctx, event_vmem_gpr_lock);
insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
} else if (ctx.chip_class == GFX6 &&
instr->isMIMG() &&
!instr->operands[2].isUndefined()) {
} else if (ctx.chip_class == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
ctx.exp_cnt++;
update_counters(ctx, event_vmem_gpr_lock);
insert_wait_entry(ctx, instr->operands[2], event_vmem_gpr_lock);
@ -702,35 +710,37 @@ void gen(Instruction* instr, wait_ctx& ctx)
break;
}
case Format::SOPP: {
if (instr->opcode == aco_opcode::s_sendmsg ||
instr->opcode == aco_opcode::s_sendmsghalt)
if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_sendmsghalt)
update_counters(ctx, event_sendmsg);
break;
}
default:
break;
default: break;
}
}
void emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm imm)
void
emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm imm)
{
if (imm.vs != wait_imm::unset_counter) {
assert(ctx.chip_class >= GFX10);
SOPK_instruction* waitcnt_vs = create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1);
SOPK_instruction* waitcnt_vs =
create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1);
waitcnt_vs->definitions[0] = Definition(sgpr_null, s1);
waitcnt_vs->imm = imm.vs;
instructions.emplace_back(waitcnt_vs);
imm.vs = wait_imm::unset_counter;
}
if (!imm.empty()) {
SOPP_instruction* waitcnt = create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
SOPP_instruction* waitcnt =
create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
waitcnt->imm = imm.pack(ctx.chip_class);
waitcnt->block = -1;
instructions.emplace_back(waitcnt);
}
}
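
Note how emit_waitcnt splits the counters: vscnt has no field in the s_waitcnt immediate, so on GFX10+ it is flushed by a separate s_waitcnt_vscnt, and whatever remains is packed into a single s_waitcnt. A sketch of that dispatch, with printf standing in for instruction creation:

#include <cstdint>
#include <cstdio>

constexpr uint16_t unset_counter = 0xffff;

struct imm_model {
   uint16_t vm = unset_counter;
   uint16_t exp = unset_counter;
   uint16_t lgkm = unset_counter;
   uint16_t vs = unset_counter;
};

void
emit_waits(imm_model imm)
{
   if (imm.vs != unset_counter) {
      printf("s_waitcnt_vscnt null, %u\n", (unsigned)imm.vs); /* separate opcode */
      imm.vs = unset_counter;
   }
   if (imm.vm != unset_counter || imm.exp != unset_counter || imm.lgkm != unset_counter)
      printf("s_waitcnt vmcnt(%u) expcnt(%u) lgkmcnt(%u)\n", (unsigned)imm.vm,
             (unsigned)imm.exp, (unsigned)imm.lgkm);
}

int
main()
{
   imm_model imm;
   imm.lgkm = 0;
   imm.vs = 0;
   emit_waits(imm); /* prints both instructions */
}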
void handle_block(Program *program, Block& block, wait_ctx& ctx)
void
handle_block(Program* program, Block& block, wait_ctx& ctx)
{
std::vector<aco_ptr<Instruction>> new_instructions;
@ -763,7 +773,8 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
} /* end namespace */
void insert_wait_states(Program* program)
void
insert_wait_states(Program* program)
{
/* per BB ctx */
std::vector<bool> done(program->blocks.size());
@ -818,5 +829,4 @@ void insert_wait_states(Program* program)
}
}
}
} // namespace aco

File diff suppressed because it is too large.


@ -39,21 +39,22 @@ struct shader_io_state {
uint8_t mask[VARYING_SLOT_MAX];
Temp temps[VARYING_SLOT_MAX * 4u];
shader_io_state() {
shader_io_state()
{
memset(mask, 0, sizeof(mask));
std::fill_n(temps, VARYING_SLOT_MAX * 4u, Temp(0, RegClass::v1));
}
};
struct isel_context {
const struct radv_nir_compiler_options *options;
struct radv_shader_args *args;
Program *program;
nir_shader *shader;
const struct radv_nir_compiler_options* options;
struct radv_shader_args* args;
Program* program;
nir_shader* shader;
uint32_t constant_data_offset;
Block *block;
Block* block;
uint32_t first_temp_id;
std::unordered_map<unsigned, std::array<Temp,NIR_MAX_VEC_COMPONENTS>> allocated_vec;
std::unordered_map<unsigned, std::array<Temp, NIR_MAX_VEC_COMPONENTS>> allocated_vec;
Stage stage;
struct {
bool has_branch;
@ -66,7 +67,8 @@ struct isel_context {
struct {
bool is_divergent = false;
} parent_if;
bool exec_potentially_empty_discard = false; /* set to false when loop_nest_depth==0 && parent_if.is_divergent==false */
bool exec_potentially_empty_discard =
false; /* set to false when loop_nest_depth==0 && parent_if.is_divergent==false */
uint16_t exec_potentially_empty_break_depth = UINT16_MAX;
/* Set to false when loop_nest_depth==exec_potentially_empty_break_depth
* and parent_if.is_divergent==false. Called _break but it's also used for
@ -76,7 +78,7 @@ struct isel_context {
} cf_info;
/* NIR range analysis. */
struct hash_table *range_ht;
struct hash_table* range_ht;
nir_unsigned_upper_bound_config ub_config;
Temp arg_temps[AC_MAX_ARGS];
@ -102,22 +104,19 @@ struct isel_context {
shader_io_state outputs;
};
inline Temp get_arg(isel_context *ctx, struct ac_arg arg)
inline Temp
get_arg(isel_context* ctx, struct ac_arg arg)
{
assert(arg.used);
return ctx->arg_temps[arg.arg_index];
}
void init_context(isel_context *ctx, nir_shader *shader);
void cleanup_context(isel_context *ctx);
void init_context(isel_context* ctx, nir_shader* shader);
void cleanup_context(isel_context* ctx);
isel_context
setup_isel_context(Program* program,
unsigned shader_count,
struct nir_shader *const *shaders,
ac_shader_config* config,
struct radv_shader_args *args,
bool is_gs_copy_shader);
isel_context setup_isel_context(Program* program, unsigned shader_count,
struct nir_shader* const* shaders, ac_shader_config* config,
struct radv_shader_args* args, bool is_gs_copy_shader);
} // namespace aco

File diff suppressed because it is too large.


@ -23,6 +23,7 @@
*/
#include "aco_interface.h"
#include "aco_ir.h"
#include "vulkan/radv_shader.h"
@ -37,23 +38,33 @@
static const std::array<aco_compiler_statistic_info, aco::num_statistics> statistic_infos = []()
{
std::array<aco_compiler_statistic_info, aco::num_statistics> ret{};
ret[aco::statistic_hash] = aco_compiler_statistic_info{"Hash", "CRC32 hash of code and constant data"};
ret[aco::statistic_instructions] = aco_compiler_statistic_info{"Instructions", "Instruction count"};
ret[aco::statistic_copies] = aco_compiler_statistic_info{"Copies", "Copy instructions created for pseudo-instructions"};
ret[aco::statistic_hash] =
aco_compiler_statistic_info{"Hash", "CRC32 hash of code and constant data"};
ret[aco::statistic_instructions] =
aco_compiler_statistic_info{"Instructions", "Instruction count"};
ret[aco::statistic_copies] =
aco_compiler_statistic_info{"Copies", "Copy instructions created for pseudo-instructions"};
ret[aco::statistic_branches] = aco_compiler_statistic_info{"Branches", "Branch instructions"};
ret[aco::statistic_latency] = aco_compiler_statistic_info{"Latency", "Issue cycles plus stall cycles"};
ret[aco::statistic_inv_throughput] = aco_compiler_statistic_info{"Inverse Throughput", "Estimated busy cycles to execute one wave"};
ret[aco::statistic_vmem_clauses] = aco_compiler_statistic_info{"VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"};
ret[aco::statistic_smem_clauses] = aco_compiler_statistic_info{"SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"};
ret[aco::statistic_sgpr_presched] = aco_compiler_statistic_info{"Pre-Sched SGPRs", "SGPR usage before scheduling"};
ret[aco::statistic_vgpr_presched] = aco_compiler_statistic_info{"Pre-Sched VGPRs", "VGPR usage before scheduling"};
ret[aco::statistic_latency] =
aco_compiler_statistic_info{"Latency", "Issue cycles plus stall cycles"};
ret[aco::statistic_inv_throughput] = aco_compiler_statistic_info{
"Inverse Throughput", "Estimated busy cycles to execute one wave"};
ret[aco::statistic_vmem_clauses] = aco_compiler_statistic_info{
"VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"};
ret[aco::statistic_smem_clauses] = aco_compiler_statistic_info{
"SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"};
ret[aco::statistic_sgpr_presched] =
aco_compiler_statistic_info{"Pre-Sched SGPRs", "SGPR usage before scheduling"};
ret[aco::statistic_vgpr_presched] =
aco_compiler_statistic_info{"Pre-Sched VGPRs", "VGPR usage before scheduling"};
return ret;
}();
const unsigned aco_num_statistics = aco::num_statistics;
const aco_compiler_statistic_info *aco_statistic_infos = statistic_infos.data();
const aco_compiler_statistic_info* aco_statistic_infos = statistic_infos.data();
static void validate(aco::Program *program)
static void
validate(aco::Program* program)
{
if (!(aco::debug_flags & aco::DEBUG_VALIDATE_IR))
return;
@ -62,10 +73,9 @@ static void validate(aco::Program *program)
assert(is_valid);
}
void aco_compile_shader(unsigned shader_count,
struct nir_shader *const *shaders,
struct radv_shader_binary **binary,
struct radv_shader_args *args)
void
aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders,
struct radv_shader_binary** binary, struct radv_shader_args* args)
{
aco::init();
@ -116,11 +126,11 @@ void aco_compile_shader(unsigned shader_count,
std::string llvm_ir;
if (args->options->record_ir) {
char *data = NULL;
char* data = NULL;
size_t size = 0;
u_memstream mem;
if (u_memstream_open(&mem, &data, &size)) {
FILE *const memf = u_memstream_get(&mem);
FILE* const memf = u_memstream_get(&mem);
aco_print_program(program.get(), memf);
fputc(0, memf);
u_memstream_close(&mem);
@ -137,8 +147,7 @@ void aco_compile_shader(unsigned shader_count,
aco_print_program(program.get(), stderr, live_vars, aco::print_live_vars | aco::print_kill);
if (!args->is_trap_handler_shader) {
if (!args->options->disable_optimizations &&
!(aco::debug_flags & aco::DEBUG_NO_SCHED))
if (!args->options->disable_optimizations && !(aco::debug_flags & aco::DEBUG_NO_SCHED))
aco::schedule_program(program.get(), live_vars);
validate(program.get());
@ -189,11 +198,11 @@ void aco_compile_shader(unsigned shader_count,
std::string disasm;
if (get_disasm) {
char *data = NULL;
char* data = NULL;
size_t disasm_size = 0;
struct u_memstream mem;
if (u_memstream_open(&mem, &data, &disasm_size)) {
FILE *const memf = u_memstream_get(&mem);
FILE* const memf = u_memstream_get(&mem);
aco::print_asm(program.get(), code, exec_size / 4u, memf);
fputc(0, memf);
u_memstream_close(&mem);
@ -214,10 +223,10 @@ void aco_compile_shader(unsigned shader_count,
* directly for the disk cache. Uninitialized data can appear because of
* padding in the struct or because legacy_binary->data can be at an offset
* from the start less than sizeof(radv_shader_binary_legacy). */
radv_shader_binary_legacy* legacy_binary = (radv_shader_binary_legacy*) calloc(size, 1);
radv_shader_binary_legacy* legacy_binary = (radv_shader_binary_legacy*)calloc(size, 1);
legacy_binary->base.type = RADV_BINARY_TYPE_LEGACY;
legacy_binary->base.stage = shaders[shader_count-1]->info.stage;
legacy_binary->base.stage = shaders[shader_count - 1]->info.stage;
legacy_binary->base.is_gs_copy_shader = args->is_gs_copy_shader;
legacy_binary->base.total_size = size;
@ -225,7 +234,8 @@ void aco_compile_shader(unsigned shader_count,
memcpy(legacy_binary->data, program->statistics, aco::num_statistics * sizeof(uint32_t));
legacy_binary->stats_size = stats_size;
memcpy(legacy_binary->data + legacy_binary->stats_size, code.data(), code.size() * sizeof(uint32_t));
memcpy(legacy_binary->data + legacy_binary->stats_size, code.data(),
code.size() * sizeof(uint32_t));
legacy_binary->exec_size = exec_size;
legacy_binary->code_size = code.size() * sizeof(uint32_t);
@ -233,12 +243,15 @@ void aco_compile_shader(unsigned shader_count,
legacy_binary->disasm_size = 0;
legacy_binary->ir_size = llvm_ir.size();
llvm_ir.copy((char*) legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size, llvm_ir.size());
llvm_ir.copy((char*)legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size,
llvm_ir.size());
if (get_disasm) {
disasm.copy((char*) legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size + llvm_ir.size(), disasm.size());
disasm.copy((char*)legacy_binary->data + legacy_binary->stats_size +
legacy_binary->code_size + llvm_ir.size(),
disasm.size());
legacy_binary->disasm_size = disasm.size();
}
*binary = (radv_shader_binary*) legacy_binary;
*binary = (radv_shader_binary*)legacy_binary;
}
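
The record_ir and disasm paths above both use the same capture idiom: print into a u_memstream, NUL-terminate, then copy the buffer out. A minimal reusable version of that pattern; the include path is assumed, and only the u_memstream_* calls already visible above are used:

#include <stdio.h>
#include <stdlib.h>
#include <string>

#include "util/u_memstream.h" /* assumed header location within Mesa */

static std::string
capture(void (*print)(FILE*))
{
   char* data = NULL;
   size_t size = 0;
   struct u_memstream mem;
   std::string out;
   if (u_memstream_open(&mem, &data, &size)) {
      FILE* const memf = u_memstream_get(&mem);
      print(memf);
      fputc(0, memf); /* NUL-terminate so data is a valid C string */
      u_memstream_close(&mem); /* flushes and finalizes data/size */
      out = data;
      free(data);
   }
   return out;
}

int
main()
{
   std::string s = capture([](FILE* f) { fprintf(f, "hello %d\n", 42); });
   printf("%s", s.c_str());
}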


@ -39,12 +39,10 @@ struct aco_compiler_statistic_info {
};
extern const unsigned aco_num_statistics;
extern const struct aco_compiler_statistic_info *aco_statistic_infos;
extern const struct aco_compiler_statistic_info* aco_statistic_infos;
void aco_compile_shader(unsigned shader_count,
struct nir_shader *const *shaders,
struct radv_shader_binary** binary,
struct radv_shader_args *args);
void aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders,
struct radv_shader_binary** binary, struct radv_shader_args* args);
#ifdef __cplusplus
}


@ -32,39 +32,40 @@ namespace aco {
uint64_t debug_flags = 0;
static const struct debug_control aco_debug_options[] = {
{"validateir", DEBUG_VALIDATE_IR},
{"validatera", DEBUG_VALIDATE_RA},
{"perfwarn", DEBUG_PERFWARN},
{"force-waitcnt", DEBUG_FORCE_WAITCNT},
{"novn", DEBUG_NO_VN},
{"noopt", DEBUG_NO_OPT},
{"nosched", DEBUG_NO_SCHED},
{"perfinfo", DEBUG_PERF_INFO},
{"liveinfo", DEBUG_LIVE_INFO},
{NULL, 0}
};
static const struct debug_control aco_debug_options[] = {{"validateir", DEBUG_VALIDATE_IR},
{"validatera", DEBUG_VALIDATE_RA},
{"perfwarn", DEBUG_PERFWARN},
{"force-waitcnt", DEBUG_FORCE_WAITCNT},
{"novn", DEBUG_NO_VN},
{"noopt", DEBUG_NO_OPT},
{"nosched", DEBUG_NO_SCHED},
{"perfinfo", DEBUG_PERF_INFO},
{"liveinfo", DEBUG_LIVE_INFO},
{NULL, 0}};
static once_flag init_once_flag = ONCE_FLAG_INIT;
static void init_once()
static void
init_once()
{
debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);
#ifndef NDEBUG
/* enable some flags by default on debug builds */
debug_flags |= aco::DEBUG_VALIDATE_IR;
#endif
}
void init()
void
init()
{
call_once(&init_once_flag, init_once);
}
void init_program(Program *program, Stage stage, struct radv_shader_info *info,
enum chip_class chip_class, enum radeon_family family,
bool wgp_mode, ac_shader_config *config)
void
init_program(Program* program, Stage stage, struct radv_shader_info* info,
enum chip_class chip_class, enum radeon_family family, bool wgp_mode,
ac_shader_config* config)
{
program->stage = stage;
program->config = config;
@ -72,24 +73,12 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
program->chip_class = chip_class;
if (family == CHIP_UNKNOWN) {
switch (chip_class) {
case GFX6:
program->family = CHIP_TAHITI;
break;
case GFX7:
program->family = CHIP_BONAIRE;
break;
case GFX8:
program->family = CHIP_POLARIS10;
break;
case GFX9:
program->family = CHIP_VEGA10;
break;
case GFX10:
program->family = CHIP_NAVI10;
break;
default:
program->family = CHIP_UNKNOWN;
break;
case GFX6: program->family = CHIP_TAHITI; break;
case GFX7: program->family = CHIP_BONAIRE; break;
case GFX8: program->family = CHIP_POLARIS10; break;
case GFX9: program->family = CHIP_VEGA10; break;
case GFX10: program->family = CHIP_NAVI10; break;
default: program->family = CHIP_UNKNOWN; break;
}
} else {
program->family = family;
@ -98,7 +87,8 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
program->lane_mask = program->wave_size == 32 ? s1 : s2;
program->dev.lds_encoding_granule = chip_class >= GFX7 ? 512 : 256;
program->dev.lds_alloc_granule = chip_class >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
program->dev.lds_alloc_granule =
chip_class >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
program->dev.lds_limit = chip_class >= GFX7 ? 65536 : 32768;
/* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;
@ -111,7 +101,8 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */
program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
program->dev.sgpr_alloc_granule = 128;
program->dev.sgpr_limit = 108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
program->dev.sgpr_limit =
108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
if (chip_class >= GFX10_3)
program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
else
@ -145,18 +136,14 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
/* GFX9 APUS */
case CHIP_RAVEN:
case CHIP_RAVEN2:
case CHIP_RENOIR:
program->dev.xnack_enabled = true;
break;
default:
break;
case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
default: break;
}
program->dev.sram_ecc_enabled = program->family == CHIP_ARCTURUS;
/* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
program->dev.has_fast_fma32 = program->chip_class >= GFX9;
if (program->family == CHIP_TAHITI ||
program->family == CHIP_CARRIZO ||
if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
program->family == CHIP_HAWAII)
program->dev.has_fast_fma32 = true;
@ -176,29 +163,24 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
program->next_fp_mode.round32 = fp_round_ne;
}
memory_sync_info get_sync_info(const Instruction* instr)
memory_sync_info
get_sync_info(const Instruction* instr)
{
switch (instr->format) {
case Format::SMEM:
return instr->smem().sync;
case Format::MUBUF:
return instr->mubuf().sync;
case Format::MIMG:
return instr->mimg().sync;
case Format::MTBUF:
return instr->mtbuf().sync;
case Format::SMEM: return instr->smem().sync;
case Format::MUBUF: return instr->mubuf().sync;
case Format::MIMG: return instr->mimg().sync;
case Format::MTBUF: return instr->mtbuf().sync;
case Format::FLAT:
case Format::GLOBAL:
case Format::SCRATCH:
return instr->flatlike().sync;
case Format::DS:
return instr->ds().sync;
default:
return memory_sync_info();
case Format::SCRATCH: return instr->flatlike().sync;
case Format::DS: return instr->ds().sync;
default: return memory_sync_info();
}
}
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra)
bool
can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra)
{
if (!instr->isVALU())
return false;
@ -218,7 +200,7 @@ bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_r
if (vop3.omod && chip < GFX9)
return false;
//TODO: return true if we know we will use vcc
// TODO: return true if we know we will use vcc
if (!pre_ra && instr->definitions.size() >= 2)
return false;
@ -244,38 +226,36 @@ bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_r
return false;
}
bool is_mac = instr->opcode == aco_opcode::v_mac_f32 ||
instr->opcode == aco_opcode::v_mac_f16 ||
instr->opcode == aco_opcode::v_fmac_f32 ||
instr->opcode == aco_opcode::v_fmac_f16;
bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;
if (chip != GFX8 && is_mac)
return false;
//TODO: return true if we know we will use vcc
// TODO: return true if we know we will use vcc
if (!pre_ra && instr->isVOPC())
return false;
if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
return false;
return instr->opcode != aco_opcode::v_madmk_f32 &&
instr->opcode != aco_opcode::v_madak_f32 &&
instr->opcode != aco_opcode::v_madmk_f16 &&
instr->opcode != aco_opcode::v_madak_f16 &&
return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
instr->opcode != aco_opcode::v_readfirstlane_b32 &&
instr->opcode != aco_opcode::v_clrexcp &&
instr->opcode != aco_opcode::v_swap_b32;
instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
}
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr)
aco_ptr<Instruction>
convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr)
{
if (instr->isSDWA())
return NULL;
aco_ptr<Instruction> tmp = std::move(instr);
Format format = (Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA);
instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
Format format =
(Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA);
instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(),
tmp->definitions.size()));
std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());
@ -295,15 +275,9 @@ aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& inst
break;
switch (instr->operands[i].bytes()) {
case 1:
sdwa.sel[i] = sdwa_ubyte;
break;
case 2:
sdwa.sel[i] = sdwa_uword;
break;
case 4:
sdwa.sel[i] = sdwa_udword;
break;
case 1: sdwa.sel[i] = sdwa_ubyte; break;
case 2: sdwa.sel[i] = sdwa_uword; break;
case 4: sdwa.sel[i] = sdwa_udword; break;
}
}
switch (instr->definitions[0].bytes()) {
@ -315,9 +289,7 @@ aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& inst
sdwa.dst_sel = sdwa_uword;
sdwa.dst_preserve = true;
break;
case 4:
sdwa.dst_sel = sdwa_udword;
break;
case 4: sdwa.dst_sel = sdwa_udword; break;
}
if (instr->definitions[0].getTemp().type() == RegType::sgpr && chip == GFX8)
@ -330,7 +302,8 @@ aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& inst
return tmp;
}
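
The Format arithmetic in convert_to_SDWA treats VOP3 and SDWA as modifier bits OR'ed onto a base encoding, so the conversion clears one bit and sets the other while leaving the base intact. A self-contained model of that flag swap (bit positions invented, not ACO's real Format values):

#include <cassert>
#include <cstdint>

enum class Fmt : uint16_t {
   VOP2 = 1 << 0, /* base encoding */
   VOP3 = 1 << 8, /* modifier */
   SDWA = 1 << 9, /* modifier */
};

constexpr Fmt
to_sdwa(Fmt f)
{
   /* clear VOP3, set SDWA, keep the base bits */
   return (Fmt)(((uint16_t)f & ~(uint16_t)Fmt::VOP3) | (uint16_t)Fmt::SDWA);
}

int
main()
{
   Fmt vop3 = (Fmt)((uint16_t)Fmt::VOP2 | (uint16_t)Fmt::VOP3);
   assert(to_sdwa(vop3) == (Fmt)((uint16_t)Fmt::VOP2 | (uint16_t)Fmt::SDWA));
}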
bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high)
bool
can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high)
{
/* opsel is only GFX9+ */
if ((high || idx == -1) && chip < GFX9)
@ -362,21 +335,18 @@ bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high)
case aco_opcode::v_lshlrev_b16_e64:
case aco_opcode::v_lshrrev_b16_e64:
case aco_opcode::v_ashrrev_i16_e64:
case aco_opcode::v_mul_lo_u16_e64:
return true;
case aco_opcode::v_mul_lo_u16_e64: return true;
case aco_opcode::v_pack_b32_f16:
case aco_opcode::v_cvt_pknorm_i16_f16:
case aco_opcode::v_cvt_pknorm_u16_f16:
return idx != -1;
case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
case aco_opcode::v_mad_u32_u16:
case aco_opcode::v_mad_i32_i16:
return idx >= 0 && idx < 2;
default:
return false;
case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
default: return false;
}
}
uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
uint32_t
get_reduction_identity(ReduceOp op, unsigned idx)
{
switch (op) {
case iadd8:
@ -397,65 +367,44 @@ uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
case umax8:
case umax16:
case umax32:
case umax64:
return 0;
case umax64: return 0;
case imul8:
case imul16:
case imul32:
case imul64:
return idx ? 0 : 1;
case fmul16:
return 0x3c00u; /* 1.0 */
case fmul32:
return 0x3f800000u; /* 1.0 */
case fmul64:
return idx ? 0x3ff00000u : 0u; /* 1.0 */
case imin8:
return INT8_MAX;
case imin16:
return INT16_MAX;
case imin32:
return INT32_MAX;
case imin64:
return idx ? 0x7fffffffu : 0xffffffffu;
case imax8:
return INT8_MIN;
case imax16:
return INT16_MIN;
case imax32:
return INT32_MIN;
case imax64:
return idx ? 0x80000000u : 0;
case imul64: return idx ? 0 : 1;
case fmul16: return 0x3c00u; /* 1.0 */
case fmul32: return 0x3f800000u; /* 1.0 */
case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
case imin8: return INT8_MAX;
case imin16: return INT16_MAX;
case imin32: return INT32_MAX;
case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
case imax8: return INT8_MIN;
case imax16: return INT16_MIN;
case imax32: return INT32_MIN;
case imax64: return idx ? 0x80000000u : 0;
case umin8:
case umin16:
case iand8:
case iand16:
return 0xffffffffu;
case iand16: return 0xffffffffu;
case umin32:
case umin64:
case iand32:
case iand64:
return 0xffffffffu;
case fmin16:
return 0x7c00u; /* infinity */
case fmin32:
return 0x7f800000u; /* infinity */
case fmin64:
return idx ? 0x7ff00000u : 0u; /* infinity */
case fmax16:
return 0xfc00u; /* negative infinity */
case fmax32:
return 0xff800000u; /* negative infinity */
case fmax64:
return idx ? 0xfff00000u : 0u; /* negative infinity */
default:
unreachable("Invalid reduction operation");
break;
case iand64: return 0xffffffffu;
case fmin16: return 0x7c00u; /* infinity */
case fmin32: return 0x7f800000u; /* infinity */
case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
case fmax16: return 0xfc00u; /* negative infinity */
case fmax32: return 0xff800000u; /* negative infinity */
case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
default: unreachable("Invalid reduction operation"); break;
}
return 0;
}
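/* Worked example: 64-bit identities are returned one dword at a time, so for imin64
 * the identity INT64_MAX = 0x7fffffffffffffff is split into
 * get_reduction_identity(imin64, 0) == 0xffffffffu (low dword) and
 * get_reduction_identity(imin64, 1) == 0x7fffffffu (high dword); likewise fmin64
 * yields the two dwords of +infinity, 0x7ff0000000000000. */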
bool needs_exec_mask(const Instruction* instr) {
bool
needs_exec_mask(const Instruction* instr)
{
if (instr->isSALU() || instr->isBranch())
return instr->reads_exec();
if (instr->isSMEM())
@ -479,10 +428,8 @@ bool needs_exec_mask(const Instruction* instr) {
case aco_opcode::p_reload:
case aco_opcode::p_logical_start:
case aco_opcode::p_logical_end:
case aco_opcode::p_startpgm:
return false;
default:
break;
case aco_opcode::p_startpgm: return false;
default: break;
}
}
@ -495,10 +442,11 @@ bool needs_exec_mask(const Instruction* instr) {
return true;
}
wait_imm::wait_imm() :
vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) {}
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) :
vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {}
wait_imm::wait_imm() : vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter)
{}
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
: vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
{}
wait_imm::wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter)
{
@ -513,7 +461,8 @@ wait_imm::wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter)
lgkm |= (packed >> 8) & 0x30;
}
uint16_t wait_imm::pack(enum chip_class chip) const
uint16_t
wait_imm::pack(enum chip_class chip) const
{
uint16_t imm = 0;
assert(exp == unset_counter || exp <= 0x7);
@ -536,13 +485,16 @@ uint16_t wait_imm::pack(enum chip_class chip) const
break;
}
if (chip < GFX9 && vm == wait_imm::unset_counter)
imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the architecture when interpreting the immediate */
imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
architecture when interpreting the immediate */
if (chip < GFX10 && lgkm == wait_imm::unset_counter)
imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the architecture when interpreting the immediate */
imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
architecture when interpreting the immediate */
return imm;
}
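/* A small roundtrip sketch (assuming GFX9): for counters that were actually set,
 * the wait_imm(chip, packed) constructor inverts pack(), e.g.
 *
 *   wait_imm w(0, 7, 15, wait_imm::unset_counter); // vm, exp, lgkm, vs
 *   assert(wait_imm(GFX9, w.pack(GFX9)).lgkm == 15);
 */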
bool wait_imm::combine(const wait_imm& other)
bool
wait_imm::combine(const wait_imm& other)
{
bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
vm = std::min(vm, other.vm);
@ -552,17 +504,21 @@ bool wait_imm::combine(const wait_imm& other)
return changed;
}
bool wait_imm::empty() const
bool
wait_imm::empty() const
{
return vm == unset_counter && exp == unset_counter &&
lgkm == unset_counter && vs == unset_counter;
return vm == unset_counter && exp == unset_counter && lgkm == unset_counter &&
vs == unset_counter;
}
bool should_form_clause(const Instruction *a, const Instruction *b)
bool
should_form_clause(const Instruction* a, const Instruction* b)
{
/* Vertex attribute loads from the same binding likely load from similar addresses */
unsigned a_vtx_binding = a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0);
unsigned b_vtx_binding = b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0);
unsigned a_vtx_binding =
a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0);
unsigned b_vtx_binding =
b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0);
if (a_vtx_binding && a_vtx_binding == b_vtx_binding)
return true;
@ -584,4 +540,4 @@ bool should_form_clause(const Instruction *a, const Instruction *b)
return false;
}
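/* A schematic use in a scheduler loop (hypothetical caller): consecutive memory
 * instructions are grouped while the predicate holds, so the hardware can issue
 * them as one clause:
 *
 *   if (should_form_clause(prev_mem_instr, instr))
 *      clause.push_back(instr); // keep extending the current clause
 *   else
 *      clause.clear();          // addresses/resources diverged, start a new one
 */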
}
} // namespace aco

File diff suppressed because it is too large.

@ -24,13 +24,15 @@
*/
#include "aco_ir.h"
#include "util/u_math.h"
#include <set>
#include <vector>
namespace aco {
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr)
RegisterDemand
get_live_changes(aco_ptr<Instruction>& instr)
{
RegisterDemand changes;
for (const Definition& def : instr->definitions) {
@ -48,7 +50,8 @@ RegisterDemand get_live_changes(aco_ptr<Instruction>& instr)
return changes;
}
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr)
RegisterDemand
get_temp_registers(aco_ptr<Instruction>& instr)
{
RegisterDemand temp_registers;
@ -67,7 +70,9 @@ RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr)
return temp_registers;
}
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before)
RegisterDemand
get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
aco_ptr<Instruction>& instr_before)
{
demand -= get_live_changes(instr);
demand -= get_temp_registers(instr);
@ -77,8 +82,9 @@ RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& in
}
namespace {
void process_live_temps_per_block(Program *program, live& lives, Block* block,
std::set<unsigned>& worklist, std::vector<uint16_t>& phi_sgpr_ops)
void
process_live_temps_per_block(Program* program, live& lives, Block* block,
std::set<unsigned>& worklist, std::vector<uint16_t>& phi_sgpr_ops)
{
std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index];
RegisterDemand new_demand;
@ -94,8 +100,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
/* traverse the instructions backwards */
int idx;
for (idx = block->instructions.size() -1; idx >= 0; idx--) {
Instruction *insn = block->instructions[idx].get();
for (idx = block->instructions.size() - 1; idx >= 0; idx--) {
Instruction* insn = block->instructions[idx].get();
if (is_phi(insn))
break;
@ -131,8 +137,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
for (Operand& op : insn->operands)
op.setKill(false);
for (unsigned i = 0; i < insn->operands.size(); ++i)
{
for (unsigned i = 0; i < insn->operands.size(); ++i) {
Operand& operand = insn->operands[i];
if (!operand.isTemp())
continue;
@ -143,7 +148,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
if (inserted) {
operand.setFirstKill(true);
for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
if (insn->operands[j].isTemp() && insn->operands[j].tempId() == operand.tempId()) {
if (insn->operands[j].isTemp() &&
insn->operands[j].tempId() == operand.tempId()) {
insn->operands[j].setFirstKill(false);
insn->operands[j].setKill(true);
}
@ -167,7 +173,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
int phi_idx = idx;
while (phi_idx >= 0) {
register_demand[phi_idx] = new_demand;
Instruction *insn = block->instructions[phi_idx].get();
Instruction* insn = block->instructions[phi_idx].get();
assert(is_phi(insn) && insn->definitions.size() == 1);
if (!insn->definitions[0].isTemp()) {
@ -196,7 +202,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
#ifndef NDEBUG
if (preds.empty())
aco_err(program, "Temporary never defined or is defined after use: %%%d in BB%d", t, block->index);
aco_err(program, "Temporary never defined or is defined after use: %%%d in BB%d", t,
block->index);
#endif
for (unsigned pred_idx : preds) {
@ -209,14 +216,13 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
/* handle phi operands */
phi_idx = idx;
while (phi_idx >= 0) {
Instruction *insn = block->instructions[phi_idx].get();
Instruction* insn = block->instructions[phi_idx].get();
assert(is_phi(insn));
/* directly insert into the predecessors live-out set */
std::vector<unsigned>& preds = insn->opcode == aco_opcode::p_phi
? block->logical_preds
: block->linear_preds;
std::vector<unsigned>& preds =
insn->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
for (unsigned i = 0; i < preds.size(); ++i) {
Operand &operand = insn->operands[i];
Operand& operand = insn->operands[i];
if (!operand.isTemp())
continue;
if (operand.isFixed() && operand.physReg() == vcc)
@ -238,18 +244,19 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
assert(block->index != 0 || (new_demand == RegisterDemand() && live.empty()));
}
unsigned calc_waves_per_workgroup(Program *program)
unsigned
calc_waves_per_workgroup(Program* program)
{
/* When workgroup size is not known, just go with wave_size */
unsigned workgroup_size = program->workgroup_size == UINT_MAX
? program->wave_size
: program->workgroup_size;
unsigned workgroup_size =
program->workgroup_size == UINT_MAX ? program->wave_size : program->workgroup_size;
return align(workgroup_size, program->wave_size) / program->wave_size;
}
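/* Worked example, assuming wave64 and a 96-invocation workgroup:
 * align(96, 64) = 128, so the workgroup occupies 128 / 64 = 2 waves. */
static_assert(((96 + 63) / 64) * 64 / 64 == 2, "96 invocations need two wave64 waves");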
} /* end namespace */
uint16_t get_extra_sgprs(Program *program)
uint16_t
get_extra_sgprs(Program* program)
{
if (program->chip_class >= GFX10) {
assert(!program->needs_flat_scr);
@ -275,26 +282,30 @@ uint16_t get_extra_sgprs(Program *program)
}
}
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs)
uint16_t
get_sgpr_alloc(Program* program, uint16_t addressable_sgprs)
{
uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
uint16_t granule = program->dev.sgpr_alloc_granule;
return ALIGN_NPOT(std::max(sgprs, granule), granule);
}
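/* Worked example, assuming a hypothetical granule of 16: a shader using 90 SGPRs
 * (including the extra ones) allocates ALIGN_NPOT(90, 16) = 96, i.e. usage is
 * always rounded up to a whole allocation granule. */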
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs)
uint16_t
get_vgpr_alloc(Program* program, uint16_t addressable_vgprs)
{
assert(addressable_vgprs <= program->dev.vgpr_limit);
uint16_t granule = program->dev.vgpr_alloc_granule;
return align(std::max(addressable_vgprs, granule), granule);
}
unsigned round_down(unsigned a, unsigned b)
unsigned
round_down(unsigned a, unsigned b)
{
return a - (a % b);
}
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves)
uint16_t
get_addr_sgpr_from_waves(Program* program, uint16_t waves)
{
/* it's not possible to allocate more than 128 SGPRs */
uint16_t sgprs = std::min(program->dev.physical_sgprs / waves, 128);
@ -303,21 +314,24 @@ uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves)
return std::min(sgprs, program->dev.sgpr_limit);
}
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t waves)
uint16_t
get_addr_vgpr_from_waves(Program* program, uint16_t waves)
{
uint16_t vgprs = program->dev.physical_vgprs / waves & ~(program->dev.vgpr_alloc_granule - 1);
vgprs -= program->config->num_shared_vgprs / 2;
return std::min(vgprs, program->dev.vgpr_limit);
}
void calc_min_waves(Program* program)
void
calc_min_waves(Program* program)
{
unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
}
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
void
update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
{
unsigned max_waves_per_simd = program->dev.max_wave64_per_simd * (64 / program->wave_size);
unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
@ -333,8 +347,10 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
program->max_reg_demand = new_demand;
} else {
program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
uint16_t vgpr_demand = get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
program->num_waves = std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);
uint16_t vgpr_demand =
get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
program->num_waves =
std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);
program->max_waves = max_waves_per_simd;
/* adjust max_waves for workgroup and LDS limits */
@ -346,12 +362,15 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds);
}
if (waves_per_workgroup > 1 && program->chip_class < GFX10)
workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */
workgroups_per_cu_wgp = std::min(
workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */
/* in cases like waves_per_workgroup=3 or lds=65536 and
* waves_per_workgroup=1, we want the maximum possible number of waves per
* SIMD and not the minimum. so DIV_ROUND_UP is used */
program->max_waves = std::min<uint16_t>(program->max_waves, DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp));
program->max_waves = std::min<uint16_t>(
program->max_waves,
DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp));
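/* e.g. workgroups_per_cu_wgp = 5, waves_per_workgroup = 3, simd_per_cu_wgp = 4:
 * DIV_ROUND_UP(15, 4) = 4 waves per SIMD, whereas 15 / 4 would round down to 3
 * and needlessly cap occupancy */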
/* incorporate max_waves and calculate max_reg_demand */
program->num_waves = std::min<uint16_t>(program->num_waves, program->max_waves);
@ -360,7 +379,8 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
}
}
live live_var_analysis(Program* program)
live
live_var_analysis(Program* program)
{
live result;
result.live_out.resize(program->blocks.size());
@ -371,14 +391,16 @@ live live_var_analysis(Program* program)
program->needs_vcc = false;
/* this implementation assumes that the block idx corresponds to the block's position in program->blocks vector */
/* this implementation assumes that the block idx corresponds to the block's position in
* program->blocks vector */
for (Block& block : program->blocks)
worklist.insert(block.index);
while (!worklist.empty()) {
std::set<unsigned>::reverse_iterator b_it = worklist.rbegin();
unsigned block_idx = *b_it;
worklist.erase(block_idx);
process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, phi_sgpr_ops);
process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist,
phi_sgpr_ops);
new_demand.update(program->blocks[block_idx].register_demand);
}
@ -389,5 +411,4 @@ live live_var_analysis(Program* program)
return result;
}
}
} // namespace aco

@ -47,7 +47,8 @@ struct ssa_state {
std::vector<bool> visited;
};
Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool before_write)
Operand
get_ssa(Program* program, unsigned block_idx, ssa_state* state, bool before_write)
{
if (!before_write) {
auto it = state->writes.find(block_idx);
@ -79,7 +80,8 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool bef
Temp res = Temp(program->allocateTmp(program->lane_mask));
state->latest[block_idx] = Operand(res);
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)};
aco_ptr<Pseudo_instruction> phi{
create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)};
for (unsigned i = 0; i < pred; i++)
phi->operands[i] = get_ssa(program, block.linear_preds[i], state, false);
phi->definitions[0] = Definition(res);
@ -89,11 +91,11 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool bef
}
}
void insert_before_logical_end(Block *block, aco_ptr<Instruction> instr)
void
insert_before_logical_end(Block* block, aco_ptr<Instruction> instr)
{
auto IsLogicalEnd = [] (const aco_ptr<Instruction>& inst) -> bool {
return inst->opcode == aco_opcode::p_logical_end;
};
auto IsLogicalEnd = [](const aco_ptr<Instruction>& inst) -> bool
{ return inst->opcode == aco_opcode::p_logical_end; };
auto it = std::find_if(block->instructions.crbegin(), block->instructions.crend(), IsLogicalEnd);
if (it == block->instructions.crend()) {
@ -104,13 +106,13 @@ void insert_before_logical_end(Block *block, aco_ptr<Instruction> instr)
}
}
void build_merge_code(Program *program, Block *block, Definition dst, Operand prev, Operand cur)
void
build_merge_code(Program* program, Block* block, Definition dst, Operand prev, Operand cur)
{
Builder bld(program);
auto IsLogicalEnd = [] (const aco_ptr<Instruction>& instr) -> bool {
return instr->opcode == aco_opcode::p_logical_end;
};
auto IsLogicalEnd = [](const aco_ptr<Instruction>& instr) -> bool
{ return instr->opcode == aco_opcode::p_logical_end; };
auto it = std::find_if(block->instructions.rbegin(), block->instructions.rend(), IsLogicalEnd);
assert(it != block->instructions.rend());
bld.reset(&block->instructions, std::prev(it.base()));
@ -126,7 +128,8 @@ void build_merge_code(Program *program, Block *block, Definition dst, Operand pr
if (!prev_is_constant) {
if (!cur_is_constant) {
Temp tmp1 = bld.tmp(bld.lm), tmp2 = bld.tmp(bld.lm);
bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc), prev, Operand(exec, bld.lm));
bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc), prev,
Operand(exec, bld.lm));
bld.sop2(Builder::s_and, Definition(tmp2), bld.def(s1, scc), cur, Operand(exec, bld.lm));
bld.sop2(Builder::s_or, dst, bld.def(s1, scc), tmp1, tmp2);
} else if (cur.constantValue()) {
@ -151,7 +154,8 @@ void build_merge_code(Program *program, Block *block, Definition dst, Operand pr
}
}
void init_any_pred_defined(Program *program, ssa_state *state, Block *block, aco_ptr<Instruction>& phi)
void
init_any_pred_defined(Program* program, ssa_state* state, Block* block, aco_ptr<Instruction>& phi)
{
std::fill(state->any_pred_defined.begin(), state->any_pred_defined.end(), false);
for (unsigned i = 0; i < block->logical_preds.size(); i++) {
@ -178,7 +182,9 @@ void init_any_pred_defined(Program *program, ssa_state *state, Block *block, aco
}
}
void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block, aco_ptr<Instruction>& phi)
void
lower_divergent_bool_phi(Program* program, ssa_state* state, Block* block,
aco_ptr<Instruction>& phi)
{
Builder bld(program);
@ -186,7 +192,8 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block,
state->all_preds_uniform = !(block->kind & block_kind_merge) &&
block->linear_preds.size() == block->logical_preds.size();
for (unsigned pred : block->logical_preds)
state->all_preds_uniform = state->all_preds_uniform && (program->blocks[pred].kind & block_kind_uniform);
state->all_preds_uniform =
state->all_preds_uniform && (program->blocks[pred].kind & block_kind_uniform);
state->checked_preds_for_uniform = true;
}
@ -230,7 +237,7 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block,
bool uniform_merge = block->kind & block_kind_loop_header;
for (unsigned i = 0; i < phi->operands.size(); i++) {
Block *pred = &program->blocks[block->logical_preds[i]];
Block* pred = &program->blocks[block->logical_preds[i]];
bool need_get_ssa = !uniform_merge;
if (block->kind & block_kind_loop_header && !(pred->kind & block_kind_uniform))
@ -254,7 +261,8 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block,
unsigned num_preds = block->linear_preds.size();
if (phi->operands.size() != num_preds) {
Pseudo_instruction* new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, num_preds, 1)};
Pseudo_instruction* new_phi{create_instruction<Pseudo_instruction>(
aco_opcode::p_linear_phi, Format::PSEUDO, num_preds, 1)};
new_phi->definitions[0] = phi->definitions[0];
phi.reset(new_phi);
} else {
@ -268,7 +276,8 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block,
return;
}
void lower_subdword_phis(Program *program, Block *block, aco_ptr<Instruction>& phi)
void
lower_subdword_phis(Program* program, Block* block, aco_ptr<Instruction>& phi)
{
Builder bld(program);
for (unsigned i = 0; i < phi->operands.size(); i++) {
@ -278,21 +287,24 @@ void lower_subdword_phis(Program *program, Block *block, aco_ptr<Instruction>& p
continue;
assert(phi->operands[i].isTemp());
Block *pred = &program->blocks[block->logical_preds[i]];
Block* pred = &program->blocks[block->logical_preds[i]];
Temp phi_src = phi->operands[i].getTemp();
assert(phi_src.regClass().type() == RegType::sgpr);
Temp tmp = bld.tmp(RegClass(RegType::vgpr, phi_src.size()));
insert_before_logical_end(pred, bld.copy(Definition(tmp), phi_src).get_ptr());
Temp new_phi_src = bld.tmp(phi->definitions[0].regClass());
insert_before_logical_end(pred, bld.pseudo(aco_opcode::p_extract_vector, Definition(new_phi_src), tmp, Operand(0u)).get_ptr());
insert_before_logical_end(
pred, bld.pseudo(aco_opcode::p_extract_vector, Definition(new_phi_src), tmp, Operand(0u))
.get_ptr());
phi->operands[i].setTemp(new_phi_src);
}
return;
}
void lower_phis(Program* program)
void
lower_phis(Program* program)
{
ssa_state state;
@ -301,7 +313,8 @@ void lower_phis(Program* program)
state.needs_init = true;
for (aco_ptr<Instruction>& phi : block.instructions) {
if (phi->opcode == aco_opcode::p_phi) {
assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1 : phi->definitions[0].regClass() != s2);
assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1
: phi->definitions[0].regClass() != s2);
if (phi->definitions[0].regClass() == program->lane_mask)
lower_divergent_bool_phi(program, &state, &block, phi);
else if (phi->definitions[0].regClass().is_subdword())
@ -313,4 +326,4 @@ void lower_phis(Program* program)
}
}
}
} // namespace aco

@ -53,32 +53,32 @@ struct copy {
struct merge_node {
Operand value = Operand(); /* original value: can be an SSA-def or constant value */
uint32_t index = -1u; /* index into the vector of merge sets */
uint32_t index = -1u; /* index into the vector of merge sets */
uint32_t defined_at = -1u; /* defining block */
/* we also remember two dominating defs with the same value: */
Temp equal_anc_in = Temp(); /* within the same merge set */
Temp equal_anc_in = Temp(); /* within the same merge set */
Temp equal_anc_out = Temp(); /* from a different set */
};
struct cssa_ctx {
Program* program;
std::vector<IDSet>& live_out; /* live-out sets per block */
std::vector<IDSet>& live_out; /* live-out sets per block */
std::vector<std::vector<copy>> parallelcopies; /* copies per block */
std::vector<merge_set> merge_sets; /* each vector is one (ordered) merge set */
std::vector<merge_set> merge_sets; /* each vector is one (ordered) merge set */
std::unordered_map<uint32_t, merge_node> merge_node_table; /* tempid -> merge node */
};
/* create (virtual) parallelcopies for each phi instruction and
* already merge copy-definitions with phi-defs into merge sets */
void collect_parallelcopies(cssa_ctx& ctx)
void
collect_parallelcopies(cssa_ctx& ctx)
{
ctx.parallelcopies.resize(ctx.program->blocks.size());
Builder bld(ctx.program);
for (Block& block : ctx.program->blocks) {
for (aco_ptr<Instruction>& phi : block.instructions) {
if (phi->opcode != aco_opcode::p_phi &&
phi->opcode != aco_opcode::p_linear_phi)
if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
break;
const Definition& def = phi->definitions[0];
@ -89,9 +89,8 @@ void collect_parallelcopies(cssa_ctx& ctx)
if (!def.isTemp())
continue;
std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ?
block.logical_preds :
block.linear_preds;
std::vector<unsigned>& preds =
phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
uint32_t index = ctx.merge_sets.size();
merge_set set;
@ -151,8 +150,8 @@ void collect_parallelcopies(cssa_ctx& ctx)
}
/* check whether the definition of a comes after b. */
inline
bool defined_after(cssa_ctx& ctx, Temp a, Temp b)
inline bool
defined_after(cssa_ctx& ctx, Temp a, Temp b)
{
merge_node& node_a = ctx.merge_node_table[a.id()];
merge_node& node_b = ctx.merge_node_table[b.id()];
@ -163,25 +162,24 @@ bool defined_after(cssa_ctx& ctx, Temp a, Temp b)
}
/* check whether a dominates b where b is defined after a */
inline
bool dominates(cssa_ctx& ctx, Temp a, Temp b)
inline bool
dominates(cssa_ctx& ctx, Temp a, Temp b)
{
assert(defined_after(ctx, b, a));
merge_node& node_a = ctx.merge_node_table[a.id()];
merge_node& node_b = ctx.merge_node_table[b.id()];
unsigned idom = node_b.defined_at;
while (idom > node_a.defined_at)
idom = b.regClass().type() == RegType::vgpr ?
ctx.program->blocks[idom].logical_idom :
ctx.program->blocks[idom].linear_idom;
idom = b.regClass().type() == RegType::vgpr ? ctx.program->blocks[idom].logical_idom
: ctx.program->blocks[idom].linear_idom;
return idom == node_a.defined_at;
}
/* check intersection between var and parent:
* We already know that parent dominates var. */
inline
bool intersects(cssa_ctx& ctx, Temp var, Temp parent)
inline bool
intersects(cssa_ctx& ctx, Temp var, Temp parent)
{
merge_node& node_var = ctx.merge_node_table[var.id()];
merge_node& node_parent = ctx.merge_node_table[parent.id()];
@ -196,9 +194,9 @@ bool intersects(cssa_ctx& ctx, Temp var, Temp parent)
/* parent is defined in a different block than var */
if (node_parent.defined_at < node_var.defined_at) {
/* if the parent is not live-in, they don't interfere */
std::vector<uint32_t>& preds = var.type() == RegType::vgpr ?
ctx.program->blocks[block_idx].logical_preds :
ctx.program->blocks[block_idx].linear_preds;
std::vector<uint32_t>& preds = var.type() == RegType::vgpr
? ctx.program->blocks[block_idx].logical_preds
: ctx.program->blocks[block_idx].linear_preds;
for (uint32_t pred : preds) {
if (!ctx.live_out[pred].count(parent.id()))
return false;
@ -246,8 +244,8 @@ bool intersects(cssa_ctx& ctx, Temp var, Temp parent)
/* check interference between var and parent:
* i.e. they have different values and intersect.
* If parent and var share the same value, also updates the equal ancestor. */
inline
bool interference(cssa_ctx& ctx, Temp var, Temp parent)
inline bool
interference(cssa_ctx& ctx, Temp var, Temp parent)
{
assert(var != parent);
merge_node& node_var = ctx.merge_node_table[var.id()];
@ -281,13 +279,14 @@ bool interference(cssa_ctx& ctx, Temp var, Temp parent)
/* tries to merge set_b into set_a of given temporary and
* drops that temporary as it is being coalesced */
bool try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)
bool
try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)
{
auto def_node_it = ctx.merge_node_table.find(dst.id());
uint32_t index = def_node_it->second.index;
merge_set& set_a = ctx.merge_sets[index];
std::vector<Temp> dom; /* stack of the traversal */
merge_set union_set; /* the new merged merge-set */
merge_set union_set; /* the new merged merge-set */
uint32_t i_a = 0;
uint32_t i_b = 0;
@ -335,7 +334,8 @@ bool try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)
}
/* returns true if the copy can safely be omitted */
bool try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)
bool
try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)
{
/* we can only coalesce temporaries */
if (!copy.op.isTemp())
@ -348,11 +348,9 @@ bool try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)
uint32_t pred = block_idx;
do {
block_idx = pred;
pred = copy.op.regClass().type() == RegType::vgpr ?
ctx.program->blocks[pred].logical_idom :
ctx.program->blocks[pred].linear_idom;
} while (block_idx != pred &&
ctx.live_out[pred].count(copy.op.tempId()));
pred = copy.op.regClass().type() == RegType::vgpr ? ctx.program->blocks[pred].logical_idom
: ctx.program->blocks[pred].linear_idom;
} while (block_idx != pred && ctx.live_out[pred].count(copy.op.tempId()));
op_node.defined_at = block_idx;
op_node.value = copy.op;
}
@ -385,7 +383,8 @@ struct ltg_node {
/* emit the copies in an order that does not
* create interferences within a merge-set */
void emit_copies_block(Builder bld, std::map<uint32_t, ltg_node>& ltg, RegType type)
void
emit_copies_block(Builder bld, std::map<uint32_t, ltg_node>& ltg, RegType type)
{
auto&& it = ltg.begin();
while (it != ltg.end()) {
@ -410,16 +409,16 @@ void emit_copies_block(Builder bld, std::map<uint32_t, ltg_node>& ltg, RegType t
}
/* count the number of remaining circular dependencies */
unsigned num = std::count_if(ltg.begin(), ltg.end(), [&] (auto& n){
return n.second.cp.def.regClass().type() == type;
});
unsigned num = std::count_if(ltg.begin(), ltg.end(),
[&](auto& n) { return n.second.cp.def.regClass().type() == type; });
/* if there are circular dependencies, we just emit them as single parallelcopy */
if (num) {
// TODO: this should be restricted to a feasible number of registers
// and otherwise use a temporary to avoid having to reload more (spilled)
// variables than we have registers.
aco_ptr<Pseudo_instruction> copy{create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)};
aco_ptr<Pseudo_instruction> copy{create_instruction<Pseudo_instruction>(
aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)};
it = ltg.begin();
for (unsigned i = 0; i < num; i++) {
while (it->second.cp.def.regClass().type() != type)
@ -435,7 +434,8 @@ void emit_copies_block(Builder bld, std::map<uint32_t, ltg_node>& ltg, RegType t
/* either emits or coalesces all parallelcopies and
* renames the phi-operands accordingly. */
void emit_parallelcopies(cssa_ctx& ctx)
void
emit_parallelcopies(cssa_ctx& ctx)
{
std::unordered_map<uint32_t, Operand> renames;
@ -476,9 +476,8 @@ void emit_parallelcopies(cssa_ctx& ctx)
Block& block = ctx.program->blocks[i];
/* emit VGPR copies */
auto IsLogicalEnd = [] (const aco_ptr<Instruction>& inst) -> bool {
return inst->opcode == aco_opcode::p_logical_end;
};
auto IsLogicalEnd = [](const aco_ptr<Instruction>& inst) -> bool
{ return inst->opcode == aco_opcode::p_logical_end; };
auto it = std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd);
bld.reset(&block.instructions, std::prev(it.base()));
emit_copies_block(bld, ltg, RegType::vgpr);
@ -494,8 +493,7 @@ void emit_parallelcopies(cssa_ctx& ctx)
/* finally, rename coalesced phi operands */
for (Block& block : ctx.program->blocks) {
for (aco_ptr<Instruction>& phi : block.instructions) {
if (phi->opcode != aco_opcode::p_phi &&
phi->opcode != aco_opcode::p_linear_phi)
if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
break;
for (Operand& op : phi->operands) {
@ -514,8 +512,8 @@ void emit_parallelcopies(cssa_ctx& ctx)
} /* end namespace */
void lower_to_cssa(Program* program, live& live_vars)
void
lower_to_cssa(Program* program, live& live_vars)
{
reindex_ssa(program, live_vars.live_out);
cssa_ctx ctx = {program, live_vars.live_out};
@ -525,5 +523,4 @@ void lower_to_cssa(Program* program, live& live_vars)
/* update live variable information */
live_vars = live_var_analysis(program);
}
}
} // namespace aco

File diff suppressed because it is too large.

@ -36,8 +36,9 @@
namespace aco {
namespace {
inline
uint32_t murmur_32_scramble(uint32_t h, uint32_t k) {
inline uint32_t
murmur_32_scramble(uint32_t h, uint32_t k)
{
k *= 0xcc9e2d51;
k = (k << 15) | (k >> 17);
h ^= k * 0x1b873593;
@ -46,8 +47,9 @@ uint32_t murmur_32_scramble(uint32_t h, uint32_t k) {
return h;
}
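/* This is (a variant of) the MurmurHash3 mixing step: each 32-bit word is combined
 * with the constants 0xcc9e2d51/0x1b873593 and a 15-bit rotate, then folded into
 * the running hash, e.g.
 *
 *   uint32_t h = 0;
 *   h = murmur_32_scramble(h, first_word);
 *   h = murmur_32_scramble(h, second_word); // order-sensitive, as intended
 */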
template<typename T>
uint32_t hash_murmur_32(Instruction* instr)
template <typename T>
uint32_t
hash_murmur_32(Instruction* instr)
{
uint32_t hash = uint32_t(instr->format) << 16 | uint32_t(instr->opcode);
@ -58,7 +60,7 @@ uint32_t hash_murmur_32(Instruction* instr)
for (unsigned i = 2; i < (sizeof(T) >> 2); i++) {
uint32_t u;
/* Accesses it through a byte array, so doesn't violate the strict aliasing rule */
memcpy(&u, reinterpret_cast<uint8_t *>(instr) + i * 4, 4);
memcpy(&u, reinterpret_cast<uint8_t*>(instr) + i * 4, 4);
hash = murmur_32_scramble(hash, u);
}
@ -92,32 +94,19 @@ struct InstrHash {
return hash_murmur_32<SDWA_instruction>(instr);
switch (instr->format) {
case Format::SMEM:
return hash_murmur_32<SMEM_instruction>(instr);
case Format::VINTRP:
return hash_murmur_32<Interp_instruction>(instr);
case Format::DS:
return hash_murmur_32<DS_instruction>(instr);
case Format::SOPP:
return hash_murmur_32<SOPP_instruction>(instr);
case Format::SOPK:
return hash_murmur_32<SOPK_instruction>(instr);
case Format::EXP:
return hash_murmur_32<Export_instruction>(instr);
case Format::MUBUF:
return hash_murmur_32<MUBUF_instruction>(instr);
case Format::MIMG:
return hash_murmur_32<MIMG_instruction>(instr);
case Format::MTBUF:
return hash_murmur_32<MTBUF_instruction>(instr);
case Format::FLAT:
return hash_murmur_32<FLAT_instruction>(instr);
case Format::PSEUDO_BRANCH:
return hash_murmur_32<Pseudo_branch_instruction>(instr);
case Format::PSEUDO_REDUCTION:
return hash_murmur_32<Pseudo_reduction_instruction>(instr);
default:
return hash_murmur_32<Instruction>(instr);
case Format::SMEM: return hash_murmur_32<SMEM_instruction>(instr);
case Format::VINTRP: return hash_murmur_32<Interp_instruction>(instr);
case Format::DS: return hash_murmur_32<DS_instruction>(instr);
case Format::SOPP: return hash_murmur_32<SOPP_instruction>(instr);
case Format::SOPK: return hash_murmur_32<SOPK_instruction>(instr);
case Format::EXP: return hash_murmur_32<Export_instruction>(instr);
case Format::MUBUF: return hash_murmur_32<MUBUF_instruction>(instr);
case Format::MIMG: return hash_murmur_32<MIMG_instruction>(instr);
case Format::MTBUF: return hash_murmur_32<MTBUF_instruction>(instr);
case Format::FLAT: return hash_murmur_32<FLAT_instruction>(instr);
case Format::PSEUDO_BRANCH: return hash_murmur_32<Pseudo_branch_instruction>(instr);
case Format::PSEUDO_REDUCTION: return hash_murmur_32<Pseudo_reduction_instruction>(instr);
default: return hash_murmur_32<Instruction>(instr);
}
}
};
@ -129,7 +118,8 @@ struct InstrPred {
return false;
if (a->opcode != b->opcode)
return false;
if (a->operands.size() != b->operands.size() || a->definitions.size() != b->definitions.size())
if (a->operands.size() != b->operands.size() ||
a->definitions.size() != b->definitions.size())
return false; /* possible with pseudo-instructions */
for (unsigned i = 0; i < a->operands.size(); i++) {
if (a->operands[i].isConstant()) {
@ -137,14 +127,12 @@ struct InstrPred {
return false;
if (a->operands[i].constantValue() != b->operands[i].constantValue())
return false;
}
else if (a->operands[i].isTemp()) {
} else if (a->operands[i].isTemp()) {
if (!b->operands[i].isTemp())
return false;
if (a->operands[i].tempId() != b->operands[i].tempId())
return false;
}
else if (a->operands[i].isUndefined() ^ b->operands[i].isUndefined())
} else if (a->operands[i].isUndefined() ^ b->operands[i].isUndefined())
return false;
if (a->operands[i].isFixed()) {
if (!b->operands[i].isFixed())
@ -179,154 +167,110 @@ struct InstrPred {
VOP3_instruction& a3 = a->vop3();
VOP3_instruction& b3 = b->vop3();
for (unsigned i = 0; i < 3; i++) {
if (a3.abs[i] != b3.abs[i] ||
a3.neg[i] != b3.neg[i])
if (a3.abs[i] != b3.abs[i] || a3.neg[i] != b3.neg[i])
return false;
}
return a3.clamp == b3.clamp &&
a3.omod == b3.omod &&
a3.opsel == b3.opsel;
return a3.clamp == b3.clamp && a3.omod == b3.omod && a3.opsel == b3.opsel;
}
if (a->isDPP()) {
DPP_instruction& aDPP = a->dpp();
DPP_instruction& bDPP = b->dpp();
return aDPP.pass_flags == bDPP.pass_flags &&
aDPP.dpp_ctrl == bDPP.dpp_ctrl &&
aDPP.bank_mask == bDPP.bank_mask &&
aDPP.row_mask == bDPP.row_mask &&
aDPP.bound_ctrl == bDPP.bound_ctrl &&
aDPP.abs[0] == bDPP.abs[0] &&
aDPP.abs[1] == bDPP.abs[1] &&
aDPP.neg[0] == bDPP.neg[0] &&
return aDPP.pass_flags == bDPP.pass_flags && aDPP.dpp_ctrl == bDPP.dpp_ctrl &&
aDPP.bank_mask == bDPP.bank_mask && aDPP.row_mask == bDPP.row_mask &&
aDPP.bound_ctrl == bDPP.bound_ctrl && aDPP.abs[0] == bDPP.abs[0] &&
aDPP.abs[1] == bDPP.abs[1] && aDPP.neg[0] == bDPP.neg[0] &&
aDPP.neg[1] == bDPP.neg[1];
}
if (a->isSDWA()) {
SDWA_instruction& aSDWA = a->sdwa();
SDWA_instruction& bSDWA = b->sdwa();
return aSDWA.sel[0] == bSDWA.sel[0] &&
aSDWA.sel[1] == bSDWA.sel[1] &&
aSDWA.dst_sel == bSDWA.dst_sel &&
aSDWA.abs[0] == bSDWA.abs[0] &&
aSDWA.abs[1] == bSDWA.abs[1] &&
aSDWA.neg[0] == bSDWA.neg[0] &&
aSDWA.neg[1] == bSDWA.neg[1] &&
aSDWA.dst_preserve == bSDWA.dst_preserve &&
aSDWA.clamp == bSDWA.clamp &&
aSDWA.omod == bSDWA.omod;
return aSDWA.sel[0] == bSDWA.sel[0] && aSDWA.sel[1] == bSDWA.sel[1] &&
aSDWA.dst_sel == bSDWA.dst_sel && aSDWA.abs[0] == bSDWA.abs[0] &&
aSDWA.abs[1] == bSDWA.abs[1] && aSDWA.neg[0] == bSDWA.neg[0] &&
aSDWA.neg[1] == bSDWA.neg[1] && aSDWA.dst_preserve == bSDWA.dst_preserve &&
aSDWA.clamp == bSDWA.clamp && aSDWA.omod == bSDWA.omod;
}
switch (a->format) {
case Format::SOPK: {
if (a->opcode == aco_opcode::s_getreg_b32)
case Format::SOPK: {
if (a->opcode == aco_opcode::s_getreg_b32)
return false;
SOPK_instruction& aK = a->sopk();
SOPK_instruction& bK = b->sopk();
return aK.imm == bK.imm;
}
case Format::SMEM: {
SMEM_instruction& aS = a->smem();
SMEM_instruction& bS = b->smem();
/* isel shouldn't be creating situations where this assertion fails */
assert(aS.prevent_overflow == bS.prevent_overflow);
return aS.sync == bS.sync && aS.glc == bS.glc && aS.dlc == bS.dlc && aS.nv == bS.nv &&
aS.disable_wqm == bS.disable_wqm && aS.prevent_overflow == bS.prevent_overflow;
}
case Format::VINTRP: {
Interp_instruction& aI = a->vintrp();
Interp_instruction& bI = b->vintrp();
if (aI.attribute != bI.attribute)
return false;
if (aI.component != bI.component)
return false;
return true;
}
case Format::VOP3P: {
VOP3P_instruction& a3P = a->vop3p();
VOP3P_instruction& b3P = b->vop3p();
for (unsigned i = 0; i < 3; i++) {
if (a3P.neg_lo[i] != b3P.neg_lo[i] || a3P.neg_hi[i] != b3P.neg_hi[i])
return false;
SOPK_instruction& aK = a->sopk();
SOPK_instruction& bK = b->sopk();
return aK.imm == bK.imm;
}
case Format::SMEM: {
SMEM_instruction& aS = a->smem();
SMEM_instruction& bS = b->smem();
/* isel shouldn't be creating situations where this assertion fails */
assert(aS.prevent_overflow == bS.prevent_overflow);
return aS.sync == bS.sync && aS.glc == bS.glc && aS.dlc == bS.dlc &&
aS.nv == bS.nv && aS.disable_wqm == bS.disable_wqm &&
aS.prevent_overflow == bS.prevent_overflow;
}
case Format::VINTRP: {
Interp_instruction& aI = a->vintrp();
Interp_instruction& bI = b->vintrp();
if (aI.attribute != bI.attribute)
return false;
if (aI.component != bI.component)
return false;
return true;
}
case Format::VOP3P: {
VOP3P_instruction& a3P = a->vop3p();
VOP3P_instruction& b3P = b->vop3p();
for (unsigned i = 0; i < 3; i++) {
if (a3P.neg_lo[i] != b3P.neg_lo[i] ||
a3P.neg_hi[i] != b3P.neg_hi[i])
return false;
}
return a3P.opsel_lo == b3P.opsel_lo &&
a3P.opsel_hi == b3P.opsel_hi &&
a3P.clamp == b3P.clamp;
}
case Format::PSEUDO_REDUCTION: {
Pseudo_reduction_instruction& aR = a->reduction();
Pseudo_reduction_instruction& bR = b->reduction();
return aR.pass_flags == bR.pass_flags &&
aR.reduce_op == bR.reduce_op &&
aR.cluster_size == bR.cluster_size;
}
case Format::DS: {
assert(a->opcode == aco_opcode::ds_bpermute_b32 ||
a->opcode == aco_opcode::ds_permute_b32 ||
a->opcode == aco_opcode::ds_swizzle_b32);
DS_instruction& aD = a->ds();
DS_instruction& bD = b->ds();
return aD.sync == bD.sync &&
aD.pass_flags == bD.pass_flags &&
aD.gds == bD.gds &&
aD.offset0 == bD.offset0 &&
aD.offset1 == bD.offset1;
}
case Format::MTBUF: {
MTBUF_instruction& aM = a->mtbuf();
MTBUF_instruction& bM = b->mtbuf();
return aM.sync == bM.sync &&
aM.dfmt == bM.dfmt &&
aM.nfmt == bM.nfmt &&
aM.offset == bM.offset &&
aM.offen == bM.offen &&
aM.idxen == bM.idxen &&
aM.glc == bM.glc &&
aM.dlc == bM.dlc &&
aM.slc == bM.slc &&
aM.tfe == bM.tfe &&
aM.disable_wqm == bM.disable_wqm;
}
case Format::MUBUF: {
MUBUF_instruction& aM = a->mubuf();
MUBUF_instruction& bM = b->mubuf();
return aM.sync == bM.sync &&
aM.offset == bM.offset &&
aM.offen == bM.offen &&
aM.idxen == bM.idxen &&
aM.glc == bM.glc &&
aM.dlc == bM.dlc &&
aM.slc == bM.slc &&
aM.tfe == bM.tfe &&
aM.lds == bM.lds &&
aM.disable_wqm == bM.disable_wqm;
}
case Format::MIMG: {
MIMG_instruction& aM = a->mimg();
MIMG_instruction& bM = b->mimg();
return aM.sync == bM.sync &&
aM.dmask == bM.dmask &&
aM.unrm == bM.unrm &&
aM.glc == bM.glc &&
aM.slc == bM.slc &&
aM.tfe == bM.tfe &&
aM.da == bM.da &&
aM.lwe == bM.lwe &&
aM.r128 == bM.r128 &&
aM.a16 == bM.a16 &&
aM.d16 == bM.d16 &&
aM.disable_wqm == bM.disable_wqm;
}
case Format::FLAT:
case Format::GLOBAL:
case Format::SCRATCH:
case Format::EXP:
case Format::SOPP:
case Format::PSEUDO_BRANCH:
case Format::PSEUDO_BARRIER:
assert(false);
default:
return true;
return a3P.opsel_lo == b3P.opsel_lo && a3P.opsel_hi == b3P.opsel_hi &&
a3P.clamp == b3P.clamp;
}
case Format::PSEUDO_REDUCTION: {
Pseudo_reduction_instruction& aR = a->reduction();
Pseudo_reduction_instruction& bR = b->reduction();
return aR.pass_flags == bR.pass_flags && aR.reduce_op == bR.reduce_op &&
aR.cluster_size == bR.cluster_size;
}
case Format::DS: {
assert(a->opcode == aco_opcode::ds_bpermute_b32 ||
a->opcode == aco_opcode::ds_permute_b32 || a->opcode == aco_opcode::ds_swizzle_b32);
DS_instruction& aD = a->ds();
DS_instruction& bD = b->ds();
return aD.sync == bD.sync && aD.pass_flags == bD.pass_flags && aD.gds == bD.gds &&
aD.offset0 == bD.offset0 && aD.offset1 == bD.offset1;
}
case Format::MTBUF: {
MTBUF_instruction& aM = a->mtbuf();
MTBUF_instruction& bM = b->mtbuf();
return aM.sync == bM.sync && aM.dfmt == bM.dfmt && aM.nfmt == bM.nfmt &&
aM.offset == bM.offset && aM.offen == bM.offen && aM.idxen == bM.idxen &&
aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc && aM.tfe == bM.tfe &&
aM.disable_wqm == bM.disable_wqm;
}
case Format::MUBUF: {
MUBUF_instruction& aM = a->mubuf();
MUBUF_instruction& bM = b->mubuf();
return aM.sync == bM.sync && aM.offset == bM.offset && aM.offen == bM.offen &&
aM.idxen == bM.idxen && aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc &&
aM.tfe == bM.tfe && aM.lds == bM.lds && aM.disable_wqm == bM.disable_wqm;
}
case Format::MIMG: {
MIMG_instruction& aM = a->mimg();
MIMG_instruction& bM = b->mimg();
return aM.sync == bM.sync && aM.dmask == bM.dmask && aM.unrm == bM.unrm &&
aM.glc == bM.glc && aM.slc == bM.slc && aM.tfe == bM.tfe && aM.da == bM.da &&
aM.lwe == bM.lwe && aM.r128 == bM.r128 && aM.a16 == bM.a16 && aM.d16 == bM.d16 &&
aM.disable_wqm == bM.disable_wqm;
}
case Format::FLAT:
case Format::GLOBAL:
case Format::SCRATCH:
case Format::EXP:
case Format::SOPP:
case Format::PSEUDO_BRANCH:
case Format::PSEUDO_BARRIER: assert(false);
default: return true;
}
}
};
@ -345,7 +289,8 @@ struct vn_ctx {
*/
uint32_t exec_id = 1;
vn_ctx(Program* program_) : program(program_) {
vn_ctx(Program* program_) : program(program_)
{
static_assert(sizeof(Temp) == 4, "Temp must fit in 32bits");
unsigned size = 0;
for (Block& block : program->blocks)
@ -354,11 +299,11 @@ struct vn_ctx {
}
};
/* dominates() returns true if the parent block dominates the child block and
* if the parent block is part of the same loop or has a smaller loop nest depth.
*/
bool dominates(vn_ctx& ctx, uint32_t parent, uint32_t child)
bool
dominates(vn_ctx& ctx, uint32_t parent, uint32_t child)
{
unsigned parent_loop_nest_depth = ctx.program->blocks[parent].loop_nest_depth;
while (parent < child && parent_loop_nest_depth <= ctx.program->blocks[child].loop_nest_depth)
@ -375,42 +320,40 @@ bool dominates(vn_ctx& ctx, uint32_t parent, uint32_t child)
* Note that expr_set must not be used with instructions
* which cannot be eliminated.
*/
bool can_eliminate(aco_ptr<Instruction>& instr)
bool
can_eliminate(aco_ptr<Instruction>& instr)
{
switch (instr->format) {
case Format::FLAT:
case Format::GLOBAL:
case Format::SCRATCH:
case Format::EXP:
case Format::SOPP:
case Format::PSEUDO_BRANCH:
case Format::PSEUDO_BARRIER:
case Format::FLAT:
case Format::GLOBAL:
case Format::SCRATCH:
case Format::EXP:
case Format::SOPP:
case Format::PSEUDO_BRANCH:
case Format::PSEUDO_BARRIER: return false;
case Format::DS:
return instr->opcode == aco_opcode::ds_bpermute_b32 ||
instr->opcode == aco_opcode::ds_permute_b32 ||
instr->opcode == aco_opcode::ds_swizzle_b32;
case Format::SMEM:
case Format::MUBUF:
case Format::MIMG:
case Format::MTBUF:
if (!get_sync_info(instr.get()).can_reorder())
return false;
case Format::DS:
return instr->opcode == aco_opcode::ds_bpermute_b32 ||
instr->opcode == aco_opcode::ds_permute_b32 ||
instr->opcode == aco_opcode::ds_swizzle_b32;
case Format::SMEM:
case Format::MUBUF:
case Format::MIMG:
case Format::MTBUF:
if (!get_sync_info(instr.get()).can_reorder())
return false;
break;
default:
break;
break;
default: break;
}
if (instr->definitions.empty() ||
instr->opcode == aco_opcode::p_phi ||
instr->opcode == aco_opcode::p_linear_phi ||
instr->definitions[0].isNoCSE())
if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi ||
instr->opcode == aco_opcode::p_linear_phi || instr->definitions[0].isNoCSE())
return false;
return true;
}
void process_block(vn_ctx& ctx, Block& block)
void
process_block(vn_ctx& ctx, Block& block)
{
std::vector<aco_ptr<Instruction>> new_instructions;
new_instructions.reserve(block.instructions.size());
@ -435,8 +378,9 @@ void process_block(vn_ctx& ctx, Block& block)
}
/* simple copy-propagation through renaming */
bool copy_instr = instr->opcode == aco_opcode::p_parallelcopy ||
(instr->opcode == aco_opcode::p_create_vector && instr->operands.size() == 1);
bool copy_instr =
instr->opcode == aco_opcode::p_parallelcopy ||
(instr->opcode == aco_opcode::p_create_vector && instr->operands.size() == 1);
if (copy_instr && !instr->definitions[0].isFixed() && instr->operands[0].isTemp() &&
instr->operands[0].regClass() == instr->definitions[0].regClass()) {
ctx.renames[instr->definitions[0].tempId()] = instr->operands[0].getTemp();
@ -479,7 +423,8 @@ void process_block(vn_ctx& ctx, Block& block)
block.instructions = std::move(new_instructions);
}
void rename_phi_operands(Block& block, std::map<uint32_t, Temp>& renames)
void
rename_phi_operands(Block& block, std::map<uint32_t, Temp>& renames)
{
for (aco_ptr<Instruction>& phi : block.instructions) {
if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
@ -496,8 +441,8 @@ void rename_phi_operands(Block& block, std::map<uint32_t, Temp>& renames)
}
} /* end namespace */
void value_numbering(Program* program)
void
value_numbering(Program* program)
{
vn_ctx ctx(program);
std::vector<unsigned> loop_headers;
@ -521,10 +466,8 @@ void value_numbering(Program* program)
rename_phi_operands(block, ctx.renames);
/* increment exec_id when entering nested control flow */
if (block.kind & block_kind_branch ||
block.kind & block_kind_loop_preheader ||
block.kind & block_kind_break ||
block.kind & block_kind_continue ||
if (block.kind & block_kind_branch || block.kind & block_kind_loop_preheader ||
block.kind & block_kind_break || block.kind & block_kind_continue ||
block.kind & block_kind_discard)
ctx.exec_id++;
else if (block.kind & block_kind_continue_or_break)
@ -538,4 +481,4 @@ void value_numbering(Program* program)
}
}
}
} // namespace aco

File diff suppressed because it is too large.

@ -24,9 +24,9 @@
#include "aco_ir.h"
#include <bitset>
#include <algorithm>
#include <array>
#include <bitset>
#include <vector>
namespace aco {
@ -41,15 +41,14 @@ enum {
written_by_multiple_instrs = -4,
};
struct pr_opt_ctx
{
Program *program;
Block *current_block;
struct pr_opt_ctx {
Program* program;
Block* current_block;
int current_instr_idx;
std::vector<uint16_t> uses;
std::array<int, max_reg_cnt * 4u> instr_idx_by_regs;
void reset_block(Block *block)
void reset_block(Block* block)
{
current_block = block;
current_instr_idx = -1;
@ -57,9 +56,10 @@ struct pr_opt_ctx
}
};
void save_reg_writes(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
void
save_reg_writes(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
for (const Definition &def : instr->definitions) {
for (const Definition& def : instr->definitions) {
assert(def.regClass().type() != RegType::sgpr || def.physReg().reg() <= 255);
assert(def.regClass().type() != RegType::vgpr || def.physReg().reg() >= 256);
@ -75,20 +75,21 @@ void save_reg_writes(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
}
}
int last_writer_idx(pr_opt_ctx &ctx, PhysReg physReg, RegClass rc)
int
last_writer_idx(pr_opt_ctx& ctx, PhysReg physReg, RegClass rc)
{
/* Verify that all of the operand's registers are written by the same instruction. */
int instr_idx = ctx.instr_idx_by_regs[physReg.reg()];
unsigned dw_size = DIV_ROUND_UP(rc.bytes(), 4u);
unsigned r = physReg.reg();
bool all_same = std::all_of(
&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size],
[instr_idx](int i) { return i == instr_idx; });
bool all_same = std::all_of(&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size],
[instr_idx](int i) { return i == instr_idx; });
return all_same ? instr_idx : written_by_multiple_instrs;
}
int last_writer_idx(pr_opt_ctx &ctx, const Operand &op)
int
last_writer_idx(pr_opt_ctx& ctx, const Operand& op)
{
if (op.isConstant() || op.isUndefined())
return const_or_undef;
@ -104,7 +105,8 @@ int last_writer_idx(pr_opt_ctx &ctx, const Operand &op)
return instr_idx;
}
void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
void
try_apply_branch_vcc(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
/* We are looking for the following pattern:
*
@ -123,8 +125,7 @@ void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
if (ctx.program->chip_class < GFX8)
return;
if (instr->format != Format::PSEUDO_BRANCH ||
instr->operands.size() == 0 ||
if (instr->format != Format::PSEUDO_BRANCH || instr->operands.size() == 0 ||
instr->operands[0].physReg() != scc)
return;
@ -141,13 +142,12 @@ void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
last_exec_wr_idx > last_vcc_wr_idx || last_exec_wr_idx < not_written_in_block)
return;
aco_ptr<Instruction> &op0_instr = ctx.current_block->instructions[op0_instr_idx];
aco_ptr<Instruction> &last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx];
aco_ptr<Instruction>& op0_instr = ctx.current_block->instructions[op0_instr_idx];
aco_ptr<Instruction>& last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx];
if ((op0_instr->opcode != aco_opcode::s_and_b64 /* wave64 */ &&
op0_instr->opcode != aco_opcode::s_and_b32 /* wave32 */) ||
op0_instr->operands[0].physReg() != vcc ||
op0_instr->operands[1].physReg() != exec ||
op0_instr->operands[0].physReg() != vcc || op0_instr->operands[1].physReg() != exec ||
!last_vcc_wr->isVOPC())
return;
@ -159,7 +159,8 @@ void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
instr->operands[0] = op0_instr->operands[0];
}
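/* Schematic before/after of the pattern above (wave64):
 *
 *   v_cmp_lt_f32  vcc, v0, v1
 *   s_and_b64     s[0:1], vcc, exec   ; also writes scc
 *   p_cbranch_z   scc
 *
 * becomes
 *
 *   v_cmp_lt_f32  vcc, v0, v1
 *   p_cbranch_z   vcc                 ; lowered to s_cbranch_vccz
 *
 * assuming neither exec nor vcc is clobbered in between, which is exactly what
 * the writer-index checks above verify. */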
void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
void
try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
/* We are looking for the following pattern:
*
@ -180,8 +181,7 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
if (instr->isSOPC() &&
(instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 ||
instr->opcode == aco_opcode::s_cmp_lg_u32 || instr->opcode == aco_opcode::s_cmp_lg_i32 ||
instr->opcode == aco_opcode::s_cmp_eq_u64 ||
instr->opcode == aco_opcode::s_cmp_lg_u64) &&
instr->opcode == aco_opcode::s_cmp_eq_u64 || instr->opcode == aco_opcode::s_cmp_lg_u64) &&
(instr->operands[0].constantEquals(0) || instr->operands[1].constantEquals(0)) &&
(instr->operands[0].isTemp() || instr->operands[1].isTemp())) {
/* Make sure the constant is always in operand 1 */
@ -197,8 +197,9 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
if (wr_idx < 0 || wr_idx != sccwr_idx)
return;
aco_ptr<Instruction> &wr_instr = ctx.current_block->instructions[wr_idx];
if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 || wr_instr->definitions[1].physReg() != scc)
aco_ptr<Instruction>& wr_instr = ctx.current_block->instructions[wr_idx];
if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 ||
wr_instr->definitions[1].physReg() != scc)
return;
/* Look for instructions which set SCC := (D != 0) */
@ -232,10 +233,8 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
case aco_opcode::s_ashr_i32:
case aco_opcode::s_ashr_i64:
case aco_opcode::s_abs_i32:
case aco_opcode::s_absdiff_i32:
break;
default:
return;
case aco_opcode::s_absdiff_i32: break;
default: return;
}
/* Use the SCC def from wr_instr */
@ -245,13 +244,12 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
/* Set the opcode and operand to 32-bit */
instr->operands[1] = Operand(0u);
instr->opcode = (instr->opcode == aco_opcode::s_cmp_eq_u32 ||
instr->opcode == aco_opcode::s_cmp_eq_i32 ||
instr->opcode == aco_opcode::s_cmp_eq_u64)
? aco_opcode::s_cmp_eq_u32
: aco_opcode::s_cmp_lg_u32;
} else if ((instr->format == Format::PSEUDO_BRANCH &&
instr->operands.size() == 1 &&
instr->opcode =
(instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 ||
instr->opcode == aco_opcode::s_cmp_eq_u64)
? aco_opcode::s_cmp_eq_u32
: aco_opcode::s_cmp_lg_u32;
} else if ((instr->format == Format::PSEUDO_BRANCH && instr->operands.size() == 1 &&
instr->operands[0].physReg() == scc) ||
instr->opcode == aco_opcode::s_cselect_b32) {
@ -265,10 +263,11 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
if (wr_idx < 0)
return;
aco_ptr<Instruction> &wr_instr = ctx.current_block->instructions[wr_idx];
aco_ptr<Instruction>& wr_instr = ctx.current_block->instructions[wr_idx];
/* Check if we found the pattern above. */
if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 && wr_instr->opcode != aco_opcode::s_cmp_lg_u32)
if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 &&
wr_instr->opcode != aco_opcode::s_cmp_lg_u32)
return;
if (wr_instr->operands[0].physReg() != scc)
return;
@ -282,11 +281,13 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
if (wr_instr->opcode == aco_opcode::s_cmp_eq_u32) {
/* Flip the meaning of the instruction to correctly use the SCC. */
if (instr->format == Format::PSEUDO_BRANCH)
instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz : aco_opcode::p_cbranch_z;
instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz
: aco_opcode::p_cbranch_z;
else if (instr->opcode == aco_opcode::s_cselect_b32)
std::swap(instr->operands[0], instr->operands[1]);
else
unreachable("scc_nocompare optimization is only implemented for p_cbranch and s_cselect");
unreachable(
"scc_nocompare optimization is only implemented for p_cbranch and s_cselect");
}
/* Use the SCC def from the original instruction, not the comparison */
@ -295,7 +296,8 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
}
}
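/* Schematic before/after of the first pattern (many SALU ops already set
 * scc = (result != 0), making the comparison redundant):
 *
 *   s_and_b32     s0, s1, s2          ; writes scc = (s0 != 0)
 *   s_cmp_lg_u32  s0, 0
 *   p_cbranch_z   scc
 *
 * becomes
 *
 *   s_and_b32     s0, s1, s2
 *   p_cbranch_z   scc                 ; reuses the s_and's scc definition
 */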
void process_instruction(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
void
process_instruction(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
{
ctx.current_instr_idx++;
@ -307,9 +309,10 @@ void process_instruction(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
save_reg_writes(ctx, instr);
}
} /* End of empty namespace */
} // namespace
void optimize_postRA(Program* program)
void
optimize_postRA(Program* program)
{
pr_opt_ctx ctx;
ctx.program = program;
@ -319,10 +322,10 @@ void optimize_postRA(Program* program)
* Goes through each instruction exactly once, and can transform
* instructions or adjust the use counts of temps.
*/
for (auto &block : program->blocks) {
for (auto& block : program->blocks) {
ctx.reset_block(&block);
for (aco_ptr<Instruction> &instr : block.instructions)
for (aco_ptr<Instruction>& instr : block.instructions)
process_instruction(ctx, instr);
}
@ -330,13 +333,12 @@ void optimize_postRA(Program* program)
* Gets rid of instructions which are manually deleted or
* no longer have any uses.
*/
for (auto &block : program->blocks) {
auto new_end = std::remove_if(
block.instructions.begin(), block.instructions.end(),
[&ctx](const aco_ptr<Instruction> &instr) { return !instr || is_dead(ctx.uses, instr.get()); });
for (auto& block : program->blocks) {
auto new_end = std::remove_if(block.instructions.begin(), block.instructions.end(),
[&ctx](const aco_ptr<Instruction>& instr)
{ return !instr || is_dead(ctx.uses, instr.get()); });
block.instructions.resize(new_end - block.instructions.begin());
}
}
} /* End of aco namespace */
} // namespace aco

View File

@ -39,17 +39,17 @@ namespace {
/* LLVM disassembler only supports GFX8+, try to disassemble with CLRXdisasm
* for GFX6-GFX7 if found on the system, this is better than nothing.
*/
bool print_asm_gfx6_gfx7(Program *program, std::vector<uint32_t>& binary,
FILE *output)
*/
bool
print_asm_gfx6_gfx7(Program* program, std::vector<uint32_t>& binary, FILE* output)
{
#ifdef _WIN32
return true;
#else
char path[] = "/tmp/fileXXXXXX";
char line[2048], command[128];
const char *gpu_type;
FILE *p;
const char* gpu_type;
FILE* p;
int fd;
/* Dump the binary into a temporary file. */
@ -57,8 +57,7 @@ bool print_asm_gfx6_gfx7(Program *program, std::vector<uint32_t>& binary,
if (fd < 0)
return true;
for (uint32_t w : binary)
{
for (uint32_t w : binary) {
if (write(fd, &w, sizeof(w)) == -1)
goto fail;
}
@ -69,30 +68,16 @@ bool print_asm_gfx6_gfx7(Program *program, std::vector<uint32_t>& binary,
switch (program->chip_class) {
case GFX6:
switch (program->family) {
case CHIP_TAHITI:
gpu_type = "tahiti";
break;
case CHIP_PITCAIRN:
gpu_type = "pitcairn";
break;
case CHIP_VERDE:
gpu_type = "capeverde";
break;
case CHIP_OLAND:
gpu_type = "oland";
break;
case CHIP_HAINAN:
gpu_type = "hainan";
break;
default:
unreachable("Invalid GFX6 family!");
case CHIP_TAHITI: gpu_type = "tahiti"; break;
case CHIP_PITCAIRN: gpu_type = "pitcairn"; break;
case CHIP_VERDE: gpu_type = "capeverde"; break;
case CHIP_OLAND: gpu_type = "oland"; break;
case CHIP_HAINAN: gpu_type = "hainan"; break;
default: unreachable("Invalid GFX6 family!");
}
break;
case GFX7:
gpu_type = "gfx700";
break;
default:
unreachable("Invalid chip class!");
case GFX7: gpu_type = "gfx700"; break;
default: unreachable("Invalid chip class!");
}
sprintf(command, "clrxdisasm --gpuType=%s -r %s", gpu_type, path);
@ -121,22 +106,21 @@ fail:
#endif
}
std::pair<bool, size_t> disasm_instr(chip_class chip, LLVMDisasmContextRef disasm,
uint32_t *binary, unsigned exec_size, size_t pos,
char *outline, unsigned outline_size)
std::pair<bool, size_t>
disasm_instr(chip_class chip, LLVMDisasmContextRef disasm, uint32_t* binary, unsigned exec_size,
size_t pos, char* outline, unsigned outline_size)
{
/* mask out src2 on v_writelane_b32 */
if (((chip == GFX8 || chip == GFX9) && (binary[pos] & 0xffff8000) == 0xd28a0000) ||
(chip >= GFX10 && (binary[pos] & 0xffff8000) == 0xd7610000)) {
binary[pos+1] = binary[pos+1] & 0xF803FFFF;
binary[pos + 1] = binary[pos + 1] & 0xF803FFFF;
}
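   /* Illustrative note (not from this commit): 0xF803FFFF clears bits [26:18]
    * of the second VOP3 dword, which hold src2, so the disassembler is not
    * fed the stale lane-select source. */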
size_t l = LLVMDisasmInstruction(disasm, (uint8_t *) &binary[pos],
(exec_size - pos) * sizeof(uint32_t), pos * 4,
outline, outline_size);
size_t l =
LLVMDisasmInstruction(disasm, (uint8_t*)&binary[pos], (exec_size - pos) * sizeof(uint32_t),
pos * 4, outline, outline_size);
if (chip >= GFX10 && l == 8 &&
((binary[pos] & 0xffff0000) == 0xd7610000) &&
if (chip >= GFX10 && l == 8 && ((binary[pos] & 0xffff0000) == 0xd7610000) &&
((binary[pos + 1] & 0x1ff) == 0xff)) {
/* v_writelane with literal uses 3 dwords but llvm consumes only 2 */
l += 4;
@ -145,14 +129,14 @@ std::pair<bool, size_t> disasm_instr(chip_class chip, LLVMDisasmContextRef disas
bool invalid = false;
size_t size;
if (!l &&
((chip >= GFX9 && (binary[pos] & 0xffff8000) == 0xd1348000) || /* v_add_u32_e64 + clamp */
((chip >= GFX9 && (binary[pos] & 0xffff8000) == 0xd1348000) || /* v_add_u32_e64 + clamp */
(chip >= GFX10 && (binary[pos] & 0xffff8000) == 0xd7038000) || /* v_add_u16_e64 + clamp */
(chip <= GFX9 && (binary[pos] & 0xffff8000) == 0xd1268000) || /* v_add_u16_e64 + clamp */
(chip <= GFX9 && (binary[pos] & 0xffff8000) == 0xd1268000) || /* v_add_u16_e64 + clamp */
(chip >= GFX10 && (binary[pos] & 0xffff8000) == 0xd76d8000) || /* v_add3_u32 + clamp */
(chip == GFX9 && (binary[pos] & 0xffff8000) == 0xd1ff8000)) /* v_add3_u32 + clamp */) {
strcpy(outline, "\tinteger addition + clamp");
bool has_literal = chip >= GFX10 &&
(((binary[pos+1] & 0x1ff) == 0xff) || (((binary[pos+1] >> 9) & 0x1ff) == 0xff));
bool has_literal = chip >= GFX10 && (((binary[pos + 1] & 0x1ff) == 0xff) ||
(((binary[pos + 1] >> 9) & 0x1ff) == 0xff));
size = 2 + has_literal;
} else if (chip >= GFX10 && l == 4 && ((binary[pos] & 0xfe0001ff) == 0x020000f9)) {
strcpy(outline, "\tv_cndmask_b32 + sdwa");
@ -170,8 +154,8 @@ std::pair<bool, size_t> disasm_instr(chip_class chip, LLVMDisasmContextRef disas
}
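/* Illustrative note (not from this commit): the 0x1ff masks above test 9-bit
 * VOP3 source fields; the value 255 in such a field is the literal-constant
 * encoding, which is why those instructions consume one extra dword. */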
} /* end namespace */
bool print_asm(Program *program, std::vector<uint32_t>& binary,
unsigned exec_size, FILE *output)
bool
print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output)
{
if (program->chip_class <= GFX7) {
/* Do not abort if clrxdisasm isn't found. */
@ -187,7 +171,7 @@ bool print_asm(Program *program, std::vector<uint32_t>& binary,
}
std::vector<llvm::SymbolInfoTy> symbols;
std::vector<std::array<char,16>> block_names;
std::vector<std::array<char, 16>> block_names;
block_names.reserve(program->blocks.size());
for (Block& block : program->blocks) {
if (!referenced_blocks[block.index])
@ -195,18 +179,18 @@ bool print_asm(Program *program, std::vector<uint32_t>& binary,
std::array<char, 16> name;
sprintf(name.data(), "BB%u", block.index);
block_names.push_back(name);
symbols.emplace_back(block.offset * 4, llvm::StringRef(block_names[block_names.size() - 1].data()), 0);
symbols.emplace_back(block.offset * 4,
llvm::StringRef(block_names[block_names.size() - 1].data()), 0);
}
const char *features = "";
const char* features = "";
if (program->chip_class >= GFX10 && program->wave_size == 64) {
features = "+wavefrontsize64";
}
LLVMDisasmContextRef disasm = LLVMCreateDisasmCPUFeatures("amdgcn-mesa-mesa3d",
ac_get_llvm_processor_name(program->family),
features,
&symbols, 0, NULL, NULL);
LLVMDisasmContextRef disasm =
LLVMCreateDisasmCPUFeatures("amdgcn-mesa-mesa3d", ac_get_llvm_processor_name(program->family),
features, &symbols, 0, NULL, NULL);
size_t pos = 0;
bool invalid = false;
@ -216,7 +200,8 @@ bool print_asm(Program *program, std::vector<uint32_t>& binary,
unsigned prev_pos = 0;
unsigned repeat_count = 0;
while (pos < exec_size) {
bool new_block = next_block < program->blocks.size() && pos == program->blocks[next_block].offset;
bool new_block =
next_block < program->blocks.size() && pos == program->blocks[next_block].offset;
if (pos + prev_size <= exec_size && prev_pos != pos && !new_block &&
memcmp(&binary[prev_pos], &binary[pos], prev_size * 4) == 0) {
repeat_count++;
@ -235,8 +220,8 @@ bool print_asm(Program *program, std::vector<uint32_t>& binary,
}
char outline[1024];
std::pair<bool, size_t> res = disasm_instr(
program->chip_class, disasm, binary.data(), exec_size, pos, outline, sizeof(outline));
std::pair<bool, size_t> res = disasm_instr(program->chip_class, disasm, binary.data(),
exec_size, pos, outline, sizeof(outline));
invalid |= res.first;
fprintf(output, "%-60s ;", outline);
@ -271,4 +256,4 @@ bool print_asm(Program *program, std::vector<uint32_t>& binary,
return invalid;
}
}
} // namespace aco

View File

@ -86,36 +86,38 @@ const std::array<const char*, num_reduce_ops> reduce_ops = []()
return ret;
}();
static void print_reg_class(const RegClass rc, FILE *output)
static void
print_reg_class(const RegClass rc, FILE* output)
{
switch (rc) {
case RegClass::s1: fprintf(output, " s1: "); return;
case RegClass::s2: fprintf(output, " s2: "); return;
case RegClass::s3: fprintf(output, " s3: "); return;
case RegClass::s4: fprintf(output, " s4: "); return;
case RegClass::s6: fprintf(output, " s6: "); return;
case RegClass::s8: fprintf(output, " s8: "); return;
case RegClass::s16: fprintf(output, "s16: "); return;
case RegClass::v1: fprintf(output, " v1: "); return;
case RegClass::v2: fprintf(output, " v2: "); return;
case RegClass::v3: fprintf(output, " v3: "); return;
case RegClass::v4: fprintf(output, " v4: "); return;
case RegClass::v5: fprintf(output, " v5: "); return;
case RegClass::v6: fprintf(output, " v6: "); return;
case RegClass::v7: fprintf(output, " v7: "); return;
case RegClass::v8: fprintf(output, " v8: "); return;
case RegClass::v1b: fprintf(output, " v1b: "); return;
case RegClass::v2b: fprintf(output, " v2b: "); return;
case RegClass::v3b: fprintf(output, " v3b: "); return;
case RegClass::v4b: fprintf(output, " v4b: "); return;
case RegClass::v6b: fprintf(output, " v6b: "); return;
case RegClass::v8b: fprintf(output, " v8b: "); return;
case RegClass::v1_linear: fprintf(output, " v1: "); return;
case RegClass::v2_linear: fprintf(output, " v2: "); return;
case RegClass::s1: fprintf(output, " s1: "); return;
case RegClass::s2: fprintf(output, " s2: "); return;
case RegClass::s3: fprintf(output, " s3: "); return;
case RegClass::s4: fprintf(output, " s4: "); return;
case RegClass::s6: fprintf(output, " s6: "); return;
case RegClass::s8: fprintf(output, " s8: "); return;
case RegClass::s16: fprintf(output, "s16: "); return;
case RegClass::v1: fprintf(output, " v1: "); return;
case RegClass::v2: fprintf(output, " v2: "); return;
case RegClass::v3: fprintf(output, " v3: "); return;
case RegClass::v4: fprintf(output, " v4: "); return;
case RegClass::v5: fprintf(output, " v5: "); return;
case RegClass::v6: fprintf(output, " v6: "); return;
case RegClass::v7: fprintf(output, " v7: "); return;
case RegClass::v8: fprintf(output, " v8: "); return;
case RegClass::v1b: fprintf(output, " v1b: "); return;
case RegClass::v2b: fprintf(output, " v2b: "); return;
case RegClass::v3b: fprintf(output, " v3b: "); return;
case RegClass::v4b: fprintf(output, " v4b: "); return;
case RegClass::v6b: fprintf(output, " v6b: "); return;
case RegClass::v8b: fprintf(output, " v8b: "); return;
case RegClass::v1_linear: fprintf(output, " v1: "); return;
case RegClass::v2_linear: fprintf(output, " v2: "); return;
}
}
void print_physReg(PhysReg reg, unsigned bytes, FILE *output, unsigned flags)
void
print_physReg(PhysReg reg, unsigned bytes, FILE* output, unsigned flags)
{
if (reg == 124) {
fprintf(output, "m0");
@ -134,16 +136,17 @@ void print_physReg(PhysReg reg, unsigned bytes, FILE *output, unsigned flags)
} else {
fprintf(output, "%c[%d", is_vgpr ? 'v' : 's', r);
if (size > 1)
fprintf(output, "-%d]", r + size -1);
fprintf(output, "-%d]", r + size - 1);
else
fprintf(output, "]");
}
if (reg.byte() || bytes % 4)
fprintf(output, "[%d:%d]", reg.byte()*8, (reg.byte()+bytes) * 8);
fprintf(output, "[%d:%d]", reg.byte() * 8, (reg.byte() + bytes) * 8);
}
}
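/* Illustrative sketch (not from this commit), assuming `size` above is the
 * dword-rounded register count: a 2-byte value at byte offset 2 of v5 prints
 * as "v[5][16:32]", the suffix being reg.byte()*8 : (reg.byte()+bytes)*8. */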
static void print_constant(uint8_t reg, FILE *output)
static void
print_constant(uint8_t reg, FILE* output)
{
if (reg >= 128 && reg <= 192) {
fprintf(output, "%d", reg - 128);
@ -154,37 +157,20 @@ static void print_constant(uint8_t reg, FILE *output)
}
switch (reg) {
case 240:
fprintf(output, "0.5");
break;
case 241:
fprintf(output, "-0.5");
break;
case 242:
fprintf(output, "1.0");
break;
case 243:
fprintf(output, "-1.0");
break;
case 244:
fprintf(output, "2.0");
break;
case 245:
fprintf(output, "-2.0");
break;
case 246:
fprintf(output, "4.0");
break;
case 247:
fprintf(output, "-4.0");
break;
case 248:
fprintf(output, "1/(2*PI)");
break;
case 240: fprintf(output, "0.5"); break;
case 241: fprintf(output, "-0.5"); break;
case 242: fprintf(output, "1.0"); break;
case 243: fprintf(output, "-1.0"); break;
case 244: fprintf(output, "2.0"); break;
case 245: fprintf(output, "-2.0"); break;
case 246: fprintf(output, "4.0"); break;
case 247: fprintf(output, "-4.0"); break;
case 248: fprintf(output, "1/(2*PI)"); break;
}
}
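/* Illustrative usage (not from this commit), based on the cases above: */
// print_constant(130, stdout); /* 130 - 128       -> prints "2"   */
// print_constant(242, stdout); /* inline constant -> prints "1.0" */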
void aco_print_operand(const Operand *operand, FILE *output, unsigned flags)
void
aco_print_operand(const Operand* operand, FILE* output, unsigned flags)
{
if (operand->isLiteral() || (operand->isConstant() && operand->bytes() == 1)) {
if (operand->bytes() == 1)
@ -216,7 +202,8 @@ void aco_print_operand(const Operand *operand, FILE *output, unsigned flags)
}
}
static void print_definition(const Definition *definition, FILE *output, unsigned flags)
static void
print_definition(const Definition* definition, FILE* output, unsigned flags)
{
if (!(flags & print_no_ssa))
print_reg_class(definition->regClass(), output);
@ -235,7 +222,8 @@ static void print_definition(const Definition *definition, FILE *output, unsigne
print_physReg(definition->physReg(), definition->bytes(), output, flags);
}
static void print_storage(storage_class storage, FILE *output)
static void
print_storage(storage_class storage, FILE* output)
{
fprintf(output, " storage:");
int printed = 0;
@ -255,7 +243,8 @@ static void print_storage(storage_class storage, FILE *output)
printed += fprintf(output, "%svgpr_spill", printed ? "," : "");
}
static void print_semantics(memory_semantics sem, FILE *output)
static void
print_semantics(memory_semantics sem, FILE* output)
{
fprintf(output, " semantics:");
int printed = 0;
@ -275,36 +264,29 @@ static void print_semantics(memory_semantics sem, FILE *output)
printed += fprintf(output, "%srmw", printed ? "," : "");
}
static void print_scope(sync_scope scope, FILE *output, const char *prefix="scope")
static void
print_scope(sync_scope scope, FILE* output, const char* prefix = "scope")
{
fprintf(output, " %s:", prefix);
switch (scope) {
case scope_invocation:
fprintf(output, "invocation");
break;
case scope_subgroup:
fprintf(output, "subgroup");
break;
case scope_workgroup:
fprintf(output, "workgroup");
break;
case scope_queuefamily:
fprintf(output, "queuefamily");
break;
case scope_device:
fprintf(output, "device");
break;
case scope_invocation: fprintf(output, "invocation"); break;
case scope_subgroup: fprintf(output, "subgroup"); break;
case scope_workgroup: fprintf(output, "workgroup"); break;
case scope_queuefamily: fprintf(output, "queuefamily"); break;
case scope_device: fprintf(output, "device"); break;
}
}
static void print_sync(memory_sync_info sync, FILE *output)
static void
print_sync(memory_sync_info sync, FILE* output)
{
print_storage(sync.storage, output);
print_semantics(sync.semantics, output);
print_scope(sync.scope, output);
}
static void print_instr_format_specific(const Instruction *instr, FILE *output)
static void
print_instr_format_specific(const Instruction* instr, FILE* output)
{
switch (instr->format) {
case Format::SOPK: {
@ -319,9 +301,12 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
/* we usually should check the chip class for vmcnt/lgkm, but
* insert_waitcnt() should fill it in regardless. */
unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10);
if (vmcnt != 63) fprintf(output, " vmcnt(%d)", vmcnt);
if (((imm >> 4) & 0x7) < 0x7) fprintf(output, " expcnt(%d)", (imm >> 4) & 0x7);
if (((imm >> 8) & 0x3F) < 0x3F) fprintf(output, " lgkmcnt(%d)", (imm >> 8) & 0x3F);
if (vmcnt != 63)
fprintf(output, " vmcnt(%d)", vmcnt);
if (((imm >> 4) & 0x7) < 0x7)
fprintf(output, " expcnt(%d)", (imm >> 4) & 0x7);
if (((imm >> 8) & 0x3F) < 0x3F)
fprintf(output, " lgkmcnt(%d)", (imm >> 8) & 0x3F);
break;
}
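         /* Illustrative sketch (not from this commit), assuming the bit
          * layout used above: vmcnt spans imm[3:0] and imm[15:14], expcnt is
          * imm[6:4], lgkmcnt is imm[13:8]. E.g. imm = 0xC00F gives
          * vmcnt = 0xF | (0xC000 >> 10) = 63, the "nothing outstanding"
          * value, so no vmcnt() is printed. */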
case aco_opcode::s_endpgm:
@ -337,35 +322,21 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
case aco_opcode::s_sendmsg: {
unsigned id = imm & sendmsg_id_mask;
switch (id) {
case sendmsg_none:
fprintf(output, " sendmsg(MSG_NONE)");
break;
case sendmsg_none: fprintf(output, " sendmsg(MSG_NONE)"); break;
case _sendmsg_gs:
fprintf(output, " sendmsg(gs%s%s, %u)",
imm & 0x10 ? ", cut" : "", imm & 0x20 ? ", emit" : "", imm >> 8);
fprintf(output, " sendmsg(gs%s%s, %u)", imm & 0x10 ? ", cut" : "",
imm & 0x20 ? ", emit" : "", imm >> 8);
break;
case _sendmsg_gs_done:
fprintf(output, " sendmsg(gs_done%s%s, %u)",
imm & 0x10 ? ", cut" : "", imm & 0x20 ? ", emit" : "", imm >> 8);
break;
case sendmsg_save_wave:
fprintf(output, " sendmsg(save_wave)");
break;
case sendmsg_stall_wave_gen:
fprintf(output, " sendmsg(stall_wave_gen)");
break;
case sendmsg_halt_waves:
fprintf(output, " sendmsg(halt_waves)");
break;
case sendmsg_ordered_ps_done:
fprintf(output, " sendmsg(ordered_ps_done)");
break;
case sendmsg_early_prim_dealloc:
fprintf(output, " sendmsg(early_prim_dealloc)");
break;
case sendmsg_gs_alloc_req:
fprintf(output, " sendmsg(gs_alloc_req)");
fprintf(output, " sendmsg(gs_done%s%s, %u)", imm & 0x10 ? ", cut" : "",
imm & 0x20 ? ", emit" : "", imm >> 8);
break;
case sendmsg_save_wave: fprintf(output, " sendmsg(save_wave)"); break;
case sendmsg_stall_wave_gen: fprintf(output, " sendmsg(stall_wave_gen)"); break;
case sendmsg_halt_waves: fprintf(output, " sendmsg(halt_waves)"); break;
case sendmsg_ordered_ps_done: fprintf(output, " sendmsg(ordered_ps_done)"); break;
case sendmsg_early_prim_dealloc: fprintf(output, " sendmsg(early_prim_dealloc)"); break;
case sendmsg_gs_alloc_req: fprintf(output, " sendmsg(gs_alloc_req)"); break;
}
break;
}
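      /* Illustrative example (not from this commit) of the decoding above:
       * imm = _sendmsg_gs | 0x20 | (1 << 8) prints " sendmsg(gs, emit, 1)". */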
@ -433,40 +404,21 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
}
case Format::MIMG: {
const MIMG_instruction& mimg = instr->mimg();
unsigned identity_dmask = !instr->definitions.empty() ?
(1 << instr->definitions[0].size()) - 1 :
0xf;
unsigned identity_dmask =
!instr->definitions.empty() ? (1 << instr->definitions[0].size()) - 1 : 0xf;
if ((mimg.dmask & identity_dmask) != identity_dmask)
fprintf(output, " dmask:%s%s%s%s",
mimg.dmask & 0x1 ? "x" : "",
mimg.dmask & 0x2 ? "y" : "",
mimg.dmask & 0x4 ? "z" : "",
fprintf(output, " dmask:%s%s%s%s", mimg.dmask & 0x1 ? "x" : "",
mimg.dmask & 0x2 ? "y" : "", mimg.dmask & 0x4 ? "z" : "",
mimg.dmask & 0x8 ? "w" : "");
switch (mimg.dim) {
case ac_image_1d:
fprintf(output, " 1d");
break;
case ac_image_2d:
fprintf(output, " 2d");
break;
case ac_image_3d:
fprintf(output, " 3d");
break;
case ac_image_cube:
fprintf(output, " cube");
break;
case ac_image_1darray:
fprintf(output, " 1darray");
break;
case ac_image_2darray:
fprintf(output, " 2darray");
break;
case ac_image_2dmsaa:
fprintf(output, " 2dmsaa");
break;
case ac_image_2darraymsaa:
fprintf(output, " 2darraymsaa");
break;
case ac_image_1d: fprintf(output, " 1d"); break;
case ac_image_2d: fprintf(output, " 2d"); break;
case ac_image_3d: fprintf(output, " 3d"); break;
case ac_image_cube: fprintf(output, " cube"); break;
case ac_image_1darray: fprintf(output, " 1darray"); break;
case ac_image_2darray: fprintf(output, " 2darray"); break;
case ac_image_2dmsaa: fprintf(output, " 2dmsaa"); break;
case ac_image_2darraymsaa: fprintf(output, " 2darraymsaa"); break;
}
if (mimg.unrm)
fprintf(output, " unrm");
@ -495,10 +447,8 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
const Export_instruction& exp = instr->exp();
unsigned identity_mask = exp.compressed ? 0x5 : 0xf;
if ((exp.enabled_mask & identity_mask) != identity_mask)
fprintf(output, " en:%c%c%c%c",
exp.enabled_mask & 0x1 ? 'r' : '*',
exp.enabled_mask & 0x2 ? 'g' : '*',
exp.enabled_mask & 0x4 ? 'b' : '*',
fprintf(output, " en:%c%c%c%c", exp.enabled_mask & 0x1 ? 'r' : '*',
exp.enabled_mask & 0x2 ? 'g' : '*', exp.enabled_mask & 0x4 ? 'b' : '*',
exp.enabled_mask & 0x8 ? 'a' : '*');
if (exp.compressed)
fprintf(output, " compr");
@ -624,15 +574,9 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
if (instr->isVOP3()) {
const VOP3_instruction& vop3 = instr->vop3();
switch (vop3.omod) {
case 1:
fprintf(output, " *2");
break;
case 2:
fprintf(output, " *4");
break;
case 3:
fprintf(output, " *0.5");
break;
case 1: fprintf(output, " *2"); break;
case 2: fprintf(output, " *4"); break;
case 3: fprintf(output, " *0.5"); break;
}
if (vop3.clamp)
fprintf(output, " clamp");
@ -641,8 +585,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
} else if (instr->isDPP()) {
const DPP_instruction& dpp = instr->dpp();
if (dpp.dpp_ctrl <= 0xff) {
fprintf(output, " quad_perm:[%d,%d,%d,%d]",
dpp.dpp_ctrl & 0x3, (dpp.dpp_ctrl >> 2) & 0x3,
fprintf(output, " quad_perm:[%d,%d,%d,%d]", dpp.dpp_ctrl & 0x3, (dpp.dpp_ctrl >> 2) & 0x3,
(dpp.dpp_ctrl >> 4) & 0x3, (dpp.dpp_ctrl >> 6) & 0x3);
} else if (dpp.dpp_ctrl >= 0x101 && dpp.dpp_ctrl <= 0x10f) {
fprintf(output, " row_shl:%d", dpp.dpp_ctrl & 0xf);
@ -678,21 +621,14 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
} else if (instr->isSDWA()) {
const SDWA_instruction& sdwa = instr->sdwa();
switch (sdwa.omod) {
case 1:
fprintf(output, " *2");
break;
case 2:
fprintf(output, " *4");
break;
case 3:
fprintf(output, " *0.5");
break;
case 1: fprintf(output, " *2"); break;
case 2: fprintf(output, " *4"); break;
case 3: fprintf(output, " *0.5"); break;
}
if (sdwa.clamp)
fprintf(output, " clamp");
switch (sdwa.dst_sel & sdwa_asuint) {
case sdwa_udword:
break;
case sdwa_udword: break;
case sdwa_ubyte0:
case sdwa_ubyte1:
case sdwa_ubyte2:
@ -711,7 +647,8 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
}
}
void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags)
void
aco_print_instr(const Instruction* instr, FILE* output, unsigned flags)
{
if (!instr->definitions.empty()) {
for (unsigned i = 0; i < instr->definitions.size(); ++i) {
@ -723,10 +660,10 @@ void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags)
}
fprintf(output, "%s", instr_info.name[(int)instr->opcode]);
if (instr->operands.size()) {
bool *const abs = (bool *)alloca(instr->operands.size() * sizeof(bool));
bool *const neg = (bool *)alloca(instr->operands.size() * sizeof(bool));
bool *const opsel = (bool *)alloca(instr->operands.size() * sizeof(bool));
uint8_t *const sel = (uint8_t *)alloca(instr->operands.size() * sizeof(uint8_t));
bool* const abs = (bool*)alloca(instr->operands.size() * sizeof(bool));
bool* const neg = (bool*)alloca(instr->operands.size() * sizeof(bool));
bool* const opsel = (bool*)alloca(instr->operands.size() * sizeof(bool));
uint8_t* const sel = (uint8_t*)alloca(instr->operands.size() * sizeof(uint8_t));
for (unsigned i = 0; i < instr->operands.size(); ++i) {
abs[i] = false;
neg[i] = false;
@ -792,8 +729,7 @@ void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags)
if (instr->isVOP3P()) {
const VOP3P_instruction& vop3 = instr->vop3p();
if ((vop3.opsel_lo & (1 << i)) || !(vop3.opsel_hi & (1 << i))) {
fprintf(output, ".%c%c",
vop3.opsel_lo & (1 << i) ? 'y' : 'x',
fprintf(output, ".%c%c", vop3.opsel_lo & (1 << i) ? 'y' : 'x',
vop3.opsel_hi & (1 << i) ? 'y' : 'x');
}
if (vop3.neg_lo[i] && vop3.neg_hi[i])
@ -808,7 +744,8 @@ void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags)
print_instr_format_specific(instr, output);
}
static void print_block_kind(uint16_t kind, FILE *output)
static void
print_block_kind(uint16_t kind, FILE* output)
{
if (kind & block_kind_uniform)
fprintf(output, "uniform, ");
@ -844,7 +781,8 @@ static void print_block_kind(uint16_t kind, FILE *output)
fprintf(output, "export_end, ");
}
static void print_stage(Stage stage, FILE *output)
static void
print_stage(Stage stage, FILE* output)
{
fprintf(output, "ACO shader stage: ");
@ -888,7 +826,8 @@ static void print_stage(Stage stage, FILE *output)
fprintf(output, "\n");
}
void aco_print_block(const Block* block, FILE *output, unsigned flags, const live& live_vars)
void
aco_print_block(const Block* block, FILE* output, unsigned flags, const live& live_vars)
{
fprintf(output, "BB%d\n", block->index);
fprintf(output, "/* logical preds: ");
@ -927,19 +866,16 @@ void aco_print_block(const Block* block, FILE *output, unsigned flags, const liv
}
}
void aco_print_program(const Program *program, FILE *output, const live& live_vars, unsigned flags)
void
aco_print_program(const Program* program, FILE* output, const live& live_vars, unsigned flags)
{
switch (program->progress) {
case CompilationProgress::after_isel:
fprintf(output, "After Instruction Selection:\n");
break;
case CompilationProgress::after_isel: fprintf(output, "After Instruction Selection:\n"); break;
case CompilationProgress::after_spilling:
fprintf(output, "After Spilling:\n");
flags |= print_kill;
break;
case CompilationProgress::after_ra:
fprintf(output, "After RA:\n");
break;
case CompilationProgress::after_ra: fprintf(output, "After RA:\n"); break;
}
print_stage(program->stage, output);
@ -965,9 +901,10 @@ void aco_print_program(const Program *program, FILE *output, const live& live_va
fprintf(output, "\n");
}
void aco_print_program(const Program *program, FILE *output, unsigned flags)
void
aco_print_program(const Program* program, FILE* output, unsigned flags)
{
aco_print_program(program, output, live(), flags);
}
}
} // namespace aco

View File

@ -36,7 +36,8 @@
namespace aco {
void setup_reduce_temp(Program* program)
void
setup_reduce_temp(Program* program)
{
unsigned last_top_level_block_idx = 0;
unsigned maxSize = 0;
@ -69,7 +70,8 @@ void setup_reduce_temp(Program* program)
if (reduceTmp_in_loop && block.loop_nest_depth == 0) {
assert(inserted_at == (int)last_top_level_block_idx);
aco_ptr<Instruction> end{create_instruction<Instruction>(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_in_loop ? 2 : 1, 0)};
aco_ptr<Instruction> end{create_instruction<Instruction>(
aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_in_loop ? 2 : 1, 0)};
end->operands[0] = Operand(reduceTmp);
if (vtmp_in_loop)
end->operands[1] = Operand(vtmp);
@ -89,7 +91,7 @@ void setup_reduce_temp(Program* program)
std::vector<aco_ptr<Instruction>>::iterator it;
for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
Instruction *instr = (*it).get();
Instruction* instr = (*it).get();
if (instr->format != Format::PSEUDO_REDUCTION)
continue;
@ -98,7 +100,8 @@ void setup_reduce_temp(Program* program)
if ((int)last_top_level_block_idx != inserted_at) {
reduceTmp = program->allocateTmp(reduceTmp.regClass());
aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(
aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
create->definitions[0] = Definition(reduceTmp);
/* find the right place to insert this definition */
if (last_top_level_block_idx == block.index) {
@ -110,18 +113,19 @@ void setup_reduce_temp(Program* program)
} else {
assert(last_top_level_block_idx < block.index);
/* insert before the branch at last top level block */
std::vector<aco_ptr<Instruction>>& instructions = program->blocks[last_top_level_block_idx].instructions;
instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create));
std::vector<aco_ptr<Instruction>>& instructions =
program->blocks[last_top_level_block_idx].instructions;
instructions.insert(std::next(instructions.begin(), instructions.size() - 1),
std::move(create));
inserted_at = last_top_level_block_idx;
}
}
/* same as before, except for the vector temporary instead of the reduce temporary */
unsigned cluster_size = instr->reduction().cluster_size;
bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 ||
op == fmin64 || op == fmax64 || op == umin64 ||
op == umax64 || op == imin64 || op == imax64 ||
op == imul64;
bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 ||
op == fmax64 || op == umin64 || op == umax64 || op == imin64 ||
op == imax64 || op == imul64;
bool gfx10_need_vtmp = op == imul8 || op == imax8 || op == imin8 || op == umin8 ||
op == imul16 || op == imax16 || op == imin16 || op == umin16 ||
op == iadd64;
@ -138,15 +142,18 @@ void setup_reduce_temp(Program* program)
vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0;
if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
vtmp = program->allocateTmp(vtmp.regClass());
aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(
aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
create->definitions[0] = Definition(vtmp);
if (last_top_level_block_idx == block.index) {
it = block.instructions.insert(it, std::move(create));
it++;
} else {
assert(last_top_level_block_idx < block.index);
std::vector<aco_ptr<Instruction>>& instructions = program->blocks[last_top_level_block_idx].instructions;
instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create));
std::vector<aco_ptr<Instruction>>& instructions =
program->blocks[last_top_level_block_idx].instructions;
instructions.insert(std::next(instructions.begin(), instructions.size() - 1),
std::move(create));
vtmp_inserted_at = last_top_level_block_idx;
}
}
@ -158,5 +165,4 @@ void setup_reduce_temp(Program* program)
}
}
};
}; // namespace aco

File diff suppressed because it is too large

View File

@ -34,8 +34,8 @@ struct idx_ctx {
std::vector<uint32_t> renames;
};
inline
void reindex_defs(idx_ctx& ctx, aco_ptr<Instruction>& instr)
inline void
reindex_defs(idx_ctx& ctx, aco_ptr<Instruction>& instr)
{
for (Definition& def : instr->definitions) {
if (!def.isTemp())
@ -48,8 +48,8 @@ void reindex_defs(idx_ctx& ctx, aco_ptr<Instruction>& instr)
}
}
inline
void reindex_ops(idx_ctx& ctx, aco_ptr<Instruction>& instr)
inline void
reindex_ops(idx_ctx& ctx, aco_ptr<Instruction>& instr)
{
for (Operand& op : instr->operands) {
if (!op.isTemp())
@ -60,7 +60,8 @@ void reindex_ops(idx_ctx& ctx, aco_ptr<Instruction>& instr)
}
}
void reindex_program(idx_ctx& ctx, Program* program)
void
reindex_program(idx_ctx& ctx, Program* program)
{
ctx.renames.resize(program->peekAllocationId());
@ -88,12 +89,13 @@ void reindex_program(idx_ctx& ctx, Program* program)
/* update program members */
program->private_segment_buffer = Temp(ctx.renames[program->private_segment_buffer.id()],
program->private_segment_buffer.regClass());
program->scratch_offset = Temp(ctx.renames[program->scratch_offset.id()],
program->scratch_offset.regClass());
program->scratch_offset =
Temp(ctx.renames[program->scratch_offset.id()], program->scratch_offset.regClass());
program->temp_rc = ctx.temp_rc;
}
void update_live_out(idx_ctx& ctx, std::vector<IDSet>& live_out)
void
update_live_out(idx_ctx& ctx, std::vector<IDSet>& live_out)
{
for (IDSet& set : live_out) {
IDSet new_set;
@ -105,7 +107,8 @@ void update_live_out(idx_ctx& ctx, std::vector<IDSet>& live_out)
} /* end namespace */
void reindex_ssa(Program* program)
void
reindex_ssa(Program* program)
{
idx_ctx ctx;
reindex_program(ctx, program);
@ -113,7 +116,8 @@ void reindex_ssa(Program* program)
program->allocationID = program->temp_rc.size();
}
void reindex_ssa(Program* program, std::vector<IDSet>& live_out)
void
reindex_ssa(Program* program, std::vector<IDSet>& live_out)
{
idx_ctx ctx;
reindex_program(ctx, program);
@ -122,4 +126,4 @@ void reindex_ssa(Program* program, std::vector<IDSet>& live_out)
program->allocationID = program->temp_rc.size();
}
}
} // namespace aco

View File

@ -34,11 +34,11 @@
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16)
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16)
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 8)
#define POS_EXP_MAX_MOVES 512
#define POS_EXP_MAX_MOVES 512
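/* Illustrative numbers (not from this commit): at num_waves == 8 the macros
 * above evaluate to SMEM_WINDOW_SIZE == 70, VMEM_WINDOW_SIZE == 512,
 * SMEM_MAX_MOVES == 32, VMEM_MAX_MOVES == 128 and
 * VMEM_CLAUSE_MAX_GRAB_DIST == 64. */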
namespace aco {
@ -54,7 +54,7 @@ enum MoveResult {
* or below a group of instruction that hardware can execute as a clause.
*/
struct DownwardsCursor {
int source_idx; /* Current instruction to consider for moving */
int source_idx; /* Current instruction to consider for moving */
int insert_idx_clause; /* First clause instruction */
int insert_idx; /* First instruction *after* the clause */
@ -66,11 +66,9 @@ struct DownwardsCursor {
RegisterDemand total_demand;
DownwardsCursor(int current_idx, RegisterDemand initial_clause_demand)
: source_idx(current_idx - 1),
insert_idx_clause(current_idx),
insert_idx(current_idx + 1),
clause_demand(initial_clause_demand) {
}
: source_idx(current_idx - 1), insert_idx_clause(current_idx), insert_idx(current_idx + 1),
clause_demand(initial_clause_demand)
{}
void verify_invariants(const RegisterDemand* register_demand);
};
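/* Illustrative sketch (not from this commit): a cursor constructed with
 * current_idx == i starts out as
 *   source_idx == i - 1  <  insert_idx_clause == i  <  insert_idx == i + 1,
 * which is exactly the ordering verify_invariants() asserts. */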
@ -91,18 +89,16 @@ struct UpwardsCursor {
insert_idx = -1; /* to be initialized later */
}
bool has_insert_idx() const {
return insert_idx != -1;
}
bool has_insert_idx() const { return insert_idx != -1; }
void verify_invariants(const RegisterDemand* register_demand);
};
struct MoveState {
RegisterDemand max_registers;
Block *block;
Instruction *current;
RegisterDemand *register_demand; /* demand per instruction */
Block* block;
Instruction* current;
RegisterDemand* register_demand; /* demand per instruction */
bool improved_rar;
std::vector<bool> depends_on;
@ -143,19 +139,22 @@ struct sched_ctx {
*/
template <typename T>
void move_element(T begin_it, size_t idx, size_t before) {
if (idx < before) {
auto begin = std::next(begin_it, idx);
auto end = std::next(begin_it, before);
std::rotate(begin, begin + 1, end);
} else if (idx > before) {
auto begin = std::next(begin_it, before);
auto end = std::next(begin_it, idx + 1);
std::rotate(begin, end - 1, end);
}
void
move_element(T begin_it, size_t idx, size_t before)
{
if (idx < before) {
auto begin = std::next(begin_it, idx);
auto end = std::next(begin_it, before);
std::rotate(begin, begin + 1, end);
} else if (idx > before) {
auto begin = std::next(begin_it, before);
auto end = std::next(begin_it, idx + 1);
std::rotate(begin, end - 1, end);
}
}
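/* Illustrative usage (not from this commit) of the rotate above: */
// std::vector<int> v {10, 11, 12, 13};
// move_element(v.begin(), 3, 1); /* element 3 moves before index 1 -> {10, 13, 11, 12} */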
void DownwardsCursor::verify_invariants(const RegisterDemand* register_demand)
void
DownwardsCursor::verify_invariants(const RegisterDemand* register_demand)
{
assert(source_idx < insert_idx_clause);
assert(insert_idx_clause < insert_idx);
@ -175,7 +174,8 @@ void DownwardsCursor::verify_invariants(const RegisterDemand* register_demand)
#endif
}
DownwardsCursor MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
DownwardsCursor
MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
{
improved_rar = improved_rar_;
@ -202,7 +202,8 @@ DownwardsCursor MoveState::downwards_init(int current_idx, bool improved_rar_, b
/* If add_to_clause is true, the current clause is extended by moving the
* instruction at source_idx in front of the clause. Otherwise, the instruction
* is moved past the end of the clause without extending it */
MoveResult MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause)
MoveResult
MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause)
{
aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
@ -211,7 +212,8 @@ MoveResult MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause
return move_fail_ssa;
/* check if one of candidate's operands is killed by depending instruction */
std::vector<bool>& RAR_deps = improved_rar ? (add_to_clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
std::vector<bool>& RAR_deps =
improved_rar ? (add_to_clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
for (const Operand& op : instr->operands) {
if (op.isTemp() && RAR_deps[op.tempId()]) {
// FIXME: account for difference in register pressure
@ -274,7 +276,8 @@ MoveResult MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause
return move_success;
}
void MoveState::downwards_skip(DownwardsCursor& cursor)
void
MoveState::downwards_skip(DownwardsCursor& cursor)
{
aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
@ -292,7 +295,9 @@ void MoveState::downwards_skip(DownwardsCursor& cursor)
cursor.verify_invariants(register_demand);
}
void UpwardsCursor::verify_invariants(const RegisterDemand* register_demand) {
void
UpwardsCursor::verify_invariants(const RegisterDemand* register_demand)
{
#ifndef NDEBUG
if (!has_insert_idx()) {
return;
@ -308,7 +313,8 @@ void UpwardsCursor::verify_invariants(const RegisterDemand* register_demand) {
#endif
}
UpwardsCursor MoveState::upwards_init(int source_idx, bool improved_rar_)
UpwardsCursor
MoveState::upwards_init(int source_idx, bool improved_rar_)
{
improved_rar = improved_rar_;
@ -323,7 +329,8 @@ UpwardsCursor MoveState::upwards_init(int source_idx, bool improved_rar_)
return UpwardsCursor(source_idx);
}
bool MoveState::upwards_check_deps(UpwardsCursor& cursor)
bool
MoveState::upwards_check_deps(UpwardsCursor& cursor)
{
aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
for (const Operand& op : instr->operands) {
@ -333,13 +340,15 @@ bool MoveState::upwards_check_deps(UpwardsCursor& cursor)
return true;
}
void MoveState::upwards_update_insert_idx(UpwardsCursor& cursor)
void
MoveState::upwards_update_insert_idx(UpwardsCursor& cursor)
{
cursor.insert_idx = cursor.source_idx;
cursor.total_demand = register_demand[cursor.insert_idx];
}
MoveResult MoveState::upwards_move(UpwardsCursor& cursor)
MoveResult
MoveState::upwards_move(UpwardsCursor& cursor)
{
assert(cursor.has_insert_idx());
@ -355,13 +364,15 @@ MoveResult MoveState::upwards_move(UpwardsCursor& cursor)
return move_fail_rar;
}
/* check if register pressure is low enough: the diff is negative if register pressure is decreased */
/* check if register pressure is low enough: the diff is negative if register pressure is
* decreased */
const RegisterDemand candidate_diff = get_live_changes(instr);
const RegisterDemand temp = get_temp_registers(instr);
if (RegisterDemand(cursor.total_demand + candidate_diff).exceeds(max_registers))
return move_fail_pressure;
const RegisterDemand temp2 = get_temp_registers(block->instructions[cursor.insert_idx - 1]);
const RegisterDemand new_demand = register_demand[cursor.insert_idx - 1] - temp2 + candidate_diff + temp;
const RegisterDemand new_demand =
register_demand[cursor.insert_idx - 1] - temp2 + candidate_diff + temp;
if (new_demand.exceeds(max_registers))
return move_fail_pressure;
@ -385,7 +396,8 @@ MoveResult MoveState::upwards_move(UpwardsCursor& cursor)
return move_success;
}
void MoveState::upwards_skip(UpwardsCursor& cursor)
void
MoveState::upwards_skip(UpwardsCursor& cursor)
{
if (cursor.has_insert_idx()) {
aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
@ -405,30 +417,33 @@ void MoveState::upwards_skip(UpwardsCursor& cursor)
cursor.verify_invariants(register_demand);
}
bool is_gs_or_done_sendmsg(const Instruction *instr)
bool
is_gs_or_done_sendmsg(const Instruction* instr)
{
if (instr->opcode == aco_opcode::s_sendmsg) {
uint16_t imm = instr->sopp().imm;
return (imm & sendmsg_id_mask) == _sendmsg_gs ||
(imm & sendmsg_id_mask) == _sendmsg_gs_done;
return (imm & sendmsg_id_mask) == _sendmsg_gs || (imm & sendmsg_id_mask) == _sendmsg_gs_done;
}
return false;
}
bool is_done_sendmsg(const Instruction *instr)
bool
is_done_sendmsg(const Instruction* instr)
{
if (instr->opcode == aco_opcode::s_sendmsg)
return (instr->sopp().imm & sendmsg_id_mask) == _sendmsg_gs_done;
return false;
}
memory_sync_info get_sync_info_with_hack(const Instruction* instr)
memory_sync_info
get_sync_info_with_hack(const Instruction* instr)
{
memory_sync_info sync = get_sync_info(instr);
if (instr->isSMEM() && !instr->operands.empty() && instr->operands[0].bytes() == 16) {
// FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works
sync.storage = (storage_class)(sync.storage | storage_buffer);
sync.semantics = (memory_semantics)((sync.semantics | semantic_private) & ~semantic_can_reorder);
sync.semantics =
(memory_semantics)((sync.semantics | semantic_private) & ~semantic_can_reorder);
}
return sync;
}
@ -451,11 +466,13 @@ struct hazard_query {
bool contains_sendmsg;
bool uses_exec;
memory_event_set mem_events;
unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
};
void init_hazard_query(hazard_query *query) {
void
init_hazard_query(hazard_query* query)
{
query->contains_spill = false;
query->contains_sendmsg = false;
query->uses_exec = false;
@ -464,7 +481,8 @@ void init_hazard_query(hazard_query *query) {
query->aliasing_storage_smem = 0;
}
void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync)
void
add_memory_event(memory_event_set* set, Instruction* instr, memory_sync_info* sync)
{
set->has_control_barrier |= is_done_sendmsg(instr);
if (instr->opcode == aco_opcode::p_barrier) {
@ -494,7 +512,8 @@ void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_inf
}
}
void add_to_hazard_query(hazard_query *query, Instruction *instr)
void
add_to_hazard_query(hazard_query* query, Instruction* instr)
{
if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
query->contains_spill = true;
@ -507,7 +526,8 @@ void add_to_hazard_query(hazard_query *query, Instruction *instr)
if (!(sync.semantics & semantic_can_reorder)) {
unsigned storage = sync.storage;
/* images and buffer/global memory can alias */ //TODO: more precisely, buffer images and buffer/global memory can alias
/* images and buffer/global memory can alias */ // TODO: more precisely, buffer images and
// buffer/global memory can alias
if (storage & (storage_buffer | storage_image))
storage |= storage_buffer | storage_image;
if (instr->isSMEM())
@ -531,7 +551,8 @@ enum HazardResult {
hazard_fail_unreorderable,
};
HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards)
HazardResult
perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards)
{
/* don't schedule discards downwards */
if (!upwards && instr->opcode == aco_opcode::p_exit_early_if)
@ -549,10 +570,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
return hazard_fail_export;
/* don't move non-reorderable instructions */
if (instr->opcode == aco_opcode::s_memtime ||
instr->opcode == aco_opcode::s_memrealtime ||
instr->opcode == aco_opcode::s_setprio ||
instr->opcode == aco_opcode::s_getreg_b32)
if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime ||
instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32)
return hazard_fail_unreorderable;
memory_event_set instr_set;
@ -560,8 +579,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
memory_sync_info sync = get_sync_info_with_hack(instr);
add_memory_event(&instr_set, instr, &sync);
memory_event_set *first = &instr_set;
memory_event_set *second = &query->mem_events;
memory_event_set* first = &instr_set;
memory_event_set* second = &query->mem_events;
if (upwards)
std::swap(first, second);
@ -571,7 +590,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire)
return hazard_fail_barrier;
if (((first->access_acquire || first->bar_acquire) && second->bar_classes) ||
((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic)))
((first->access_acquire | first->bar_acquire) &
(second->access_relaxed | second->access_atomic)))
return hazard_fail_barrier;
/* everything before barrier(release) happens before the atomics/control_barriers after *
@ -580,7 +600,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
if (first->bar_release && (second->has_control_barrier || second->access_atomic))
return hazard_fail_barrier;
if ((first->bar_classes && (second->bar_release || second->access_release)) ||
((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release)))
((first->access_relaxed | first->access_atomic) &
(second->bar_release | second->access_release)))
return hazard_fail_barrier;
/* don't move memory barriers around other memory barriers */
@ -589,14 +610,15 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
/* Don't move memory accesses to before control barriers. I don't think
* this is necessary for the Vulkan memory model, but it might be for GLSL450. */
unsigned control_classes = storage_buffer | storage_atomic_counter | storage_image | storage_shared;
if (first->has_control_barrier && ((second->access_atomic | second->access_relaxed) & control_classes))
unsigned control_classes =
storage_buffer | storage_atomic_counter | storage_image | storage_shared;
if (first->has_control_barrier &&
((second->access_atomic | second->access_relaxed) & control_classes))
return hazard_fail_barrier;
/* don't move memory loads/stores past potentially aliasing loads/stores */
unsigned aliasing_storage = instr->isSMEM() ?
query->aliasing_storage_smem :
query->aliasing_storage;
unsigned aliasing_storage =
instr->isSMEM() ? query->aliasing_storage_smem : query->aliasing_storage;
if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) {
unsigned intersect = sync.storage & aliasing_storage;
if (intersect & storage_shared)
@ -614,9 +636,9 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
return hazard_success;
}
void schedule_SMEM(sched_ctx& ctx, Block* block,
std::vector<RegisterDemand>& register_demand,
Instruction* current, int idx)
void
schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
Instruction* current, int idx)
{
assert(idx != 0);
int window_size = SMEM_WINDOW_SIZE;
@ -634,30 +656,37 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
DownwardsCursor cursor = ctx.mv.downwards_init(idx, false, false);
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
candidate_idx--) {
assert(candidate_idx >= 0);
assert(candidate_idx == cursor.source_idx);
aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
/* break if we'd make the previous SMEM instruction stall */
bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
bool can_stall_prev_smem =
idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
break;
/* break when encountering another MEM instruction, logical_start or barriers */
if (candidate->opcode == aco_opcode::p_logical_start)
break;
/* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves to help create more vmem clauses */
if (candidate->isVMEM() && (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) || current->operands[0].size() == 4))
/* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves
* to help create more vmem clauses */
if (candidate->isVMEM() && (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
current->operands[0].size() == 4))
break;
/* don't move descriptor loads below buffer loads */
if (candidate->format == Format::SMEM && current->operands[0].size() == 4 && candidate->operands[0].size() == 2)
if (candidate->format == Format::SMEM && current->operands[0].size() == 4 &&
candidate->operands[0].size() == 2)
break;
bool can_move_down = true;
HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export)
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
haz == hazard_fail_export)
can_move_down = false;
else if (haz != hazard_success)
break;
@ -689,9 +718,10 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
bool found_dependency = false;
/* second, check if we have instructions after current to move up */
for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size;
candidate_idx++) {
assert(candidate_idx == up_cursor.source_idx);
assert(candidate_idx < (int) block->instructions.size());
assert(candidate_idx < (int)block->instructions.size());
aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
if (candidate->opcode == aco_opcode::p_logical_end)
@ -748,9 +778,9 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}
void schedule_VMEM(sched_ctx& ctx, Block* block,
std::vector<RegisterDemand>& register_demand,
Instruction* current, int idx)
void
schedule_VMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
Instruction* current, int idx)
{
assert(idx != 0);
int window_size = VMEM_WINDOW_SIZE;
@ -767,7 +797,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true);
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
candidate_idx--) {
assert(candidate_idx == cursor.source_idx);
assert(candidate_idx >= 0);
aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
@ -778,7 +809,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
break;
/* break if we'd make the previous SMEM instruction stall */
bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
bool can_stall_prev_smem =
idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
break;
@ -787,14 +819,15 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
int grab_dist = cursor.insert_idx_clause - candidate_idx;
/* We can't easily tell how much this will decrease the def-to-use
* distances, so just use how far it will be moved as a heuristic. */
part_of_clause = grab_dist < clause_max_grab_dist &&
should_form_clause(current, candidate.get());
part_of_clause =
grab_dist < clause_max_grab_dist && should_form_clause(current, candidate.get());
}
/* if current depends on candidate, add additional dependencies and continue */
bool can_move_down = !is_vmem || part_of_clause;
HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
HazardResult haz =
perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
haz == hazard_fail_export)
@ -809,7 +842,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
continue;
}
Instruction *candidate_ptr = candidate.get();
Instruction* candidate_ptr = candidate.get();
MoveResult res = ctx.mv.downwards_move(cursor, part_of_clause);
if (res == move_fail_ssa || res == move_fail_rar) {
add_to_hazard_query(&indep_hq, candidate.get());
@ -832,9 +865,10 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
bool found_dependency = false;
/* second, check if we have instructions after current to move up */
for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size;
candidate_idx++) {
assert(candidate_idx == up_cursor.source_idx);
assert(candidate_idx < (int) block->instructions.size());
assert(candidate_idx < (int)block->instructions.size());
aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
bool is_vmem = candidate->isVMEM() || candidate->isFlatLike();
@ -889,9 +923,9 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
}
}
void schedule_position_export(sched_ctx& ctx, Block* block,
std::vector<RegisterDemand>& register_demand,
Instruction* current, int idx)
void
schedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
Instruction* current, int idx)
{
assert(idx != 0);
int window_size = POS_EXP_WINDOW_SIZE;
@ -904,7 +938,8 @@ void schedule_position_export(sched_ctx& ctx, Block* block,
init_hazard_query(&hq);
add_to_hazard_query(&hq, current);
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
candidate_idx--) {
assert(candidate_idx >= 0);
aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
@ -935,7 +970,8 @@ void schedule_position_export(sched_ctx& ctx, Block* block,
}
}
void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
void
schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
{
ctx.last_SMEM_dep_idx = 0;
ctx.last_SMEM_stall = INT16_MIN;
@ -950,7 +986,8 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v
unsigned target = current->exp().dest;
if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) {
ctx.mv.current = current;
schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
schedule_position_export(ctx, block, live_vars.register_demand[block->index], current,
idx);
}
}
@ -975,8 +1012,8 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v
}
}
void schedule_program(Program *program, live& live_vars)
void
schedule_program(Program* program, live& live_vars)
{
/* don't use program->max_reg_demand because that is affected by max_waves_per_simd */
RegisterDemand demand;
@ -991,7 +1028,7 @@ void schedule_program(Program *program, live& live_vars)
/* Allowing the scheduler to reduce the number of waves to as low as 5
* improves performance of Thrones of Britannia significantly and doesn't
* seem to hurt anything else. */
//TODO: account for possible uneven num_waves on GFX10+
// TODO: account for possible uneven num_waves on GFX10+
unsigned wave_fac = program->dev.physical_vgprs / 256;
if (program->num_waves <= 5 * wave_fac)
ctx.num_waves = program->num_waves;
@ -1008,8 +1045,8 @@ void schedule_program(Program *program, live& live_vars)
ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1);
assert(ctx.num_waves > 0);
ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};
ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};
for (Block& block : program->blocks)
schedule_block(ctx, program, &block, live_vars);
@ -1021,8 +1058,8 @@ void schedule_program(Program *program, live& live_vars)
}
update_vgpr_sgpr_demand(program, new_demand);
/* if enabled, this code asserts that register_demand is updated correctly */
#if 0
/* if enabled, this code asserts that register_demand is updated correctly */
#if 0
int prev_num_waves = program->num_waves;
const RegisterDemand prev_max_demand = program->max_reg_demand;
@ -1042,7 +1079,7 @@ void schedule_program(Program *program, live& live_vars)
assert(program->max_reg_demand == prev_max_demand);
assert(program->num_waves == prev_num_waves);
#endif
#endif
}
}
} // namespace aco

File diff suppressed because it is too large

View File

@ -37,7 +37,8 @@ struct phi_info_item {
};
struct ssa_elimination_ctx {
/* The outer vectors should be indexed by block index. The inner vectors store phi information for each block. */
/* The outer vectors should be indexed by block index. The inner vectors store phi information
* for each block. */
std::vector<std::vector<phi_info_item>> logical_phi_info;
std::vector<std::vector<phi_info_item>> linear_phi_info;
std::vector<bool> empty_blocks;
@ -45,14 +46,14 @@ struct ssa_elimination_ctx {
Program* program;
ssa_elimination_ctx(Program* program_)
: logical_phi_info(program_->blocks.size())
, linear_phi_info(program_->blocks.size())
, empty_blocks(program_->blocks.size(), true)
, blocks_incoming_exec_used(program_->blocks.size(), true)
, program(program_) {}
: logical_phi_info(program_->blocks.size()), linear_phi_info(program_->blocks.size()),
empty_blocks(program_->blocks.size(), true),
blocks_incoming_exec_used(program_->blocks.size(), true), program(program_)
{}
};
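/* Illustrative note (not from this commit): logical_phi_info[p] collects, for
 * predecessor block p, the (definition, operand) pairs that the parallelcopy
 * inserted at the end of p must write to materialize its successors' phis. */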
void collect_phi_info(ssa_elimination_ctx& ctx)
void
collect_phi_info(ssa_elimination_ctx& ctx)
{
for (Block& block : ctx.program->blocks) {
for (aco_ptr<Instruction>& phi : block.instructions) {
@ -67,9 +68,11 @@ void collect_phi_info(ssa_elimination_ctx& ctx)
assert(phi->definitions[0].size() == phi->operands[i].size());
std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
std::vector<unsigned>& preds =
phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
uint32_t pred_idx = preds[i];
auto& info_vec = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info[pred_idx] : ctx.linear_phi_info[pred_idx];
auto& info_vec = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info[pred_idx]
: ctx.linear_phi_info[pred_idx];
info_vec.push_back({phi->definitions[0], phi->operands[i]});
ctx.empty_blocks[pred_idx] = false;
}
@ -77,11 +80,12 @@ void collect_phi_info(ssa_elimination_ctx& ctx)
}
}
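/* Worked example (hypothetical IR, for illustration): given
 *    BB3: %a = p_phi %x (from logical pred BB1), %y (from logical pred BB2)
 * the loop above appends {def: %a, op: %x} to ctx.logical_phi_info[1] and
 * {def: %a, op: %y} to ctx.logical_phi_info[2], and clears
 * ctx.empty_blocks[1] and ctx.empty_blocks[2]; insert_parallelcopies() below
 * then materializes each recorded entry as a p_parallelcopy in that
 * predecessor. */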
void insert_parallelcopies(ssa_elimination_ctx& ctx)
void
insert_parallelcopies(ssa_elimination_ctx& ctx)
{
/* insert the parallelcopies from logical phis before p_logical_end */
for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {
auto &logical_phi_info = ctx.logical_phi_info[block_idx];
auto& logical_phi_info = ctx.logical_phi_info[block_idx];
if (logical_phi_info.empty())
continue;
@ -93,10 +97,11 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx)
}
std::vector<aco_ptr<Instruction>>::iterator it = std::next(block.instructions.begin(), idx);
aco_ptr<Pseudo_instruction> pc{create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, logical_phi_info.size(), logical_phi_info.size())};
aco_ptr<Pseudo_instruction> pc{
create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO,
logical_phi_info.size(), logical_phi_info.size())};
unsigned i = 0;
for (auto& phi_info : logical_phi_info)
{
for (auto& phi_info : logical_phi_info) {
pc->definitions[i] = phi_info.def;
pc->operands[i] = phi_info.op;
i++;
@ -108,7 +113,7 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx)
/* insert parallelcopies for the linear phis at the end of blocks just before the branch */
for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {
auto &linear_phi_info = ctx.linear_phi_info[block_idx];
auto& linear_phi_info = ctx.linear_phi_info[block_idx];
if (linear_phi_info.empty())
continue;
@ -116,10 +121,11 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx)
std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();
--it;
assert((*it)->isBranch());
aco_ptr<Pseudo_instruction> pc{create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, linear_phi_info.size(), linear_phi_info.size())};
aco_ptr<Pseudo_instruction> pc{
create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO,
linear_phi_info.size(), linear_phi_info.size())};
unsigned i = 0;
for (auto& phi_info : linear_phi_info)
{
for (auto& phi_info : linear_phi_info) {
pc->definitions[i] = phi_info.def;
pc->operands[i] = phi_info.op;
i++;
@ -130,38 +136,38 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx)
}
}
bool is_empty_block(Block* block, bool ignore_exec_writes)
bool
is_empty_block(Block* block, bool ignore_exec_writes)
{
/* check if this block is empty and the exec mask is not needed */
for (aco_ptr<Instruction>& instr : block->instructions) {
switch (instr->opcode) {
case aco_opcode::p_linear_phi:
case aco_opcode::p_phi:
case aco_opcode::p_logical_start:
case aco_opcode::p_logical_end:
case aco_opcode::p_branch:
case aco_opcode::p_linear_phi:
case aco_opcode::p_phi:
case aco_opcode::p_logical_start:
case aco_opcode::p_logical_end:
case aco_opcode::p_branch: break;
case aco_opcode::p_parallelcopy:
for (unsigned i = 0; i < instr->definitions.size(); i++) {
if (ignore_exec_writes && instr->definitions[i].physReg() == exec)
continue;
if (instr->definitions[i].physReg() != instr->operands[i].physReg())
return false;
}
break;
case aco_opcode::s_andn2_b64:
case aco_opcode::s_andn2_b32:
if (ignore_exec_writes && instr->definitions[0].physReg() == exec)
break;
case aco_opcode::p_parallelcopy:
for (unsigned i = 0; i < instr->definitions.size(); i++) {
if (ignore_exec_writes && instr->definitions[i].physReg() == exec)
continue;
if (instr->definitions[i].physReg() != instr->operands[i].physReg())
return false;
}
break;
case aco_opcode::s_andn2_b64:
case aco_opcode::s_andn2_b32:
if (ignore_exec_writes && instr->definitions[0].physReg() == exec)
break;
return false;
default:
return false;
return false;
default: return false;
}
}
return true;
}
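/* Example (assumed IR): with ignore_exec_writes=true, a block containing only
 *    p_logical_start ; p_logical_end ;
 *    s_andn2_b64 exec, ... ; p_parallelcopy v0, v0 ; p_branch
 * counts as empty: phis, logical markers, branches, register self-copies and
 * exec-only writes have no observable effect once the block is bypassed. Any
 * other instruction, or a parallelcopy that actually moves a register, makes
 * the block non-removable. */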
void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block)
void
try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block)
{
/* check if the successor is another merge block which restores exec */
// TODO: divergent loops also restore exec
@ -179,7 +185,8 @@ void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block)
block->instructions.emplace_back(std::move(branch));
}
void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
void
try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
{
assert(block->linear_succs.size() == 2);
/* only remove this block if the successor got removed as well */
@ -193,7 +200,7 @@ void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
unsigned succ_idx = block->linear_succs[0];
assert(block->linear_preds.size() == 2);
for (unsigned i = 0; i < 2; i++) {
Block *pred = &ctx.program->blocks[block->linear_preds[i]];
Block* pred = &ctx.program->blocks[block->linear_preds[i]];
pred->linear_succs[0] = succ_idx;
ctx.program->blocks[succ_idx].linear_preds[i] = pred->index;
@ -208,7 +215,8 @@ void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
block->linear_succs.clear();
}
void try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)
void
try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)
{
if (!is_empty_block(block, false))
return;
@ -277,7 +285,8 @@ void try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)
block->linear_succs.clear();
}
bool instr_writes_exec(Instruction* instr)
bool
instr_writes_exec(Instruction* instr)
{
for (Definition& def : instr->definitions)
if (def.physReg() == exec || def.physReg() == exec_hi)
@ -286,7 +295,8 @@ bool instr_writes_exec(Instruction* instr)
return false;
}
void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block)
void
eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block)
{
/* Check if any successor needs the outgoing exec mask from the current block. */
@ -309,8 +319,9 @@ void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& blo
exec_write_used = false;
else
/* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */
exec_write_used = std::any_of(block.linear_succs.begin(), block.linear_succs.end(),
[&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; });
exec_write_used =
std::any_of(block.linear_succs.begin(), block.linear_succs.end(),
[&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; });
}
/* Go through all instructions and eliminate useless exec writes. */
@ -318,7 +329,8 @@ void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& blo
for (int i = block.instructions.size() - 1; i >= 0; --i) {
aco_ptr<Instruction>& instr = block.instructions[i];
/* We already take information from phis into account before the loop, so let's just break on phis. */
/* We already take information from phis into account before the loop, so let's just break on
* phis. */
if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi)
break;
@ -341,16 +353,15 @@ void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& blo
}
/* Remember if the current block needs an incoming exec mask from its predecessors. */
ctx.blocks_incoming_exec_used[block.index] = exec_write_used;
/* Cleanup: remove deleted instructions from the vector. */
auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr);
block.instructions.resize(new_end - block.instructions.begin());
}
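/* The loop above is dead-store elimination over a single location (exec),
 * scanning backwards. A self-contained sketch of the same idea on a
 * simplified instruction type (hypothetical, not ACO's classes): */
#include <vector>

struct SimpleInstr {
   bool reads_exec;
   bool writes_exec;
   bool dead = false;
};

/* Returns whether the block still needs its incoming exec value. */
static bool
sketch_eliminate_exec_writes(std::vector<SimpleInstr>& instrs, bool succs_use_exec)
{
   bool exec_write_used = succs_use_exec;
   for (int i = (int)instrs.size() - 1; i >= 0; --i) {
      SimpleInstr& instr = instrs[i];
      if (instr.writes_exec) {
         if (!exec_write_used)
            instr.dead = true; /* no read below: this definition is useless */
         else
            exec_write_used = false; /* all later reads are satisfied here */
      }
      if (instr.reads_exec && !instr.dead)
         exec_write_used = true; /* some earlier definition must provide exec */
   }
   return exec_write_used;
}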
void jump_threading(ssa_elimination_ctx& ctx)
void
jump_threading(ssa_elimination_ctx& ctx)
{
for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {
Block* block = &ctx.program->blocks[i];
@ -367,8 +378,7 @@ void jump_threading(ssa_elimination_ctx& ctx)
if (block->linear_succs.size() > 1)
continue;
if (block->kind & block_kind_merge ||
block->kind & block_kind_loop_exit)
if (block->kind & block_kind_merge || block->kind & block_kind_loop_exit)
try_remove_merge_block(ctx, block);
if (block->linear_preds.size() == 1)
@ -378,8 +388,8 @@ void jump_threading(ssa_elimination_ctx& ctx)
} /* end namespace */
void ssa_elimination(Program* program)
void
ssa_elimination(Program* program)
{
ssa_elimination_ctx ctx(program);
@ -391,6 +401,5 @@ void ssa_elimination(Program* program)
/* insert parallelcopies from SSA elimination */
insert_parallelcopies(ctx);
}
}
} // namespace aco


@ -23,6 +23,7 @@
*/
#include "aco_ir.h"
#include "util/crc32.h"
#include <algorithm>
@ -33,7 +34,8 @@
namespace aco {
/* sgpr_presched/vgpr_presched */
void collect_presched_stats(Program *program)
void
collect_presched_stats(Program* program)
{
RegisterDemand presched_demand;
for (Block& block : program->blocks)
@ -56,9 +58,9 @@ public:
resource_count,
};
BlockCycleEstimator(Program *program_) : program(program_) {}
BlockCycleEstimator(Program* program_) : program(program_) {}
Program *program;
Program* program;
int32_t cur_cycle = 0;
int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0};
@ -72,6 +74,7 @@ public:
unsigned predict_cost(aco_ptr<Instruction>& instr);
void add(aco_ptr<Instruction>& instr);
void join(const BlockCycleEstimator& other);
private:
unsigned get_waitcnt_cost(wait_imm imm);
unsigned get_dependency_cost(aco_ptr<Instruction>& instr);
@ -81,8 +84,9 @@ private:
};
struct wait_counter_info {
wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_) :
vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {}
wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_)
: vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
{}
unsigned vm;
unsigned exp;
@ -100,107 +104,83 @@ struct perf_info {
unsigned cost1;
};
static perf_info get_perf_info(Program *program, aco_ptr<Instruction>& instr)
static perf_info
get_perf_info(Program* program, aco_ptr<Instruction>& instr)
{
instr_class cls = instr_info.classes[(int)instr->opcode];
#define WAIT(res) BlockCycleEstimator::res, 0
#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt
#define WAIT(res) BlockCycleEstimator::res, 0
#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt
if (program->chip_class >= GFX10) {
/* fp64 might be incorrect */
switch (cls) {
case instr_class::valu32:
case instr_class::valu_convert32:
case instr_class::valu_fma:
return {5, WAIT_USE(valu, 1)};
case instr_class::valu64:
return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};
case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)};
case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};
case instr_class::valu_quarter_rate32:
return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)};
case instr_class::valu_transcendental32:
return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)};
case instr_class::valu_double:
return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
case instr_class::valu_double_add:
return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
case instr_class::valu_double_convert:
return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
case instr_class::valu_double_transcendental:
return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
case instr_class::salu:
return {2, WAIT_USE(scalar, 1)};
case instr_class::smem:
return {0, WAIT_USE(scalar, 1)};
case instr_class::salu: return {2, WAIT_USE(scalar, 1)};
case instr_class::smem: return {0, WAIT_USE(scalar, 1)};
case instr_class::branch:
case instr_class::sendmsg:
return {0, WAIT_USE(branch_sendmsg, 1)};
case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 1)};
case instr_class::ds:
return instr->ds().gds ?
perf_info{0, WAIT_USE(export_gds, 1)} :
perf_info{0, WAIT_USE(lds, 1)};
case instr_class::exp:
return {0, WAIT_USE(export_gds, 1)};
case instr_class::vmem:
return {0, WAIT_USE(vmem, 1)};
return instr->ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)}
: perf_info{0, WAIT_USE(lds, 1)};
case instr_class::exp: return {0, WAIT_USE(export_gds, 1)};
case instr_class::vmem: return {0, WAIT_USE(vmem, 1)};
case instr_class::barrier:
case instr_class::waitcnt:
case instr_class::other:
default:
return {0};
default: return {0};
}
} else {
switch (cls) {
case instr_class::valu32:
return {4, WAIT_USE(valu, 4)};
case instr_class::valu_convert32:
return {16, WAIT_USE(valu, 16)};
case instr_class::valu64:
return {8, WAIT_USE(valu, 8)};
case instr_class::valu_quarter_rate32:
return {16, WAIT_USE(valu, 16)};
case instr_class::valu32: return {4, WAIT_USE(valu, 4)};
case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)};
case instr_class::valu64: return {8, WAIT_USE(valu, 8)};
case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)};
case instr_class::valu_fma:
return program->dev.has_fast_fma32 ?
perf_info{4, WAIT_USE(valu, 4)} :
perf_info{16, WAIT_USE(valu, 16)};
case instr_class::valu_transcendental32:
return {16, WAIT_USE(valu, 16)};
case instr_class::valu_double:
return {64, WAIT_USE(valu, 64)};
case instr_class::valu_double_add:
return {32, WAIT_USE(valu, 32)};
case instr_class::valu_double_convert:
return {16, WAIT_USE(valu, 16)};
case instr_class::valu_double_transcendental:
return {64, WAIT_USE(valu, 64)};
case instr_class::salu:
return {4, WAIT_USE(scalar, 4)};
case instr_class::smem:
return {4, WAIT_USE(scalar, 4)};
return program->dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)}
: perf_info{16, WAIT_USE(valu, 16)};
case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)};
case instr_class::valu_double: return {64, WAIT_USE(valu, 64)};
case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)};
case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)};
case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)};
case instr_class::salu: return {4, WAIT_USE(scalar, 4)};
case instr_class::smem: return {4, WAIT_USE(scalar, 4)};
case instr_class::branch:
return {8, WAIT_USE(branch_sendmsg, 8)};
return {4, WAIT_USE(branch_sendmsg, 4)};
case instr_class::ds:
return instr->ds().gds ?
perf_info{4, WAIT_USE(export_gds, 4)} :
perf_info{4, WAIT_USE(lds, 4)};
case instr_class::exp:
return {16, WAIT_USE(export_gds, 16)};
case instr_class::vmem:
return {4, WAIT_USE(vmem, 4)};
return instr->ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)}
: perf_info{4, WAIT_USE(lds, 4)};
case instr_class::exp: return {16, WAIT_USE(export_gds, 16)};
case instr_class::vmem: return {4, WAIT_USE(vmem, 4)};
case instr_class::barrier:
case instr_class::waitcnt:
case instr_class::other:
default:
return {4};
default: return {4};
}
}
#undef WAIT_USE
#undef WAIT
#undef WAIT_USE
#undef WAIT
}
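/* Reading the table above (GFX10 branch, and assuming v_rcp_f32 carries the
 * usual valu_transcendental32 classification): get_perf_info() returns
 * {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)}, i.e. the result has
 * ~10 cycles of latency, and while the plain VALU issue slot is held for
 * only 1 cycle, the complex pipe stays busy for 4, limiting back-to-back
 * transcendentals to one every 4 cycles. */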
void BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
void
BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
{
perf_info perf = get_perf_info(program, instr);
@ -215,7 +195,8 @@ void BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
}
}
int32_t BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
int32_t
BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
{
perf_info perf = get_perf_info(program, instr);
@ -228,7 +209,8 @@ int32_t BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& in
return cost;
}
static wait_counter_info get_wait_counter_info(aco_ptr<Instruction>& instr)
static wait_counter_info
get_wait_counter_info(aco_ptr<Instruction>& instr)
{
/* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance
* depends a lot on the situation. */
@ -252,8 +234,8 @@ static wait_counter_info get_wait_counter_info(aco_ptr<Instruction>& instr)
bool likely_desc_load = instr->operands[0].size() == 2;
bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
bool const_offset = instr->operands[1].isConstant() &&
(!soe || instr->operands.back().isConstant());
bool const_offset =
instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant());
if (likely_desc_load || const_offset)
return wait_counter_info(0, 0, 30, 0); /* likely to hit L0 cache */
@ -273,7 +255,8 @@ static wait_counter_info get_wait_counter_info(aco_ptr<Instruction>& instr)
return wait_counter_info(0, 0, 0, 0);
}
static wait_imm get_wait_imm(Program *program, aco_ptr<Instruction>& instr)
static wait_imm
get_wait_imm(Program* program, aco_ptr<Instruction>& instr)
{
if (instr->opcode == aco_opcode::s_endpgm) {
return wait_imm(0, 0, 0, 0);
@ -297,7 +280,8 @@ static wait_imm get_wait_imm(Program *program, aco_ptr<Instruction>& instr)
}
}
unsigned BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
unsigned
BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
{
int deps_available = cur_cycle;
@ -337,13 +321,15 @@ unsigned BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
return deps_available - cur_cycle;
}
unsigned BlockCycleEstimator::predict_cost(aco_ptr<Instruction>& instr)
unsigned
BlockCycleEstimator::predict_cost(aco_ptr<Instruction>& instr)
{
int32_t dep = get_dependency_cost(instr);
return dep + std::max(cycles_until_res_available(instr) - dep, 0);
}
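/* Worked example (made-up cycle counts): if an instruction's operands become
 * ready in dep = 6 cycles but its issue resource frees up after only 4, the
 * resource wait is hidden: 6 + max(4 - 6, 0) = 6. With dep = 2 and the
 * resource busy for 9 more cycles, the cost is 2 + max(9 - 2, 0) = 9;
 * whichever limit binds determines the predicted stall. */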
static bool is_vector(aco_opcode op)
static bool
is_vector(aco_opcode op)
{
switch (instr_info.classes[(int)op]) {
case instr_class::valu32:
@ -358,14 +344,13 @@ static bool is_vector(aco_opcode op)
case instr_class::exp:
case instr_class::valu64:
case instr_class::valu_quarter_rate32:
case instr_class::valu_transcendental32:
return true;
default:
return false;
case instr_class::valu_transcendental32: return true;
default: return false;
}
}
void BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
void
BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
{
perf_info perf = get_perf_info(program, instr);
@ -411,13 +396,14 @@ void BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
int32_t result_available = start + MAX2(perf.latency, latency);
for (Definition& def : instr->definitions) {
int32_t *available = &reg_available[def.physReg().reg()];
int32_t* available = &reg_available[def.physReg().reg()];
for (unsigned i = 0; i < def.size(); i++)
available[i] = MAX2(available[i], result_available);
}
}
static void join_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pred, int cycle_diff)
static void
join_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pred, int cycle_diff)
{
for (unsigned i = 0; i < MIN2(queue.size(), pred.size()); i++)
queue.rbegin()[i] = MAX2(queue.rbegin()[i], pred.rbegin()[i] + cycle_diff);
@ -425,7 +411,8 @@ static void join_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pr
queue.push_front(pred[i] + cycle_diff);
}
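/* Example (hypothetical queues of completion times, oldest at the front):
 * joining queue = [4, 14] with pred = [9, 12] and cycle_diff = -2 rebases the
 * predecessor's entries to this block's clock and keeps the worst case per
 * position, newest first: the back becomes max(14, 12 - 2) = 14 and the
 * front max(4, 9 - 2) = 7, giving [7, 14]; any surplus predecessor entries
 * would be prepended on the old end. */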
void BlockCycleEstimator::join(const BlockCycleEstimator& pred)
void
BlockCycleEstimator::join(const BlockCycleEstimator& pred)
{
assert(cur_cycle == 0);
@ -435,8 +422,7 @@ void BlockCycleEstimator::join(const BlockCycleEstimator& pred)
}
for (unsigned i = 0; i < 512; i++)
reg_available[i] = MAX2(reg_available[i],
pred.reg_available[i] - pred.cur_cycle + cur_cycle);
reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle);
join_queue(lgkm, pred.lgkm, -pred.cur_cycle);
join_queue(exp, pred.exp, -pred.cur_cycle);
@ -445,11 +431,12 @@ void BlockCycleEstimator::join(const BlockCycleEstimator& pred)
}
/* instructions/branches/vmem_clauses/smem_clauses/cycles */
void collect_preasm_stats(Program *program)
void
collect_preasm_stats(Program* program)
{
for (Block& block : program->blocks) {
std::set<Instruction *> vmem_clause;
std::set<Instruction *> smem_clause;
std::set<Instruction*> vmem_clause;
std::set<Instruction*> smem_clause;
program->statistics[statistic_instructions] += block.instructions.size();
@ -462,7 +449,8 @@ void collect_preasm_stats(Program *program)
if (instr->isVMEM() && !instr->operands.empty()) {
if (std::none_of(vmem_clause.begin(), vmem_clause.end(),
[&](Instruction *other) {return should_form_clause(instr.get(), other);}))
[&](Instruction* other)
{ return should_form_clause(instr.get(), other); }))
program->statistics[statistic_vmem_clauses]++;
vmem_clause.insert(instr.get());
} else {
@ -471,12 +459,13 @@ void collect_preasm_stats(Program *program)
if (instr->isSMEM() && !instr->operands.empty()) {
if (std::none_of(smem_clause.begin(), smem_clause.end(),
[&](Instruction *other) {return should_form_clause(instr.get(), other);}))
[&](Instruction* other)
{ return should_form_clause(instr.get(), other); }))
program->statistics[statistic_smem_clauses]++;
smem_clause.insert(instr.get());
} else {
smem_clause.clear();
}
}
}
}
@ -514,8 +503,10 @@ void collect_preasm_stats(Program *program)
iter *= pow(0.5, block.uniform_if_depth);
iter *= pow(0.75, block.divergent_if_logical_depth);
bool divergent_if_linear_else = block.logical_preds.empty() && block.linear_preds.size() == 1 && block.linear_succs.size() == 1 &&
program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);
bool divergent_if_linear_else =
block.logical_preds.empty() && block.linear_preds.size() == 1 &&
block.linear_succs.size() == 1 &&
program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);
if (divergent_if_linear_else)
iter *= 0.25;
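/* Example (made-up nesting): a block at uniform_if_depth = 1 and
 * divergent_if_logical_depth = 2 is weighted 0.5^1 * 0.75^2 = 0.28125
 * expected executions; a linear-only else block of a divergent if is scaled
 * by a further 0.25. */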
@ -540,7 +531,8 @@ void collect_preasm_stats(Program *program)
double max_utilization = 1.0;
if (program->workgroup_size != UINT_MAX)
max_utilization = program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
max_utilization =
program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
wave64_per_cycle *= max_utilization;
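/* Example: a 48-invocation workgroup on wave64 hardware occupies
 * align(48, 64) = 64 lanes, so max_utilization = 48 / 64 = 0.75; a
 * 96-invocation workgroup on wave32 gives 96 / align(96, 32) = 1.0 and
 * leaves wave64_per_cycle unchanged. */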
program->statistics[statistic_latency] = round(latency);
@ -551,7 +543,8 @@ void collect_preasm_stats(Program *program)
fprintf(stderr, "num_waves: %u\n", program->num_waves);
fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]);
fprintf(stderr, "branch_sendmsg_usage: %f\n", usage[(int)BlockCycleEstimator::branch_sendmsg]);
fprintf(stderr, "branch_sendmsg_usage: %f\n",
usage[(int)BlockCycleEstimator::branch_sendmsg]);
fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]);
fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]);
fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]);
@ -565,9 +558,10 @@ void collect_preasm_stats(Program *program)
}
}
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code)
void
collect_postasm_stats(Program* program, const std::vector<uint32_t>& code)
{
program->statistics[aco::statistic_hash] = util_hash_crc32(code.data(), code.size() * 4);
}
}
} // namespace aco


@ -35,207 +35,198 @@
namespace aco {
/*! \brief Definition of a span object
*
* \details A "span" is an "array view" type for holding a view of contiguous
* data. The "span" object does not own the data itself.
*/
template <typename T>
class span {
*
* \details A "span" is an "array view" type for holding a view of contiguous
* data. The "span" object does not own the data itself.
*/
template <typename T> class span {
public:
using value_type = T;
using pointer = value_type*;
using const_pointer = const value_type*;
using reference = value_type&;
using const_reference = const value_type&;
using iterator = pointer;
using const_iterator = const_pointer;
using reverse_iterator = std::reverse_iterator<iterator>;
using value_type = T;
using pointer = value_type*;
using const_pointer = const value_type*;
using reference = value_type&;
using const_reference = const value_type&;
using iterator = pointer;
using const_iterator = const_pointer;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
using size_type = uint16_t;
using difference_type = ptrdiff_t;
using size_type = uint16_t;
using difference_type = ptrdiff_t;
/*! \brief Compiler generated default constructor
*/
*/
constexpr span() = default;
/*! \brief Constructor taking a pointer and the length of the span
* \param[in] data Pointer to the underlying data array
* \param[in] length The size of the span
*/
constexpr span(uint16_t offset_, const size_type length_)
: offset{ offset_ } , length{ length_ } {}
* \param[in] offset_ Byte offset from the span object to the underlying data
* \param[in] length_ The size of the span
*/
constexpr span(uint16_t offset_, const size_type length_) : offset{offset_}, length{length_} {}
/*! \brief Returns an iterator to the begin of the span
* \return data
*/
constexpr iterator begin() noexcept {
return (pointer)((uintptr_t)this + offset);
}
* \return data
*/
constexpr iterator begin() noexcept { return (pointer)((uintptr_t)this + offset); }
/*! \brief Returns a const_iterator to the begin of the span
* \return data
*/
constexpr const_iterator begin() const noexcept {
* \return data
*/
constexpr const_iterator begin() const noexcept
{
return (const_pointer)((uintptr_t)this + offset);
}
/*! \brief Returns an iterator to the end of the span
* \return data + length
*/
constexpr iterator end() noexcept {
return std::next(begin(), length);
}
* \return data + length
*/
constexpr iterator end() noexcept { return std::next(begin(), length); }
/*! \brief Returns a const_iterator to the end of the span
* \return data + length
*/
constexpr const_iterator end() const noexcept {
return std::next(begin(), length);
}
* \return data + length
*/
constexpr const_iterator end() const noexcept { return std::next(begin(), length); }
/*! \brief Returns a const_iterator to the begin of the span
* \return data
*/
constexpr const_iterator cbegin() const noexcept {
return begin();
}
* \return data
*/
constexpr const_iterator cbegin() const noexcept { return begin(); }
/*! \brief Returns a const_iterator to the end of the span
* \return data + length
*/
constexpr const_iterator cend() const noexcept {
return std::next(begin(), length);
}
* \return data + length
*/
constexpr const_iterator cend() const noexcept { return std::next(begin(), length); }
/*! \brief Returns a reverse_iterator to the end of the span
* \return reverse_iterator(end())
*/
constexpr reverse_iterator rbegin() noexcept {
return reverse_iterator(end());
}
* \return reverse_iterator(end())
*/
constexpr reverse_iterator rbegin() noexcept { return reverse_iterator(end()); }
/*! \brief Returns a const_reverse_iterator to the end of the span
* \return reverse_iterator(end())
*/
constexpr const_reverse_iterator rbegin() const noexcept {
* \return reverse_iterator(end())
*/
constexpr const_reverse_iterator rbegin() const noexcept
{
return const_reverse_iterator(end());
}
/*! \brief Returns a reverse_iterator to the begin of the span
* \return reverse_iterator(begin())
*/
constexpr reverse_iterator rend() noexcept {
return reverse_iterator(begin());
}
* \return reverse_iterator(begin())
*/
constexpr reverse_iterator rend() noexcept { return reverse_iterator(begin()); }
/*! \brief Returns a const_reverse_iterator to the begin of the span
* \return reverse_iterator(begin())
*/
constexpr const_reverse_iterator rend() const noexcept {
* \return reverse_iterator(begin())
*/
constexpr const_reverse_iterator rend() const noexcept
{
return const_reverse_iterator(begin());
}
/*! \brief Returns a const_reverse_iterator to the end of the span
* \return rbegin()
*/
constexpr const_reverse_iterator crbegin() const noexcept {
* \return rbegin()
*/
constexpr const_reverse_iterator crbegin() const noexcept
{
return const_reverse_iterator(cend());
}
/*! \brief Returns a const_reverse_iterator to the begin of the span
* \return rend()
*/
constexpr const_reverse_iterator crend() const noexcept {
* \return rend()
*/
constexpr const_reverse_iterator crend() const noexcept
{
return const_reverse_iterator(cbegin());
}
/*! \brief Unchecked access operator
* \param[in] index Index of the element we want to access
* \return *(std::next(data, index))
*/
constexpr reference operator[](const size_type index) noexcept {
* \param[in] index Index of the element we want to access
* \return *(std::next(data, index))
*/
constexpr reference operator[](const size_type index) noexcept
{
assert(length > index);
return *(std::next(begin(), index));
}
/*! \brief Unchecked const access operator
* \param[in] index Index of the element we want to access
* \return *(std::next(data, index))
*/
constexpr const_reference operator[](const size_type index) const noexcept {
* \param[in] index Index of the element we want to access
* \return *(std::next(data, index))
*/
constexpr const_reference operator[](const size_type index) const noexcept
{
assert(length > index);
return *(std::next(begin(), index));
}
/*! \brief Returns a reference to the last element of the span
* \return *(std::next(data, length - 1))
*/
constexpr reference back() noexcept {
* \return *(std::next(data, length - 1))
*/
constexpr reference back() noexcept
{
assert(length > 0);
return *(std::next(begin(), length - 1));
}
/*! \brief Returns a const_reference to the last element of the span
* \return *(std::next(data, length - 1))
*/
constexpr const_reference back() const noexcept {
* \return *(std::next(data, length - 1))
*/
constexpr const_reference back() const noexcept
{
assert(length > 0);
return *(std::next(begin(), length - 1));
}
/*! \brief Returns a reference to the first element of the span
* \return *begin()
*/
constexpr reference front() noexcept {
* \return *begin()
*/
constexpr reference front() noexcept
{
assert(length > 0);
return *begin();
}
/*! \brief Returns a const_reference to the first element of the span
* \return *cbegin()
*/
constexpr const_reference front() const noexcept {
* \return *cbegin()
*/
constexpr const_reference front() const noexcept
{
assert(length > 0);
return *cbegin();
}
/*! \brief Returns true if the span is empty
* \return length == 0
*/
constexpr bool empty() const noexcept {
return length == 0;
}
* \return length == 0
*/
constexpr bool empty() const noexcept { return length == 0; }
/*! \brief Returns the size of the span
* \return length == 0
*/
constexpr size_type size() const noexcept {
return length;
}
* \return length
*/
constexpr size_type size() const noexcept { return length; }
/*! \brief Decreases the size of the span by 1
*/
constexpr void pop_back() noexcept {
*/
constexpr void pop_back() noexcept
{
assert(length > 0);
--length;
}
/*! \brief Adds an element to the end of the span
*/
constexpr void push_back(const_reference val) noexcept {
*std::next(begin(), length++) = val;
}
*/
constexpr void push_back(const_reference val) noexcept { *std::next(begin(), length++) = val; }
/*! \brief Clears the span
*/
constexpr void clear() noexcept {
*/
constexpr void clear() noexcept
{
offset = 0;
length = 0;
}
private:
uint16_t offset{ 0 }; //!> Byte offset from span to data
size_type length{ 0 }; //!> Size of the span
uint16_t offset{0}; //!> Byte offset from span to data
size_type length{0}; //!> Size of the span
};
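/* Usage sketch (hypothetical struct, assuming only the interface above):
 * because begin() adds `offset` to `this`, a span is only meaningful inside
 * the same allocation as the elements it views, and copying the span object
 * to a new address silently retargets it relative to that address; hence it
 * suits intrusive, fixed-layout containers. E.g.: */
#include <cstdint>

struct PackedList {
   span<uint32_t> ids; /* views `storage` below */
   uint32_t storage[4];

   PackedList() : ids(uint16_t((uintptr_t)storage - (uintptr_t)&ids), 4) {}
};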
/*
@ -250,30 +241,32 @@ private:
*/
struct IDSet {
struct Iterator {
const IDSet *set;
const IDSet* set;
union {
struct {
uint32_t bit:6;
uint32_t word:26;
uint32_t bit : 6;
uint32_t word : 26;
};
uint32_t id;
};
Iterator& operator ++();
Iterator& operator++();
bool operator != (const Iterator& other) const;
bool operator!=(const Iterator& other) const;
uint32_t operator * () const;
uint32_t operator*() const;
};
size_t count(uint32_t id) const {
size_t count(uint32_t id) const
{
if (id >= words.size() * 64)
return 0;
return words[id / 64u] & (1ull << (id % 64u)) ? 1 : 0;
}
Iterator find(uint32_t id) const {
Iterator find(uint32_t id) const
{
if (!count(id))
return end();
@ -284,7 +277,8 @@ struct IDSet {
return it;
}
std::pair<Iterator, bool> insert(uint32_t id) {
std::pair<Iterator, bool> insert(uint32_t id)
{
if (words.size() * 64u <= id)
words.resize(id / 64u + 1);
@ -302,7 +296,8 @@ struct IDSet {
return std::make_pair(it, true);
}
size_t erase(uint32_t id) {
size_t erase(uint32_t id)
{
if (!count(id))
return 0;
@ -311,7 +306,8 @@ struct IDSet {
return 1;
}
Iterator cbegin() const {
Iterator cbegin() const
{
Iterator it;
it.set = this;
for (size_t i = 0; i < words.size(); i++) {
@ -324,7 +320,8 @@ struct IDSet {
return end();
}
Iterator cend() const {
Iterator cend() const
{
Iterator it;
it.set = this;
it.word = words.size();
@ -332,27 +329,21 @@ struct IDSet {
return it;
}
Iterator begin() const {
return cbegin();
}
Iterator begin() const { return cbegin(); }
Iterator end() const {
return cend();
}
Iterator end() const { return cend(); }
bool empty() const {
return bits_set == 0;
}
bool empty() const { return bits_set == 0; }
size_t size() const {
return bits_set;
}
size_t size() const { return bits_set; }
std::vector<uint64_t> words;
uint32_t bits_set = 0;
};
inline IDSet::Iterator& IDSet::Iterator::operator ++() {
inline IDSet::Iterator&
IDSet::Iterator::operator++()
{
uint64_t m = set->words[word];
m &= ~((2ull << bit) - 1ull);
if (!m) {
@ -374,12 +365,16 @@ inline IDSet::Iterator& IDSet::Iterator::operator ++() {
return *this;
}
inline bool IDSet::Iterator::operator != (const IDSet::Iterator& other) const {
inline bool
IDSet::Iterator::operator!=(const IDSet::Iterator& other) const
{
assert(set == other.set);
return id != other.id;
}
inline uint32_t IDSet::Iterator::operator * () const {
inline uint32_t
IDSet::Iterator::operator*() const
{
return (word << 6) | bit;
}
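/* Usage sketch (assumed values): ids are split as id = word * 64 + bit, one
 * bit per SSA id. E.g.:
 *
 *    IDSet live;
 *    live.insert(70);            // sets words[1] bit 6
 *    live.insert(3);             // sets words[0] bit 3
 *    assert(live.count(70) && live.size() == 2);
 *    for (uint32_t id : live)    // visits 3, then 70, in increasing order
 *       ;                        // ... use id ...
 *    live.erase(3);              // size() drops back to 1
 */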


@ -23,6 +23,7 @@
*/
#include "aco_ir.h"
#include "util/memstream.h"
#include <array>
@ -32,11 +33,11 @@
namespace aco {
static void aco_log(Program *program, enum radv_compiler_debug_level level,
const char *prefix, const char *file, unsigned line,
const char *fmt, va_list args)
static void
aco_log(Program* program, enum radv_compiler_debug_level level, const char* prefix,
const char* file, unsigned line, const char* fmt, va_list args)
{
char *msg;
char* msg;
if (program->debug.shorten_messages) {
msg = ralloc_vasprintf(NULL, fmt, args);
@ -55,38 +56,39 @@ static void aco_log(Program *program, enum radv_compiler_debug_level level,
ralloc_free(msg);
}
void _aco_perfwarn(Program *program, const char *file, unsigned line,
const char *fmt, ...)
void
_aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...)
{
va_list args;
va_start(args, fmt);
aco_log(program, RADV_COMPILER_DEBUG_LEVEL_PERFWARN,
"ACO PERFWARN:\n", file, line, fmt, args);
aco_log(program, RADV_COMPILER_DEBUG_LEVEL_PERFWARN, "ACO PERFWARN:\n", file, line, fmt, args);
va_end(args);
}
void _aco_err(Program *program, const char *file, unsigned line,
const char *fmt, ...)
void
_aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...)
{
va_list args;
va_start(args, fmt);
aco_log(program, RADV_COMPILER_DEBUG_LEVEL_ERROR,
"ACO ERROR:\n", file, line, fmt, args);
aco_log(program, RADV_COMPILER_DEBUG_LEVEL_ERROR, "ACO ERROR:\n", file, line, fmt, args);
va_end(args);
}
bool validate_ir(Program* program)
bool
validate_ir(Program* program)
{
bool is_valid = true;
auto check = [&program, &is_valid](bool success, const char * msg, aco::Instruction * instr) -> void {
auto check = [&program, &is_valid](bool success, const char* msg,
aco::Instruction* instr) -> void
{
if (!success) {
char *out;
char* out;
size_t outsize;
struct u_memstream mem;
u_memstream_open(&mem, &out, &outsize);
FILE *const memf = u_memstream_get(&mem);
FILE* const memf = u_memstream_get(&mem);
fprintf(memf, "%s: ", msg);
aco_print_instr(instr, memf);
@ -99,7 +101,9 @@ bool validate_ir(Program* program)
}
};
auto check_block = [&program, &is_valid](bool success, const char * msg, aco::Block * block) -> void {
auto check_block = [&program, &is_valid](bool success, const char* msg,
aco::Block* block) -> void
{
if (!success) {
aco_err(program, "%s: BB%u", msg, block->index);
is_valid = false;
@ -132,32 +136,32 @@ bool validate_ir(Program* program)
base_format = Format::VINTRP;
}
}
check(base_format == instr_info.format[(int)instr->opcode], "Wrong base format for instruction", instr.get());
check(base_format == instr_info.format[(int)instr->opcode],
"Wrong base format for instruction", instr.get());
/* check VOP3 modifiers */
if (instr->isVOP3() && instr->format != Format::VOP3) {
check(base_format == Format::VOP2 ||
base_format == Format::VOP1 ||
base_format == Format::VOPC ||
base_format == Format::VINTRP,
check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
base_format == Format::VOPC || base_format == Format::VINTRP,
"Format cannot have VOP3/VOP3B applied", instr.get());
}
/* check SDWA */
if (instr->isSDWA()) {
check(base_format == Format::VOP2 ||
base_format == Format::VOP1 ||
base_format == Format::VOPC,
check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
base_format == Format::VOPC,
"Format cannot have SDWA applied", instr.get());
check(program->chip_class >= GFX8, "SDWA is GFX8+ only", instr.get());
SDWA_instruction& sdwa = instr->sdwa();
check(sdwa.omod == 0 || program->chip_class >= GFX9, "SDWA omod only supported on GFX9+", instr.get());
check(sdwa.omod == 0 || program->chip_class >= GFX9,
"SDWA omod only supported on GFX9+", instr.get());
if (base_format == Format::VOPC) {
check(sdwa.clamp == false || program->chip_class == GFX8, "SDWA VOPC clamp only supported on GFX8", instr.get());
check(sdwa.clamp == false || program->chip_class == GFX8,
"SDWA VOPC clamp only supported on GFX8", instr.get());
check((instr->definitions[0].isFixed() && instr->definitions[0].physReg() == vcc) ||
program->chip_class >= GFX9,
program->chip_class >= GFX9,
"SDWA+VOPC definition must be fixed to vcc on GFX8", instr.get());
}
@ -171,8 +175,7 @@ bool validate_ir(Program* program)
}
const bool sdwa_opcodes =
instr->opcode != aco_opcode::v_fmac_f32 &&
instr->opcode != aco_opcode::v_fmac_f16 &&
instr->opcode != aco_opcode::v_fmac_f32 && instr->opcode != aco_opcode::v_fmac_f16 &&
instr->opcode != aco_opcode::v_fmamk_f32 &&
instr->opcode != aco_opcode::v_fmaak_f32 &&
instr->opcode != aco_opcode::v_fmamk_f16 &&
@ -186,67 +189,75 @@ bool validate_ir(Program* program)
const bool feature_mac =
program->chip_class == GFX8 &&
(instr->opcode == aco_opcode::v_mac_f32 &&
instr->opcode == aco_opcode::v_mac_f16);
(instr->opcode == aco_opcode::v_mac_f32 && instr->opcode == aco_opcode::v_mac_f16);
check(sdwa_opcodes || feature_mac, "SDWA can't be used with this opcode", instr.get());
if (instr->definitions[0].regClass().is_subdword())
check((sdwa.dst_sel & sdwa_asuint) == (sdwa_isra | instr->definitions[0].bytes()), "Unexpected SDWA sel for sub-dword definition", instr.get());
check((sdwa.dst_sel & sdwa_asuint) == (sdwa_isra | instr->definitions[0].bytes()),
"Unexpected SDWA sel for sub-dword definition", instr.get());
}
/* check opsel */
if (instr->isVOP3()) {
VOP3_instruction& vop3 = instr->vop3();
check(vop3.opsel == 0 || program->chip_class >= GFX9, "Opsel is only supported on GFX9+", instr.get());
check(vop3.opsel == 0 || program->chip_class >= GFX9,
"Opsel is only supported on GFX9+", instr.get());
for (unsigned i = 0; i < 3; i++) {
if (i >= instr->operands.size() ||
(instr->operands[i].hasRegClass() && instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed()))
(instr->operands[i].hasRegClass() &&
instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed()))
check((vop3.opsel & (1 << i)) == 0, "Unexpected opsel for operand", instr.get());
}
if (instr->definitions[0].regClass().is_subdword() && !instr->definitions[0].isFixed())
check((vop3.opsel & (1 << 3)) == 0, "Unexpected opsel for sub-dword definition", instr.get());
check((vop3.opsel & (1 << 3)) == 0, "Unexpected opsel for sub-dword definition",
instr.get());
}
/* check for undefs */
for (unsigned i = 0; i < instr->operands.size(); i++) {
if (instr->operands[i].isUndefined()) {
bool flat = instr->isFlatLike();
bool can_be_undef = is_phi(instr) || instr->isEXP() ||
instr->isReduction() ||
bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() ||
instr->opcode == aco_opcode::p_create_vector ||
(flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) ||
((instr->isMUBUF() || instr->isMTBUF()) && i == 1);
check(can_be_undef, "Undefs can only be used in certain operands", instr.get());
} else {
check(instr->operands[i].isFixed() || instr->operands[i].isTemp() || instr->operands[i].isConstant(), "Uninitialized Operand", instr.get());
check(instr->operands[i].isFixed() || instr->operands[i].isTemp() ||
instr->operands[i].isConstant(),
"Uninitialized Operand", instr.get());
}
}
/* check subdword definitions */
for (unsigned i = 0; i < instr->definitions.size(); i++) {
if (instr->definitions[i].regClass().is_subdword())
check(instr->isPseudo() || instr->definitions[i].bytes() <= 4, "Only Pseudo instructions can write subdword registers larger than 4 bytes", instr.get());
check(instr->isPseudo() || instr->definitions[i].bytes() <= 4,
"Only Pseudo instructions can write subdword registers larger than 4 bytes",
instr.get());
}
if (instr->isSALU() || instr->isVALU()) {
/* check literals */
Operand literal(s1);
for (unsigned i = 0; i < instr->operands.size(); i++)
{
for (unsigned i = 0; i < instr->operands.size(); i++) {
Operand op = instr->operands[i];
if (!op.isLiteral())
continue;
check(!instr->isDPP() && !instr->isSDWA() &&
(!instr->isVOP3() || program->chip_class >= GFX10) &&
(!instr->isVOP3P() || program->chip_class >= GFX10),
(!instr->isVOP3() || program->chip_class >= GFX10) &&
(!instr->isVOP3P() || program->chip_class >= GFX10),
"Literal applied on wrong instruction format", instr.get());
check(literal.isUndefined() || (literal.size() == op.size() && literal.constantValue() == op.constantValue()), "Only 1 Literal allowed", instr.get());
check(literal.isUndefined() || (literal.size() == op.size() &&
literal.constantValue() == op.constantValue()),
"Only 1 Literal allowed", instr.get());
literal = op;
check(instr->isSALU() || instr->isVOP3() || instr->isVOP3P() || i == 0 || i == 2, "Wrong source position for Literal argument", instr.get());
check(instr->isSALU() || instr->isVOP3() || instr->isVOP3P() || i == 0 || i == 2,
"Wrong source position for Literal argument", instr.get());
}
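/* Example (assumed encoding rules): v_add_f32 v0, 0x3e800000, v1 passes the
 * checks above (a single literal in source 0), while v_add_f32 v0, v1,
 * 0x3e800000 fails the source-position check for plain VOP2, and an
 * instruction using two distinct literal values fails the "Only 1 Literal
 * allowed" check. */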
/* check num sgprs for VALU */
@ -264,8 +275,7 @@ bool validate_ir(Program* program)
else if (instr->isDPP())
scalar_mask = 0x0;
if (instr->isVOPC() ||
instr->opcode == aco_opcode::v_readfirstlane_b32 ||
if (instr->isVOPC() || instr->opcode == aco_opcode::v_readfirstlane_b32 ||
instr->opcode == aco_opcode::v_readlane_b32 ||
instr->opcode == aco_opcode::v_readlane_b32_e64) {
check(instr->definitions[0].getTemp().type() == RegType::sgpr,
@ -277,45 +287,42 @@ bool validate_ir(Program* program)
unsigned num_sgprs = 0;
unsigned sgpr[] = {0, 0};
for (unsigned i = 0; i < instr->operands.size(); i++)
{
for (unsigned i = 0; i < instr->operands.size(); i++) {
Operand op = instr->operands[i];
if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
instr->opcode == aco_opcode::v_readlane_b32 ||
instr->opcode == aco_opcode::v_readlane_b32_e64) {
check(i != 1 ||
(op.isTemp() && op.regClass().type() == RegType::sgpr) ||
op.isConstant(),
check(i != 1 || (op.isTemp() && op.regClass().type() == RegType::sgpr) ||
op.isConstant(),
"Must be a SGPR or a constant", instr.get());
check(i == 1 ||
(op.isTemp() && op.regClass().type() == RegType::vgpr && op.bytes() <= 4),
check(i == 1 || (op.isTemp() && op.regClass().type() == RegType::vgpr &&
op.bytes() <= 4),
"Wrong Operand type for VALU instruction", instr.get());
continue;
}
if (instr->opcode == aco_opcode::v_permlane16_b32 ||
instr->opcode == aco_opcode::v_permlanex16_b32) {
check(i != 0 ||
(op.isTemp() && op.regClass().type() == RegType::vgpr),
check(i != 0 || (op.isTemp() && op.regClass().type() == RegType::vgpr),
"Operand 0 of v_permlane must be VGPR", instr.get());
check(i == 0 ||
(op.isTemp() && op.regClass().type() == RegType::sgpr) ||
op.isConstant(),
"Lane select operands of v_permlane must be SGPR or constant", instr.get());
check(i == 0 || (op.isTemp() && op.regClass().type() == RegType::sgpr) ||
op.isConstant(),
"Lane select operands of v_permlane must be SGPR or constant",
instr.get());
}
if (instr->opcode == aco_opcode::v_writelane_b32 ||
instr->opcode == aco_opcode::v_writelane_b32_e64) {
check(i != 2 ||
(op.isTemp() && op.regClass().type() == RegType::vgpr && op.bytes() <= 4),
check(i != 2 || (op.isTemp() && op.regClass().type() == RegType::vgpr &&
op.bytes() <= 4),
"Wrong Operand type for VALU instruction", instr.get());
check(i == 2 ||
(op.isTemp() && op.regClass().type() == RegType::sgpr) ||
op.isConstant(),
check(i == 2 || (op.isTemp() && op.regClass().type() == RegType::sgpr) ||
op.isConstant(),
"Must be a SGPR or a constant", instr.get());
continue;
}
if (op.isTemp() && instr->operands[i].regClass().type() == RegType::sgpr) {
check(scalar_mask & (1 << i), "Wrong source position for SGPR argument", instr.get());
check(scalar_mask & (1 << i), "Wrong source position for SGPR argument",
instr.get());
if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
if (num_sgprs < 2)
@ -324,19 +331,22 @@ bool validate_ir(Program* program)
}
if (op.isConstant() && !op.isLiteral())
check(scalar_mask & (1 << i), "Wrong source position for constant argument", instr.get());
check(scalar_mask & (1 << i), "Wrong source position for constant argument",
instr.get());
}
check(num_sgprs + (literal.isUndefined() ? 0 : 1) <= const_bus_limit, "Too many SGPRs/literals", instr.get());
check(num_sgprs + (literal.isUndefined() ? 0 : 1) <= const_bus_limit,
"Too many SGPRs/literals", instr.get());
}
if (instr->isSOP1() || instr->isSOP2()) {
check(instr->definitions[0].getTemp().type() == RegType::sgpr, "Wrong Definition type for SALU instruction", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::sgpr,
"Wrong Definition type for SALU instruction", instr.get());
for (const Operand& op : instr->operands) {
check(op.isConstant() || op.regClass().type() <= RegType::sgpr,
"Wrong Operand type for SALU instruction", instr.get());
check(op.isConstant() || op.regClass().type() <= RegType::sgpr,
"Wrong Operand type for SALU instruction", instr.get());
}
}
}
}
switch (instr->format) {
case Format::PSEUDO: {
@ -346,7 +356,8 @@ bool validate_ir(Program* program)
check(op.bytes() < 4 || size % 4 == 0, "Operand is not aligned", instr.get());
size += op.bytes();
}
check(size == instr->definitions[0].bytes(), "Definition size does not match operand sizes", instr.get());
check(size == instr->definitions[0].bytes(),
"Definition size does not match operand sizes", instr.get());
if (instr->definitions[0].getTemp().type() == RegType::sgpr) {
for (const Operand& op : instr->operands) {
check(op.isConstant() || op.regClass().type() == RegType::sgpr,
@ -354,55 +365,75 @@ bool validate_ir(Program* program)
}
}
} else if (instr->opcode == aco_opcode::p_extract_vector) {
check((instr->operands[0].isTemp()) && instr->operands[1].isConstant(), "Wrong Operand types", instr.get());
check((instr->operands[1].constantValue() + 1) * instr->definitions[0].bytes() <= instr->operands[0].bytes(), "Index out of range", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->operands[0].regClass().type() == RegType::sgpr,
check((instr->operands[0].isTemp()) && instr->operands[1].isConstant(),
"Wrong Operand types", instr.get());
check((instr->operands[1].constantValue() + 1) * instr->definitions[0].bytes() <=
instr->operands[0].bytes(),
"Index out of range", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::vgpr ||
instr->operands[0].regClass().type() == RegType::sgpr,
"Cannot extract SGPR value from VGPR vector", instr.get());
check(program->chip_class >= GFX9 || !instr->definitions[0].regClass().is_subdword() ||
instr->operands[0].regClass().type() == RegType::vgpr, "Cannot extract subdword from SGPR before GFX9+", instr.get());
check(program->chip_class >= GFX9 ||
!instr->definitions[0].regClass().is_subdword() ||
instr->operands[0].regClass().type() == RegType::vgpr,
"Cannot extract subdword from SGPR before GFX9+", instr.get());
} else if (instr->opcode == aco_opcode::p_split_vector) {
check(instr->operands[0].isTemp(), "Operand must be a temporary", instr.get());
unsigned size = 0;
for (const Definition& def : instr->definitions) {
size += def.bytes();
}
check(size == instr->operands[0].bytes(), "Operand size does not match definition sizes", instr.get());
check(size == instr->operands[0].bytes(),
"Operand size does not match definition sizes", instr.get());
if (instr->operands[0].getTemp().type() == RegType::vgpr) {
for (const Definition& def : instr->definitions)
check(def.regClass().type() == RegType::vgpr, "Wrong Definition type for VGPR split_vector", instr.get());
check(def.regClass().type() == RegType::vgpr,
"Wrong Definition type for VGPR split_vector", instr.get());
} else {
for (const Definition& def : instr->definitions)
check(program->chip_class >= GFX9 || !def.regClass().is_subdword(), "Cannot split SGPR into subdword VGPRs before GFX9+", instr.get());
check(program->chip_class >= GFX9 || !def.regClass().is_subdword(),
"Cannot split SGPR into subdword VGPRs before GFX9+", instr.get());
}
} else if (instr->opcode == aco_opcode::p_parallelcopy) {
check(instr->definitions.size() == instr->operands.size(), "Number of Operands does not match number of Definitions", instr.get());
check(instr->definitions.size() == instr->operands.size(),
"Number of Operands does not match number of Definitions", instr.get());
for (unsigned i = 0; i < instr->operands.size(); i++) {
check(instr->definitions[i].bytes() == instr->operands[i].bytes(), "Operand and Definition size must match", instr.get());
check(instr->definitions[i].bytes() == instr->operands[i].bytes(),
"Operand and Definition size must match", instr.get());
if (instr->operands[i].isTemp())
check((instr->definitions[i].getTemp().type() == instr->operands[i].regClass().type()) ||
(instr->definitions[i].getTemp().type() == RegType::vgpr && instr->operands[i].regClass().type() == RegType::sgpr),
check((instr->definitions[i].getTemp().type() ==
instr->operands[i].regClass().type()) ||
(instr->definitions[i].getTemp().type() == RegType::vgpr &&
instr->operands[i].regClass().type() == RegType::sgpr),
"Operand and Definition types do not match", instr.get());
}
} else if (instr->opcode == aco_opcode::p_phi) {
check(instr->operands.size() == block.logical_preds.size(), "Number of Operands does not match number of predecessors", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::vgpr, "Logical Phi Definition must be vgpr", instr.get());
check(instr->operands.size() == block.logical_preds.size(),
"Number of Operands does not match number of predecessors", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::vgpr,
"Logical Phi Definition must be vgpr", instr.get());
for (const Operand& op : instr->operands)
check(instr->definitions[0].size() == op.size(), "Operand sizes must match Definition size", instr.get());
check(instr->definitions[0].size() == op.size(),
"Operand sizes must match Definition size", instr.get());
} else if (instr->opcode == aco_opcode::p_linear_phi) {
for (const Operand& op : instr->operands) {
check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type", instr.get());
check(instr->definitions[0].size() == op.size(), "Operand sizes must match Definition size", instr.get());
check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type",
instr.get());
check(instr->definitions[0].size() == op.size(),
"Operand sizes must match Definition size", instr.get());
}
check(instr->operands.size() == block.linear_preds.size(), "Number of Operands does not match number of predecessors", instr.get());
} else if (instr->opcode == aco_opcode::p_extract || instr->opcode == aco_opcode::p_insert) {
check(instr->operands[0].isTemp(),
"Data operand must be temporary", instr.get());
check(instr->operands.size() == block.linear_preds.size(),
"Number of Operands does not match number of predecessors", instr.get());
} else if (instr->opcode == aco_opcode::p_extract ||
instr->opcode == aco_opcode::p_insert) {
check(instr->operands[0].isTemp(), "Data operand must be temporary", instr.get());
check(instr->operands[1].isConstant(), "Index must be constant", instr.get());
if (instr->opcode == aco_opcode::p_extract)
check(instr->operands[3].isConstant(), "Sign-extend flag must be constant", instr.get());
check(instr->operands[3].isConstant(), "Sign-extend flag must be constant",
instr.get());
check(instr->definitions[0].getTemp().type() != RegType::sgpr ||
instr->operands[0].getTemp().type() == RegType::sgpr,
instr->operands[0].getTemp().type() == RegType::sgpr,
"Can't extract/insert VGPR to SGPR", instr.get());
if (instr->operands[0].getTemp().type() == RegType::vgpr)
@ -410,69 +441,106 @@ bool validate_ir(Program* program)
"Sizes of operand and definition must match", instr.get());
if (instr->definitions[0].getTemp().type() == RegType::sgpr)
check(instr->definitions.size() >= 2 && instr->definitions[1].isFixed() && instr->definitions[1].physReg() == scc, "SGPR extract/insert needs a SCC definition", instr.get());
check(instr->definitions.size() >= 2 && instr->definitions[1].isFixed() &&
instr->definitions[1].physReg() == scc,
"SGPR extract/insert needs a SCC definition", instr.get());
check(instr->operands[2].constantEquals(8) || instr->operands[2].constantEquals(16), "Size must be 8 or 16", instr.get());
check(instr->operands[2].constantValue() < instr->operands[0].getTemp().bytes() * 8u, "Size must be smaller than source", instr.get());
check(instr->operands[2].constantEquals(8) || instr->operands[2].constantEquals(16),
"Size must be 8 or 16", instr.get());
check(instr->operands[2].constantValue() < instr->operands[0].getTemp().bytes() * 8u,
"Size must be smaller than source", instr.get());
unsigned comp = instr->operands[0].bytes() * 8u / MAX2(instr->operands[2].constantValue(), 1);
check(instr->operands[1].constantValue() < comp, "Index must be in-bounds", instr.get());
unsigned comp =
instr->operands[0].bytes() * 8u / MAX2(instr->operands[2].constantValue(), 1);
check(instr->operands[1].constantValue() < comp, "Index must be in-bounds",
instr.get());
}
break;
}
case Format::PSEUDO_REDUCTION: {
for (const Operand &op : instr->operands)
check(op.regClass().type() == RegType::vgpr, "All operands of PSEUDO_REDUCTION instructions must be in VGPRs.", instr.get());
for (const Operand& op : instr->operands)
check(op.regClass().type() == RegType::vgpr,
"All operands of PSEUDO_REDUCTION instructions must be in VGPRs.",
instr.get());
if (instr->opcode == aco_opcode::p_reduce && instr->reduction().cluster_size == program->wave_size)
check(instr->definitions[0].regClass().type() == RegType::sgpr || program->wave_size == 32, "The result of unclustered reductions must go into an SGPR.", instr.get());
if (instr->opcode == aco_opcode::p_reduce &&
instr->reduction().cluster_size == program->wave_size)
check(instr->definitions[0].regClass().type() == RegType::sgpr ||
program->wave_size == 32,
"The result of unclustered reductions must go into an SGPR.", instr.get());
else
check(instr->definitions[0].regClass().type() == RegType::vgpr, "The result of scans and clustered reductions must go into a VGPR.", instr.get());
check(instr->definitions[0].regClass().type() == RegType::vgpr,
"The result of scans and clustered reductions must go into a VGPR.",
instr.get());
break;
}
case Format::SMEM: {
if (instr->operands.size() >= 1)
check((instr->operands[0].isFixed() && !instr->operands[0].isConstant()) ||
(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr), "SMEM operands must be sgpr", instr.get());
(instr->operands[0].isTemp() &&
instr->operands[0].regClass().type() == RegType::sgpr),
"SMEM operands must be sgpr", instr.get());
if (instr->operands.size() >= 2)
check(instr->operands[1].isConstant() || (instr->operands[1].isTemp() && instr->operands[1].regClass().type() == RegType::sgpr),
check(instr->operands[1].isConstant() ||
(instr->operands[1].isTemp() &&
instr->operands[1].regClass().type() == RegType::sgpr),
"SMEM offset must be constant or sgpr", instr.get());
if (!instr->definitions.empty())
check(instr->definitions[0].getTemp().type() == RegType::sgpr, "SMEM result must be sgpr", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::sgpr,
"SMEM result must be sgpr", instr.get());
break;
}
case Format::MTBUF:
case Format::MUBUF: {
check(instr->operands.size() > 1, "VMEM instructions must have at least one operand", instr.get());
check(instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::vgpr,
check(instr->operands.size() > 1, "VMEM instructions must have at least one operand",
instr.get());
check(instr->operands[1].hasRegClass() &&
instr->operands[1].regClass().type() == RegType::vgpr,
"VADDR must be in vgpr for VMEM instructions", instr.get());
check(
instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr,
"VMEM resource constant must be sgpr", instr.get());
check(instr->operands.size() < 4 ||
(instr->operands[3].isTemp() &&
instr->operands[3].regClass().type() == RegType::vgpr),
"VMEM write data must be vgpr", instr.get());
break;
}
case Format::MIMG: {
check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands", instr.get());
check(instr->operands[0].hasRegClass() && (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands",
instr.get());
check(instr->operands[0].hasRegClass() &&
(instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
"MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get());
if (instr->operands[1].hasRegClass())
check(instr->operands[1].regClass() == s4, "MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get());
check(instr->operands[1].regClass() == s4,
"MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get());
if (!instr->operands[2].isUndefined()) {
bool is_cmpswap = instr->opcode == aco_opcode::image_atomic_cmpswap ||
instr->opcode == aco_opcode::image_atomic_fcmpswap;
check(instr->definitions.empty() ||
(instr->definitions[0].regClass() == instr->operands[2].regClass() ||
is_cmpswap),
"MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and "
"TFE/LWE loads",
instr.get());
}
check(instr->operands.size() == 4 || program->chip_class >= GFX10, "NSA is only supported on GFX10+", instr.get());
check(instr->operands.size() == 4 || program->chip_class >= GFX10,
"NSA is only supported on GFX10+", instr.get());
for (unsigned i = 3; i < instr->operands.size(); i++) {
if (instr->operands.size() == 4) {
check(instr->operands[i].hasRegClass() &&
instr->operands[i].regClass().type() == RegType::vgpr,
"MIMG operands[3] (VADDR) must be VGPR", instr.get());
} else {
check(instr->operands[i].regClass() == v1, "MIMG VADDR must be v1 if NSA is used", instr.get());
check(instr->operands[i].regClass() == v1, "MIMG VADDR must be v1 if NSA is used",
instr.get());
}
}
check(instr->definitions.empty() ||
(instr->definitions[0].isTemp() &&
instr->definitions[0].regClass().type() == RegType::vgpr),
"MIMG definitions[0] (VDATA) must be VGPR", instr.get());
break;
}
@ -482,31 +550,38 @@ bool validate_ir(Program* program)
"Only VGPRs are valid DS instruction operands", instr.get());
}
if (!instr->definitions.empty())
check(instr->definitions[0].getTemp().type() == RegType::vgpr, "DS instruction must return VGPR", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::vgpr,
"DS instruction must return VGPR", instr.get());
break;
}
case Format::EXP: {
for (unsigned i = 0; i < 4; i++)
check(instr->operands[i].hasRegClass() &&
instr->operands[i].regClass().type() == RegType::vgpr,
"Only VGPRs are valid Export arguments", instr.get());
break;
}
case Format::FLAT:
check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR", instr.get());
check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR",
instr.get());
FALLTHROUGH;
case Format::GLOBAL:
case Format::SCRATCH: {
check(
instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::vgpr,
"FLAT/GLOBAL/SCRATCH address must be vgpr", instr.get());
check(instr->operands[1].hasRegClass() &&
instr->operands[1].regClass().type() == RegType::sgpr,
"FLAT/GLOBAL/SCRATCH sgpr address must be undefined or sgpr", instr.get());
if (!instr->definitions.empty())
check(instr->definitions[0].getTemp().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get());
check(instr->definitions[0].getTemp().type() == RegType::vgpr,
"FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get());
else
check(instr->operands[2].regClass().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get());
check(instr->operands[2].regClass().type() == RegType::vgpr,
"FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get());
break;
}
default: break;
}
}
}
@ -518,20 +593,26 @@ bool validate_ir(Program* program)
/* predecessors/successors should be sorted */
for (unsigned j = 0; j + 1 < block.linear_preds.size(); j++)
check_block(block.linear_preds[j] < block.linear_preds[j + 1], "linear predecessors must be sorted", &block);
check_block(block.linear_preds[j] < block.linear_preds[j + 1],
"linear predecessors must be sorted", &block);
for (unsigned j = 0; j + 1 < block.logical_preds.size(); j++)
check_block(block.logical_preds[j] < block.logical_preds[j + 1], "logical predecessors must be sorted", &block);
check_block(block.logical_preds[j] < block.logical_preds[j + 1],
"logical predecessors must be sorted", &block);
for (unsigned j = 0; j + 1 < block.linear_succs.size(); j++)
check_block(block.linear_succs[j] < block.linear_succs[j + 1], "linear successors must be sorted", &block);
check_block(block.linear_succs[j] < block.linear_succs[j + 1],
"linear successors must be sorted", &block);
for (unsigned j = 0; j + 1 < block.logical_succs.size(); j++)
check_block(block.logical_succs[j] < block.logical_succs[j + 1], "logical successors must be sorted", &block);
check_block(block.logical_succs[j] < block.logical_succs[j + 1],
"logical successors must be sorted", &block);
/* critical edges are not allowed */
if (block.linear_preds.size() > 1) {
for (unsigned pred : block.linear_preds)
check_block(program->blocks[pred].linear_succs.size() == 1, "linear critical edges are not allowed", &program->blocks[pred]);
check_block(program->blocks[pred].linear_succs.size() == 1,
"linear critical edges are not allowed", &program->blocks[pred]);
for (unsigned pred : block.logical_preds)
check_block(program->blocks[pred].logical_succs.size() == 1, "logical critical edges are not allowed", &program->blocks[pred]);
check_block(program->blocks[pred].logical_succs.size() == 1,
"logical critical edges are not allowed", &program->blocks[pred]);
}
}
@ -544,8 +625,8 @@ namespace {
struct Location {
Location() : block(NULL), instr(NULL) {}
Block* block;
Instruction* instr; // NULL if it's the block's live-in
};
struct Assignment {
@ -554,18 +635,20 @@ struct Assignment {
PhysReg reg;
};
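/* Helper that reports an RA validation error: it formats the message together
 * with the offending instruction and location, and returns true so callers
 * can accumulate failures with |=. */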
bool
ra_fail(Program* program, Location loc, Location loc2, const char* fmt, ...)
{
va_list args;
va_start(args, fmt);
char msg[1024];
vsprintf(msg, fmt, args);
va_end(args);
char* out;
size_t outsize;
struct u_memstream mem;
u_memstream_open(&mem, &out, &outsize);
FILE* const memf = u_memstream_get(&mem);
fprintf(memf, "RA error found at instruction in BB%d:\n", loc.block->index);
if (loc.instr) {
@ -587,7 +670,8 @@ bool ra_fail(Program *program, Location loc, Location loc2, const char *fmt, ...
return true;
}
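/* Whether a sub-dword operand may be read from the byte offset it was
 * assigned to on this chip: byte 0 is always valid, and the opcode-specific
 * cases below allow higher offsets. */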
bool
validate_subdword_operand(chip_class chip, const aco_ptr<Instruction>& instr, unsigned index)
{
Operand op = instr->operands[index];
unsigned byte = op.physReg().byte();
@ -635,14 +719,14 @@ bool validate_subdword_operand(chip_class chip, const aco_ptr<Instruction>& inst
if (byte == 2 && index == 2)
return true;
break;
default: break;
}
return byte == 0;
}
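/* Same idea for sub-dword definitions: the *_d16_hi variants shown here may
 * write to byte 2; anything not special-cased must be assigned to byte 0. */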
bool
validate_subdword_definition(chip_class chip, const aco_ptr<Instruction>& instr)
{
Definition def = instr->definitions[0];
unsigned byte = def.physReg().byte();
@ -664,16 +748,15 @@ bool validate_subdword_definition(chip_class chip, const aco_ptr<Instruction>& i
case aco_opcode::global_load_ubyte_d16_hi:
case aco_opcode::global_load_short_d16_hi:
case aco_opcode::ds_read_u8_d16_hi:
case aco_opcode::ds_read_u16_d16_hi: return byte == 2;
default: break;
}
return byte == 0;
}
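/* How many bytes of the destination register an instruction actually
 * overwrites; this can exceed def.bytes(), e.g. the d16_hi loads clobber
 * the whole dword when SRAM ECC is enabled. */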
unsigned
get_subdword_bytes_written(Program* program, const aco_ptr<Instruction>& instr, unsigned index)
{
chip_class chip = program->chip_class;
Definition def = instr->definitions[index];
@ -703,8 +786,7 @@ unsigned get_subdword_bytes_written(Program *program, const aco_ptr<Instruction>
case aco_opcode::global_load_ubyte_d16_hi:
case aco_opcode::global_load_short_d16_hi:
case aco_opcode::ds_read_u8_d16_hi:
case aco_opcode::ds_read_u16_d16_hi: return program->dev.sram_ecc_enabled ? 4 : 2;
case aco_opcode::v_mad_f16:
case aco_opcode::v_mad_u16:
case aco_opcode::v_mad_i16:
@ -714,16 +796,18 @@ unsigned get_subdword_bytes_written(Program *program, const aco_ptr<Instruction>
if (chip >= GFX9)
return 2;
break;
default: break;
}
return MAX2(chip >= GFX10 ? def.bytes() : 4,
instr_info.definition_size[(int)instr->opcode] / 8u);
}
} /* end namespace */
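/* Validate the register assignment of the whole program; only runs when the
 * DEBUG_VALIDATE_RA flag is set, and returns whether an error was found. */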
bool
validate_ra(Program* program)
{
if (!(debug_flags & DEBUG_VALIDATE_RA))
return false;
@ -754,13 +838,21 @@ bool validate_ra(Program *program) {
if (!op.isFixed())
err |= ra_fail(program, loc, Location(), "Operand %d is not assigned a register", i);
if (assignments.count(op.tempId()) && assignments[op.tempId()].reg != op.physReg())
err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an inconsistent register assignment with instruction", i);
if ((op.getTemp().type() == RegType::vgpr && op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
(op.getTemp().type() == RegType::sgpr && op.physReg() + op.size() > program->config->num_sgprs && op.physReg() < sgpr_limit))
err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an out-of-bounds register assignment", i);
err |=
ra_fail(program, loc, assignments.at(op.tempId()).firstloc,
"Operand %d has an inconsistent register assignment with instruction", i);
if ((op.getTemp().type() == RegType::vgpr &&
op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
(op.getTemp().type() == RegType::sgpr &&
op.physReg() + op.size() > program->config->num_sgprs &&
op.physReg() < sgpr_limit))
err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc,
"Operand %d has an out-of-bounds register assignment", i);
if (op.physReg() == vcc && !program->needs_vcc)
err |= ra_fail(program, loc, Location(), "Operand %d fixed to vcc but needs_vcc=false", i);
if (op.regClass().is_subdword() && !validate_subdword_operand(program->chip_class, instr, i))
err |= ra_fail(program, loc, Location(),
"Operand %d fixed to vcc but needs_vcc=false", i);
if (op.regClass().is_subdword() &&
!validate_subdword_operand(program->chip_class, instr, i))
err |= ra_fail(program, loc, Location(), "Operand %d not aligned correctly", i);
if (!assignments[op.tempId()].firstloc.block)
assignments[op.tempId()].firstloc = loc;
@ -773,15 +865,23 @@ bool validate_ra(Program *program) {
if (!def.isTemp())
continue;
if (!def.isFixed())
err |= ra_fail(program, loc, Location(), "Definition %d is not assigned a register", i);
err |=
ra_fail(program, loc, Location(), "Definition %d is not assigned a register", i);
if (assignments[def.tempId()].defloc.block)
err |= ra_fail(program, loc, assignments.at(def.tempId()).defloc, "Temporary %%%d also defined by instruction", def.tempId());
if ((def.getTemp().type() == RegType::vgpr && def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
(def.getTemp().type() == RegType::sgpr && def.physReg() + def.size() > program->config->num_sgprs && def.physReg() < sgpr_limit))
err |= ra_fail(program, loc, assignments.at(def.tempId()).firstloc, "Definition %d has an out-of-bounds register assignment", i);
err |= ra_fail(program, loc, assignments.at(def.tempId()).defloc,
"Temporary %%%d also defined by instruction", def.tempId());
if ((def.getTemp().type() == RegType::vgpr &&
def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
(def.getTemp().type() == RegType::sgpr &&
def.physReg() + def.size() > program->config->num_sgprs &&
def.physReg() < sgpr_limit))
err |= ra_fail(program, loc, assignments.at(def.tempId()).firstloc,
"Definition %d has an out-of-bounds register assignment", i);
if (def.physReg() == vcc && !program->needs_vcc)
err |= ra_fail(program, loc, Location(), "Definition %d fixed to vcc but needs_vcc=false", i);
if (def.regClass().is_subdword() && !validate_subdword_definition(program->chip_class, instr))
err |= ra_fail(program, loc, Location(),
"Definition %d fixed to vcc but needs_vcc=false", i);
if (def.regClass().is_subdword() &&
!validate_subdword_definition(program->chip_class, instr))
err |= ra_fail(program, loc, Location(), "Definition %d not aligned correctly", i);
if (!assignments[def.tempId()].firstloc.block)
assignments[def.tempId()].firstloc = loc;
@ -810,7 +910,9 @@ bool validate_ra(Program *program) {
PhysReg reg = assignments.at(tmp.id()).reg;
for (unsigned i = 0; i < tmp.bytes(); i++) {
if (regs[reg.reg_b + i]) {
err |= ra_fail(program, loc, Location(), "Assignment of element %d of %%%d already taken by %%%d in live-out", i, tmp.id(), regs[reg.reg_b + i]);
err |= ra_fail(program, loc, Location(),
"Assignment of element %d of %%%d already taken by %%%d in live-out",
i, tmp.id(), regs[reg.reg_b + i]);
}
regs[reg.reg_b + i] = tmp.id();
}
@ -826,7 +928,10 @@ bool validate_ra(Program *program) {
PhysReg reg = assignments.at(tmp.id()).reg;
for (unsigned i = 0; i < tmp.bytes(); i++) {
if (regs[reg.reg_b + i])
err |= ra_fail(program, loc, Location(), "Assignment of element %d of %%%d already taken by %%%d in live-out", i, tmp.id(), regs[reg.reg_b + i]);
err |= ra_fail(
program, loc, Location(),
"Assignment of element %d of %%%d already taken by %%%d in live-out", i,
tmp.id(), regs[reg.reg_b + i]);
}
live.emplace(tmp);
}
@ -886,16 +991,23 @@ bool validate_ra(Program *program) {
PhysReg reg = assignments.at(tmp.id()).reg;
for (unsigned j = 0; j < tmp.bytes(); j++) {
if (regs[reg.reg_b + j])
err |= ra_fail(
program, loc, assignments.at(regs[reg.reg_b + j]).defloc,
"Assignment of element %d of %%%d already taken by %%%d from instruction", i,
tmp.id(), regs[reg.reg_b + j]);
regs[reg.reg_b + j] = tmp.id();
}
if (def.regClass().is_subdword() && def.bytes() < 4) {
unsigned written = get_subdword_bytes_written(program, instr, i);
/* If written=4, the instruction still might write the upper half. In that case, it's
* the lower half that isn't preserved */
for (unsigned j = reg.byte() & ~(written - 1); j < written; j++) {
unsigned written_reg = reg.reg() * 4u + j;
if (regs[written_reg] && regs[written_reg] != def.tempId())
err |= ra_fail(program, loc, assignments.at(regs[written_reg]).defloc, "Assignment of element %d of %%%d overwrites the full register taken by %%%d from instruction", i, tmp.id(), regs[written_reg]);
err |= ra_fail(program, loc, assignments.at(regs[written_reg]).defloc,
"Assignment of element %d of %%%d overwrites the full register "
"taken by %%%d from instruction",
i, tmp.id(), regs[written_reg]);
}
}
}
@ -924,4 +1036,4 @@ bool validate_ra(Program *program) {
return err;
}
} // namespace aco