aco: Format.
Manually adjusted some comments for more intuitive line breaks.
Reviewed-by: Tony Wasserka <tony.wasserka@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11258>
This commit is contained in:
parent
97ec360dc4
commit
1e2639026f
|
@ -41,14 +41,15 @@ struct constaddr_info {
|
|||
};
|
||||
|
||||
struct asm_context {
|
||||
Program *program;
|
||||
Program* program;
|
||||
enum chip_class chip_class;
|
||||
std::vector<std::pair<int, SOPP_instruction*>> branches;
|
||||
std::map<unsigned, constaddr_info> constaddrs;
|
||||
const int16_t* opcode;
|
||||
// TODO: keep track of branch instructions referring blocks
|
||||
// and, when emitting the block, correct the offset in instr
|
||||
asm_context(Program* program_) : program(program_), chip_class(program->chip_class) {
|
||||
asm_context(Program* program_) : program(program_), chip_class(program->chip_class)
|
||||
{
|
||||
if (chip_class <= GFX7)
|
||||
opcode = &instr_info.opcode_gfx7[0];
|
||||
else if (chip_class <= GFX9)
|
||||
|
@ -60,7 +61,8 @@ struct asm_context {
|
|||
int subvector_begin_pos = -1;
|
||||
};
|
||||
|
||||
static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg)
|
||||
static uint32_t
|
||||
get_sdwa_sel(unsigned sel, PhysReg reg)
|
||||
{
|
||||
if (sel & sdwa_isra) {
|
||||
unsigned size = sdwa_rasize & sel;
|
||||
|
@ -72,7 +74,9 @@ static uint32_t get_sdwa_sel(unsigned sel, PhysReg reg)
|
|||
return sel & sdwa_asuint;
|
||||
}
|
||||
|
||||
unsigned get_mimg_nsa_dwords(const Instruction *instr) {
|
||||
unsigned
|
||||
get_mimg_nsa_dwords(const Instruction* instr)
|
||||
{
|
||||
unsigned addr_dwords = instr->operands.size() - 3;
|
||||
for (unsigned i = 1; i < addr_dwords; i++) {
|
||||
if (instr->operands[3 + i].physReg() != instr->operands[3].physReg().advance(i * 4))
|
||||
|
@ -81,7 +85,8 @@ unsigned get_mimg_nsa_dwords(const Instruction *instr) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)
|
||||
void
|
||||
emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* instr)
|
||||
{
|
||||
/* lower remaining pseudo-instructions */
|
||||
if (instr->opcode == aco_opcode::p_constaddr_getpc) {
|
||||
|
@ -99,11 +104,11 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
|
||||
uint32_t opcode = ctx.opcode[(int)instr->opcode];
|
||||
if (opcode == (uint32_t)-1) {
|
||||
char *outmem;
|
||||
char* outmem;
|
||||
size_t outsize;
|
||||
struct u_memstream mem;
|
||||
u_memstream_open(&mem, &outmem, &outsize);
|
||||
FILE *const memf = u_memstream_get(&mem);
|
||||
FILE* const memf = u_memstream_get(&mem);
|
||||
|
||||
fprintf(memf, "Unsupported opcode: ");
|
||||
aco_print_instr(instr, memf);
|
||||
|
@ -144,11 +149,11 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
|
||||
uint32_t encoding = (0b1011 << 28);
|
||||
encoding |= opcode << 23;
|
||||
encoding |=
|
||||
!instr->definitions.empty() && !(instr->definitions[0].physReg() == scc) ?
|
||||
instr->definitions[0].physReg() << 16 :
|
||||
!instr->operands.empty() && instr->operands[0].physReg() <= 127 ?
|
||||
instr->operands[0].physReg() << 16 : 0;
|
||||
encoding |= !instr->definitions.empty() && !(instr->definitions[0].physReg() == scc)
|
||||
? instr->definitions[0].physReg() << 16
|
||||
: !instr->operands.empty() && instr->operands[0].physReg() <= 127
|
||||
? instr->operands[0].physReg() << 16
|
||||
: 0;
|
||||
encoding |= sopk.imm;
|
||||
out.push_back(encoding);
|
||||
break;
|
||||
|
@ -177,7 +182,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
SOPP_instruction& sopp = instr->sopp();
|
||||
uint32_t encoding = (0b101111111 << 23);
|
||||
encoding |= opcode << 16;
|
||||
encoding |= (uint16_t) sopp.imm;
|
||||
encoding |= (uint16_t)sopp.imm;
|
||||
if (sopp.block != -1) {
|
||||
sopp.pass_flags = 0;
|
||||
ctx.branches.emplace_back(out.size(), &sopp);
|
||||
|
@ -208,7 +213,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
}
|
||||
out.push_back(encoding);
|
||||
/* SMRD instructions can take a literal on GFX7 */
|
||||
if (instr->operands.size() >= 2 && instr->operands[1].isConstant() && instr->operands[1].constantValue() >= 1024)
|
||||
if (instr->operands.size() >= 2 && instr->operands[1].isConstant() &&
|
||||
instr->operands[1].constantValue() >= 1024)
|
||||
out.push_back(instr->operands[1].constantValue() >> 2);
|
||||
return;
|
||||
}
|
||||
|
@ -235,7 +241,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
}
|
||||
|
||||
if (is_load || instr->operands.size() >= 3) { /* SDATA */
|
||||
encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg()) << 6;
|
||||
encoding |= (is_load ? instr->definitions[0].physReg() : instr->operands[2].physReg())
|
||||
<< 6;
|
||||
}
|
||||
if (instr->operands.size() >= 1) { /* SBASE */
|
||||
encoding |= instr->operands[0].physReg() >> 1;
|
||||
|
@ -246,14 +253,16 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
|
||||
int32_t offset = 0;
|
||||
uint32_t soffset = ctx.chip_class >= GFX10
|
||||
? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */
|
||||
: 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on GFX8 and below) */
|
||||
? sgpr_null /* On GFX10 this is disabled by specifying SGPR_NULL */
|
||||
: 0; /* On GFX9, it is disabled by the SOE bit (and it's not present on
|
||||
GFX8 and below) */
|
||||
if (instr->operands.size() >= 2) {
|
||||
const Operand &op_off1 = instr->operands[1];
|
||||
const Operand& op_off1 = instr->operands[1];
|
||||
if (ctx.chip_class <= GFX9) {
|
||||
offset = op_off1.isConstant() ? op_off1.constantValue() : op_off1.physReg();
|
||||
} else {
|
||||
/* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an SGPR */
|
||||
/* GFX10 only supports constants in OFFSET, so put the operand in SOFFSET if it's an
|
||||
* SGPR */
|
||||
if (op_off1.isConstant()) {
|
||||
offset = op_off1.constantValue();
|
||||
} else {
|
||||
|
@ -263,8 +272,9 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
}
|
||||
|
||||
if (soe) {
|
||||
const Operand &op_off2 = instr->operands.back();
|
||||
assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant and an SGPR at the same time */
|
||||
const Operand& op_off2 = instr->operands.back();
|
||||
assert(ctx.chip_class >= GFX9); /* GFX8 and below don't support specifying a constant
|
||||
and an SGPR at the same time */
|
||||
assert(!op_off2.isConstant());
|
||||
soffset = op_off2.physReg();
|
||||
}
|
||||
|
@ -368,9 +378,13 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
encoding = 0;
|
||||
unsigned reg = !instr->definitions.empty() ? instr->definitions[0].physReg() : 0;
|
||||
encoding |= (0xFF & reg) << 24;
|
||||
reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0) ? instr->operands[2].physReg() : 0;
|
||||
reg = instr->operands.size() >= 3 && !(instr->operands[2].physReg() == m0)
|
||||
? instr->operands[2].physReg()
|
||||
: 0;
|
||||
encoding |= (0xFF & reg) << 16;
|
||||
reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0) ? instr->operands[1].physReg() : 0;
|
||||
reg = instr->operands.size() >= 2 && !(instr->operands[1].physReg() == m0)
|
||||
? instr->operands[1].physReg()
|
||||
: 0;
|
||||
encoding |= (0xFF & reg) << 8;
|
||||
encoding |= (0xFF & instr->operands[0].physReg());
|
||||
out.push_back(encoding);
|
||||
|
@ -402,7 +416,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
encoding |= instr->operands[2].physReg() << 24;
|
||||
encoding |= (mubuf.tfe ? 1 : 0) << 23;
|
||||
encoding |= (instr->operands[0].physReg() >> 2) << 16;
|
||||
unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg();
|
||||
unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg()
|
||||
: instr->definitions[0].physReg();
|
||||
encoding |= (0xFF & reg) << 8;
|
||||
encoding |= (0xFF & instr->operands[1].physReg());
|
||||
out.push_back(encoding);
|
||||
|
@ -435,7 +450,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
encoding |= (mtbuf.tfe ? 1 : 0) << 23;
|
||||
encoding |= (mtbuf.slc ? 1 : 0) << 22;
|
||||
encoding |= (instr->operands[0].physReg() >> 2) << 16;
|
||||
unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg() : instr->definitions[0].physReg();
|
||||
unsigned reg = instr->operands.size() > 3 ? instr->operands[3].physReg()
|
||||
: instr->definitions[0].physReg();
|
||||
encoding |= (0xFF & reg) << 8;
|
||||
encoding |= (0xFF & instr->operands[1].physReg());
|
||||
|
||||
|
@ -465,7 +481,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
encoding |= mimg.a16 ? 1 << 15 : 0;
|
||||
encoding |= mimg.da ? 1 << 14 : 0;
|
||||
} else {
|
||||
encoding |= mimg.r128 ? 1 << 15 : 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
|
||||
encoding |= mimg.r128 ? 1 << 15
|
||||
: 0; /* GFX10: A16 moved to 2nd word, R128 replaces it in 1st word */
|
||||
encoding |= nsa_dwords << 1;
|
||||
encoding |= mimg.dim << 3; /* GFX10: dimensionality instead of declare array */
|
||||
encoding |= mimg.dlc ? 1 << 7 : 0;
|
||||
|
@ -485,7 +502,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
assert(!mimg.d16 || ctx.chip_class >= GFX9);
|
||||
encoding |= mimg.d16 ? 1 << 31 : 0;
|
||||
if (ctx.chip_class >= GFX10) {
|
||||
encoding |= mimg.a16 ? 1 << 30 : 0; /* GFX10: A16 still exists, but is in a different place */
|
||||
/* GFX10: A16 still exists, but is in a different place */
|
||||
encoding |= mimg.a16 ? 1 << 30 : 0;
|
||||
}
|
||||
|
||||
out.push_back(encoding);
|
||||
|
@ -539,7 +557,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
assert(ctx.chip_class >= GFX10 || instr->operands[1].physReg() != 0x7F);
|
||||
assert(instr->format != Format::FLAT);
|
||||
encoding |= instr->operands[1].physReg() << 16;
|
||||
} else if (instr->format != Format::FLAT || ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */
|
||||
} else if (instr->format != Format::FLAT ||
|
||||
ctx.chip_class >= GFX10) { /* SADDR is actually used with FLAT on GFX10 */
|
||||
if (ctx.chip_class <= GFX9)
|
||||
encoding |= 0x7F << 16;
|
||||
else
|
||||
|
@ -611,7 +630,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
}
|
||||
encoding |= vop3.opsel << 11;
|
||||
for (unsigned i = 0; i < 3; i++)
|
||||
encoding |= vop3.abs[i] << (8+i);
|
||||
encoding |= vop3.abs[i] << (8 + i);
|
||||
if (instr->definitions.size() == 2)
|
||||
encoding |= instr->definitions[1].physReg() << 8;
|
||||
encoding |= (0xFF & instr->definitions[0].physReg());
|
||||
|
@ -625,7 +644,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
}
|
||||
encoding |= vop3.omod << 27;
|
||||
for (unsigned i = 0; i < 3; i++)
|
||||
encoding |= vop3.neg[i] << (29+i);
|
||||
encoding |= vop3.neg[i] << (29 + i);
|
||||
out.push_back(encoding);
|
||||
|
||||
} else if (instr->isVOP3P()) {
|
||||
|
@ -645,7 +664,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
encoding |= vop3.opsel_lo << 11;
|
||||
encoding |= ((vop3.opsel_hi & 0x4) ? 1 : 0) << 14;
|
||||
for (unsigned i = 0; i < 3; i++)
|
||||
encoding |= vop3.neg_hi[i] << (8+i);
|
||||
encoding |= vop3.neg_hi[i] << (8 + i);
|
||||
encoding |= (0xFF & instr->definitions[0].physReg());
|
||||
out.push_back(encoding);
|
||||
encoding = 0;
|
||||
|
@ -653,17 +672,17 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
encoding |= instr->operands[i].physReg() << (i * 9);
|
||||
encoding |= (vop3.opsel_hi & 0x3) << 27;
|
||||
for (unsigned i = 0; i < 3; i++)
|
||||
encoding |= vop3.neg_lo[i] << (29+i);
|
||||
encoding |= vop3.neg_lo[i] << (29 + i);
|
||||
out.push_back(encoding);
|
||||
|
||||
} else if (instr->isDPP()){
|
||||
} else if (instr->isDPP()) {
|
||||
assert(ctx.chip_class >= GFX8);
|
||||
DPP_instruction& dpp = instr->dpp();
|
||||
|
||||
/* first emit the instruction without the DPP operand */
|
||||
Operand dpp_op = instr->operands[0];
|
||||
instr->operands[0] = Operand(PhysReg{250}, v1);
|
||||
instr->format = (Format) ((uint16_t) instr->format & ~(uint16_t)Format::DPP);
|
||||
instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::DPP);
|
||||
emit_instruction(ctx, out, instr);
|
||||
uint32_t encoding = (0xF & dpp.row_mask) << 28;
|
||||
encoding |= (0xF & dpp.bank_mask) << 24;
|
||||
|
@ -684,7 +703,7 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
/* first emit the instruction without the SDWA operand */
|
||||
Operand sdwa_op = instr->operands[0];
|
||||
instr->operands[0] = Operand(PhysReg{249}, v1);
|
||||
instr->format = (Format) ((uint16_t) instr->format & ~(uint16_t)Format::SDWA);
|
||||
instr->format = (Format)((uint16_t)instr->format & ~(uint16_t)Format::SDWA);
|
||||
emit_instruction(ctx, out, instr);
|
||||
|
||||
uint32_t encoding = 0;
|
||||
|
@ -737,7 +756,8 @@ void emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
}
|
||||
}
|
||||
|
||||
void emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)
|
||||
void
|
||||
emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)
|
||||
{
|
||||
for (aco_ptr<Instruction>& instr : block.instructions) {
|
||||
#if 0
|
||||
|
@ -754,15 +774,15 @@ void emit_block(asm_context& ctx, std::vector<uint32_t>& out, Block& block)
|
|||
}
|
||||
}
|
||||
|
||||
void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
|
||||
void
|
||||
fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
|
||||
{
|
||||
bool exported = false;
|
||||
for (Block& block : program->blocks) {
|
||||
if (!(block.kind & block_kind_export_end))
|
||||
continue;
|
||||
std::vector<aco_ptr<Instruction>>::reverse_iterator it = block.instructions.rbegin();
|
||||
while ( it != block.instructions.rend())
|
||||
{
|
||||
while (it != block.instructions.rend()) {
|
||||
if ((*it)->isEXP()) {
|
||||
Export_instruction& exp = (*it)->exp();
|
||||
if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG) {
|
||||
|
@ -785,15 +805,18 @@ void fix_exports(asm_context& ctx, std::vector<uint32_t>& out, Program* program)
|
|||
|
||||
if (!exported) {
|
||||
/* Abort in order to avoid a GPU hang. */
|
||||
bool is_vertex_or_ngg = (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG);
|
||||
aco_err(program, "Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment");
|
||||
bool is_vertex_or_ngg =
|
||||
(program->stage.hw == HWStage::VS || program->stage.hw == HWStage::NGG);
|
||||
aco_err(program,
|
||||
"Missing export in %s shader:", is_vertex_or_ngg ? "vertex or NGG" : "fragment");
|
||||
aco_print_program(program, stderr);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
static void insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned insert_before,
|
||||
unsigned insert_count, const uint32_t *insert_data)
|
||||
static void
|
||||
insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned insert_before,
|
||||
unsigned insert_count, const uint32_t* insert_data)
|
||||
{
|
||||
out.insert(out.begin() + insert_before, insert_data, insert_data + insert_count);
|
||||
|
||||
|
@ -804,9 +827,9 @@ static void insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned i
|
|||
}
|
||||
|
||||
/* Find first branch after the inserted code */
|
||||
auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), [insert_before](const auto &branch) -> bool {
|
||||
return (unsigned)branch.first >= insert_before;
|
||||
});
|
||||
auto branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(),
|
||||
[insert_before](const auto& branch) -> bool
|
||||
{ return (unsigned)branch.first >= insert_before; });
|
||||
|
||||
/* Update the locations of branches */
|
||||
for (; branch_it != ctx.branches.end(); ++branch_it)
|
||||
|
@ -822,15 +845,21 @@ static void insert_code(asm_context& ctx, std::vector<uint32_t>& out, unsigned i
|
|||
}
|
||||
}
|
||||
|
||||
static void fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
|
||||
static void
|
||||
fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
|
||||
{
|
||||
/* Branches with an offset of 0x3f are buggy on GFX10, we workaround by inserting NOPs if needed. */
|
||||
/* Branches with an offset of 0x3f are buggy on GFX10,
|
||||
* we workaround by inserting NOPs if needed.
|
||||
*/
|
||||
bool gfx10_3f_bug = false;
|
||||
|
||||
do {
|
||||
auto buggy_branch_it = std::find_if(ctx.branches.begin(), ctx.branches.end(), [&ctx](const auto &branch) -> bool {
|
||||
return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) == 0x3f;
|
||||
});
|
||||
auto buggy_branch_it = std::find_if(
|
||||
ctx.branches.begin(), ctx.branches.end(),
|
||||
[&ctx](const auto& branch) -> bool {
|
||||
return ((int)ctx.program->blocks[branch.second->block].offset - branch.first - 1) ==
|
||||
0x3f;
|
||||
});
|
||||
|
||||
gfx10_3f_bug = buggy_branch_it != ctx.branches.end();
|
||||
|
||||
|
@ -842,7 +871,9 @@ static void fix_branches_gfx10(asm_context& ctx, std::vector<uint32_t>& out)
|
|||
} while (gfx10_3f_bug);
|
||||
}
|
||||
|
||||
void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards, std::vector<uint32_t>& out)
|
||||
void
|
||||
emit_long_jump(asm_context& ctx, SOPP_instruction* branch, bool backwards,
|
||||
std::vector<uint32_t>& out)
|
||||
{
|
||||
Builder bld(ctx.program);
|
||||
|
||||
|
@ -857,26 +888,13 @@ void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards,
|
|||
/* for conditional branches, skip the long jump if the condition is false */
|
||||
aco_opcode inv;
|
||||
switch (branch->opcode) {
|
||||
case aco_opcode::s_cbranch_scc0:
|
||||
inv = aco_opcode::s_cbranch_scc1;
|
||||
break;
|
||||
case aco_opcode::s_cbranch_scc1:
|
||||
inv = aco_opcode::s_cbranch_scc0;
|
||||
break;
|
||||
case aco_opcode::s_cbranch_vccz:
|
||||
inv = aco_opcode::s_cbranch_vccnz;
|
||||
break;
|
||||
case aco_opcode::s_cbranch_vccnz:
|
||||
inv = aco_opcode::s_cbranch_vccz;
|
||||
break;
|
||||
case aco_opcode::s_cbranch_execz:
|
||||
inv = aco_opcode::s_cbranch_execnz;
|
||||
break;
|
||||
case aco_opcode::s_cbranch_execnz:
|
||||
inv = aco_opcode::s_cbranch_execz;
|
||||
break;
|
||||
default:
|
||||
unreachable("Unhandled long jump.");
|
||||
case aco_opcode::s_cbranch_scc0: inv = aco_opcode::s_cbranch_scc1; break;
|
||||
case aco_opcode::s_cbranch_scc1: inv = aco_opcode::s_cbranch_scc0; break;
|
||||
case aco_opcode::s_cbranch_vccz: inv = aco_opcode::s_cbranch_vccnz; break;
|
||||
case aco_opcode::s_cbranch_vccnz: inv = aco_opcode::s_cbranch_vccz; break;
|
||||
case aco_opcode::s_cbranch_execz: inv = aco_opcode::s_cbranch_execnz; break;
|
||||
case aco_opcode::s_cbranch_execnz: inv = aco_opcode::s_cbranch_execz; break;
|
||||
default: unreachable("Unhandled long jump.");
|
||||
}
|
||||
instr.reset(bld.sopp(inv, -1, 7));
|
||||
emit_instruction(ctx, out, instr.get());
|
||||
|
@ -891,7 +909,9 @@ void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards,
|
|||
emit_instruction(ctx, out, instr.get());
|
||||
branch->pass_flags = out.size();
|
||||
|
||||
instr.reset(bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi, Operand(backwards ? UINT32_MAX : 0u)).instr);
|
||||
instr.reset(
|
||||
bld.sop2(aco_opcode::s_addc_u32, def_tmp_hi, op_tmp_hi, Operand(backwards ? UINT32_MAX : 0u))
|
||||
.instr);
|
||||
emit_instruction(ctx, out, instr.get());
|
||||
|
||||
/* restore SCC and clear the LSB of the new PC */
|
||||
|
@ -901,11 +921,13 @@ void emit_long_jump(asm_context& ctx, SOPP_instruction *branch, bool backwards,
|
|||
emit_instruction(ctx, out, instr.get());
|
||||
|
||||
/* create the s_setpc_b64 to jump */
|
||||
instr.reset(bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr);
|
||||
instr.reset(
|
||||
bld.sop1(aco_opcode::s_setpc_b64, Operand(branch->definitions[0].physReg(), s2)).instr);
|
||||
emit_instruction(ctx, out, instr.get());
|
||||
}
|
||||
|
||||
void fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
|
||||
void
|
||||
fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
|
||||
{
|
||||
bool repeat = false;
|
||||
do {
|
||||
|
@ -914,11 +936,12 @@ void fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
|
|||
if (ctx.chip_class == GFX10)
|
||||
fix_branches_gfx10(ctx, out);
|
||||
|
||||
for (std::pair<int, SOPP_instruction*> &branch : ctx.branches) {
|
||||
for (std::pair<int, SOPP_instruction*>& branch : ctx.branches) {
|
||||
int offset = (int)ctx.program->blocks[branch.second->block].offset - branch.first - 1;
|
||||
if ((offset < INT16_MIN || offset > INT16_MAX) && !branch.second->pass_flags) {
|
||||
std::vector<uint32_t> long_jump;
|
||||
bool backwards = ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first;
|
||||
bool backwards =
|
||||
ctx.program->blocks[branch.second->block].offset < (unsigned)branch.first;
|
||||
emit_long_jump(ctx, branch.second, backwards, long_jump);
|
||||
|
||||
out[branch.first] = long_jump[0];
|
||||
|
@ -934,13 +957,14 @@ void fix_branches(asm_context& ctx, std::vector<uint32_t>& out)
|
|||
out[branch.first + branch.second->pass_flags - 1] = offset * 4;
|
||||
} else {
|
||||
out[branch.first] &= 0xffff0000u;
|
||||
out[branch.first] |= (uint16_t) offset;
|
||||
out[branch.first] |= (uint16_t)offset;
|
||||
}
|
||||
}
|
||||
} while (repeat);
|
||||
}
|
||||
|
||||
void fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
|
||||
void
|
||||
fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
|
||||
{
|
||||
for (auto& constaddr : ctx.constaddrs) {
|
||||
constaddr_info& info = constaddr.second;
|
||||
|
@ -948,13 +972,12 @@ void fix_constaddrs(asm_context& ctx, std::vector<uint32_t>& out)
|
|||
}
|
||||
}
|
||||
|
||||
unsigned emit_program(Program* program,
|
||||
std::vector<uint32_t>& code)
|
||||
unsigned
|
||||
emit_program(Program* program, std::vector<uint32_t>& code)
|
||||
{
|
||||
asm_context ctx(program);
|
||||
|
||||
if (program->stage.hw == HWStage::VS ||
|
||||
program->stage.hw == HWStage::FS ||
|
||||
if (program->stage.hw == HWStage::VS || program->stage.hw == HWStage::FS ||
|
||||
program->stage.hw == HWStage::NGG)
|
||||
fix_exports(ctx, code, program);
|
||||
|
||||
|
@ -986,4 +1009,4 @@ unsigned emit_program(Program* program,
|
|||
return exec_size;
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace aco
|
||||
|
|
|
@ -40,7 +40,8 @@ struct dce_ctx {
|
|||
std::vector<uint16_t> uses;
|
||||
std::vector<std::vector<bool>> live;
|
||||
|
||||
dce_ctx(Program* program) : current_block(program->blocks.size() - 1), uses(program->peekAllocationId())
|
||||
dce_ctx(Program* program)
|
||||
: current_block(program->blocks.size() - 1), uses(program->peekAllocationId())
|
||||
{
|
||||
live.reserve(program->blocks.size());
|
||||
for (Block& block : program->blocks)
|
||||
|
@ -48,7 +49,8 @@ struct dce_ctx {
|
|||
}
|
||||
};
|
||||
|
||||
void process_block(dce_ctx& ctx, Block& block)
|
||||
void
|
||||
process_block(dce_ctx& ctx, Block& block)
|
||||
{
|
||||
std::vector<bool>& live = ctx.live[block.index];
|
||||
assert(live.size() == block.instructions.size());
|
||||
|
@ -72,23 +74,26 @@ void process_block(dce_ctx& ctx, Block& block)
|
|||
|
||||
if (process_predecessors) {
|
||||
for (unsigned pred_idx : block.linear_preds)
|
||||
ctx.current_block = std::max(ctx.current_block, (int) pred_idx);
|
||||
ctx.current_block = std::max(ctx.current_block, (int)pred_idx);
|
||||
}
|
||||
}
|
||||
|
||||
} /* end namespace */
|
||||
|
||||
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr)
|
||||
bool
|
||||
is_dead(const std::vector<uint16_t>& uses, Instruction* instr)
|
||||
{
|
||||
if (instr->definitions.empty() || instr->isBranch())
|
||||
return false;
|
||||
if (std::any_of(instr->definitions.begin(), instr->definitions.end(),
|
||||
[&uses] (const Definition& def) { return !def.isTemp() || uses[def.tempId()];}))
|
||||
[&uses](const Definition& def) { return !def.isTemp() || uses[def.tempId()]; }))
|
||||
return false;
|
||||
return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel));
|
||||
}
|
||||
|
||||
std::vector<uint16_t> dead_code_analysis(Program *program) {
|
||||
std::vector<uint16_t>
|
||||
dead_code_analysis(Program* program)
|
||||
{
|
||||
|
||||
dce_ctx ctx(program);
|
||||
|
||||
|
@ -105,5 +110,4 @@ std::vector<uint16_t> dead_code_analysis(Program *program) {
|
|||
return ctx.uses;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} // namespace aco
|
||||
|
|
|
@ -38,7 +38,8 @@
|
|||
|
||||
namespace aco {
|
||||
|
||||
void dominator_tree(Program* program)
|
||||
void
|
||||
dominator_tree(Program* program)
|
||||
{
|
||||
program->blocks[0].logical_idom = 0;
|
||||
program->blocks[0].linear_idom = 0;
|
||||
|
@ -48,7 +49,7 @@ void dominator_tree(Program* program)
|
|||
int new_logical_idom = -1;
|
||||
int new_linear_idom = -1;
|
||||
for (unsigned pred_idx : block.logical_preds) {
|
||||
if ((int) program->blocks[pred_idx].logical_idom == -1)
|
||||
if ((int)program->blocks[pred_idx].logical_idom == -1)
|
||||
continue;
|
||||
|
||||
if (new_logical_idom == -1) {
|
||||
|
@ -56,16 +57,16 @@ void dominator_tree(Program* program)
|
|||
continue;
|
||||
}
|
||||
|
||||
while ((int) pred_idx != new_logical_idom) {
|
||||
if ((int) pred_idx > new_logical_idom)
|
||||
while ((int)pred_idx != new_logical_idom) {
|
||||
if ((int)pred_idx > new_logical_idom)
|
||||
pred_idx = program->blocks[pred_idx].logical_idom;
|
||||
if ((int) pred_idx < new_logical_idom)
|
||||
if ((int)pred_idx < new_logical_idom)
|
||||
new_logical_idom = program->blocks[new_logical_idom].logical_idom;
|
||||
}
|
||||
}
|
||||
|
||||
for (unsigned pred_idx : block.linear_preds) {
|
||||
if ((int) program->blocks[pred_idx].linear_idom == -1)
|
||||
if ((int)program->blocks[pred_idx].linear_idom == -1)
|
||||
continue;
|
||||
|
||||
if (new_linear_idom == -1) {
|
||||
|
@ -73,10 +74,10 @@ void dominator_tree(Program* program)
|
|||
continue;
|
||||
}
|
||||
|
||||
while ((int) pred_idx != new_linear_idom) {
|
||||
if ((int) pred_idx > new_linear_idom)
|
||||
while ((int)pred_idx != new_linear_idom) {
|
||||
if ((int)pred_idx > new_linear_idom)
|
||||
pred_idx = program->blocks[pred_idx].linear_idom;
|
||||
if ((int) pred_idx < new_linear_idom)
|
||||
if ((int)pred_idx < new_linear_idom)
|
||||
new_linear_idom = program->blocks[new_linear_idom].linear_idom;
|
||||
}
|
||||
}
|
||||
|
@ -86,5 +87,5 @@ void dominator_tree(Program* program)
|
|||
}
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace aco
|
||||
#endif
|
||||
|
|
|
@ -31,15 +31,15 @@ namespace aco {
|
|||
namespace {
|
||||
|
||||
/* there can also be LDS and VALU clauses, but I don't see how those are interesting */
|
||||
enum clause_type
|
||||
{
|
||||
enum clause_type {
|
||||
clause_vmem,
|
||||
clause_flat,
|
||||
clause_smem,
|
||||
clause_other,
|
||||
};
|
||||
|
||||
void emit_clause(Builder& bld, unsigned num_instrs, aco_ptr<Instruction> *instrs)
|
||||
void
|
||||
emit_clause(Builder& bld, unsigned num_instrs, aco_ptr<Instruction>* instrs)
|
||||
{
|
||||
unsigned start = 0;
|
||||
|
||||
|
@ -61,7 +61,8 @@ void emit_clause(Builder& bld, unsigned num_instrs, aco_ptr<Instruction> *instrs
|
|||
|
||||
} /* end namespace */
|
||||
|
||||
void form_hard_clauses(Program *program)
|
||||
void
|
||||
form_hard_clauses(Program* program)
|
||||
{
|
||||
for (Block& block : program->blocks) {
|
||||
unsigned num_instrs = 0;
|
||||
|
@ -77,7 +78,8 @@ void form_hard_clauses(Program *program)
|
|||
|
||||
clause_type type = clause_other;
|
||||
if (instr->isVMEM() && !instr->operands.empty()) {
|
||||
if (program->chip_class == GFX10 && instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 0)
|
||||
if (program->chip_class == GFX10 && instr->isMIMG() &&
|
||||
get_mimg_nsa_dwords(instr.get()) > 0)
|
||||
type = clause_other;
|
||||
else
|
||||
type = clause_vmem;
|
||||
|
@ -109,4 +111,4 @@ void form_hard_clauses(Program *program)
|
|||
block.instructions = std::move(new_instructions);
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace aco
|
||||
|
|
|
@ -34,12 +34,15 @@ namespace aco {
|
|||
namespace {
|
||||
|
||||
struct NOP_ctx_gfx6 {
|
||||
void join(const NOP_ctx_gfx6 &other) {
|
||||
set_vskip_mode_then_vector = MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector);
|
||||
void join(const NOP_ctx_gfx6& other)
|
||||
{
|
||||
set_vskip_mode_then_vector =
|
||||
MAX2(set_vskip_mode_then_vector, other.set_vskip_mode_then_vector);
|
||||
valu_wr_vcc_then_vccz = MAX2(valu_wr_vcc_then_vccz, other.valu_wr_vcc_then_vccz);
|
||||
valu_wr_exec_then_execz = MAX2(valu_wr_exec_then_execz, other.valu_wr_exec_then_execz);
|
||||
valu_wr_vcc_then_div_fmas = MAX2(valu_wr_vcc_then_div_fmas, other.valu_wr_vcc_then_div_fmas);
|
||||
salu_wr_m0_then_gds_msg_ttrace = MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace);
|
||||
salu_wr_m0_then_gds_msg_ttrace =
|
||||
MAX2(salu_wr_m0_then_gds_msg_ttrace, other.salu_wr_m0_then_gds_msg_ttrace);
|
||||
valu_wr_exec_then_dpp = MAX2(valu_wr_exec_then_dpp, other.valu_wr_exec_then_dpp);
|
||||
salu_wr_m0_then_lds = MAX2(salu_wr_m0_then_lds, other.salu_wr_m0_then_lds);
|
||||
salu_wr_m0_then_moverel = MAX2(salu_wr_m0_then_moverel, other.salu_wr_m0_then_moverel);
|
||||
|
@ -53,23 +56,21 @@ struct NOP_ctx_gfx6 {
|
|||
}
|
||||
}
|
||||
|
||||
bool operator==(const NOP_ctx_gfx6 &other)
|
||||
bool operator==(const NOP_ctx_gfx6& other)
|
||||
{
|
||||
return
|
||||
set_vskip_mode_then_vector == other.set_vskip_mode_then_vector &&
|
||||
valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz &&
|
||||
valu_wr_exec_then_execz == other.valu_wr_exec_then_execz &&
|
||||
valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas &&
|
||||
vmem_store_then_wr_data == other.vmem_store_then_wr_data &&
|
||||
salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace &&
|
||||
valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp &&
|
||||
salu_wr_m0_then_lds == other.salu_wr_m0_then_lds &&
|
||||
salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel &&
|
||||
setreg_then_getsetreg == other.setreg_then_getsetreg &&
|
||||
smem_clause == other.smem_clause &&
|
||||
smem_write == other.smem_write &&
|
||||
BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) &&
|
||||
BITSET_EQUAL(smem_clause_write, other.smem_clause_write);
|
||||
return set_vskip_mode_then_vector == other.set_vskip_mode_then_vector &&
|
||||
valu_wr_vcc_then_vccz == other.valu_wr_vcc_then_vccz &&
|
||||
valu_wr_exec_then_execz == other.valu_wr_exec_then_execz &&
|
||||
valu_wr_vcc_then_div_fmas == other.valu_wr_vcc_then_div_fmas &&
|
||||
vmem_store_then_wr_data == other.vmem_store_then_wr_data &&
|
||||
salu_wr_m0_then_gds_msg_ttrace == other.salu_wr_m0_then_gds_msg_ttrace &&
|
||||
valu_wr_exec_then_dpp == other.valu_wr_exec_then_dpp &&
|
||||
salu_wr_m0_then_lds == other.salu_wr_m0_then_lds &&
|
||||
salu_wr_m0_then_moverel == other.salu_wr_m0_then_moverel &&
|
||||
setreg_then_getsetreg == other.setreg_then_getsetreg &&
|
||||
smem_clause == other.smem_clause && smem_write == other.smem_write &&
|
||||
BITSET_EQUAL(smem_clause_read_write, other.smem_clause_read_write) &&
|
||||
BITSET_EQUAL(smem_clause_write, other.smem_clause_write);
|
||||
}
|
||||
|
||||
void add_wait_states(unsigned amount)
|
||||
|
@ -154,7 +155,8 @@ struct NOP_ctx_gfx10 {
|
|||
std::bitset<128> sgprs_read_by_VMEM;
|
||||
std::bitset<128> sgprs_read_by_SMEM;
|
||||
|
||||
void join(const NOP_ctx_gfx10 &other) {
|
||||
void join(const NOP_ctx_gfx10& other)
|
||||
{
|
||||
has_VOPC |= other.has_VOPC;
|
||||
has_nonVALU_exec_read |= other.has_nonVALU_exec_read;
|
||||
has_VMEM |= other.has_VMEM;
|
||||
|
@ -167,23 +169,19 @@ struct NOP_ctx_gfx10 {
|
|||
sgprs_read_by_SMEM |= other.sgprs_read_by_SMEM;
|
||||
}
|
||||
|
||||
bool operator==(const NOP_ctx_gfx10 &other)
|
||||
bool operator==(const NOP_ctx_gfx10& other)
|
||||
{
|
||||
return
|
||||
has_VOPC == other.has_VOPC &&
|
||||
has_nonVALU_exec_read == other.has_nonVALU_exec_read &&
|
||||
has_VMEM == other.has_VMEM &&
|
||||
has_branch_after_VMEM == other.has_branch_after_VMEM &&
|
||||
has_DS == other.has_DS &&
|
||||
has_branch_after_DS == other.has_branch_after_DS &&
|
||||
has_NSA_MIMG == other.has_NSA_MIMG &&
|
||||
has_writelane == other.has_writelane &&
|
||||
sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
|
||||
sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
|
||||
return has_VOPC == other.has_VOPC && has_nonVALU_exec_read == other.has_nonVALU_exec_read &&
|
||||
has_VMEM == other.has_VMEM && has_branch_after_VMEM == other.has_branch_after_VMEM &&
|
||||
has_DS == other.has_DS && has_branch_after_DS == other.has_branch_after_DS &&
|
||||
has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane &&
|
||||
sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
|
||||
sgprs_read_by_SMEM == other.sgprs_read_by_SMEM;
|
||||
}
|
||||
};
|
||||
|
||||
int get_wait_states(aco_ptr<Instruction>& instr)
|
||||
int
|
||||
get_wait_states(aco_ptr<Instruction>& instr)
|
||||
{
|
||||
if (instr->opcode == aco_opcode::s_nop)
|
||||
return instr->sopp().imm + 1;
|
||||
|
@ -193,16 +191,16 @@ int get_wait_states(aco_ptr<Instruction>& instr)
|
|||
return 1;
|
||||
}
|
||||
|
||||
bool regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
|
||||
bool
|
||||
regs_intersect(PhysReg a_reg, unsigned a_size, PhysReg b_reg, unsigned b_size)
|
||||
{
|
||||
return a_reg > b_reg ?
|
||||
(a_reg - b_reg < b_size) :
|
||||
(b_reg - a_reg < a_size);
|
||||
return a_reg > b_reg ? (a_reg - b_reg < b_size) : (b_reg - a_reg < a_size);
|
||||
}
|
||||
|
||||
template <bool Valu, bool Vintrp, bool Salu>
|
||||
int handle_raw_hazard_internal(Program *program, Block *block,
|
||||
int nops_needed, PhysReg reg, uint32_t mask)
|
||||
int
|
||||
handle_raw_hazard_internal(Program* program, Block* block, int nops_needed, PhysReg reg,
|
||||
uint32_t mask)
|
||||
{
|
||||
unsigned mask_size = util_last_bit(mask);
|
||||
for (int pred_idx = block->instructions.size() - 1; pred_idx >= 0; pred_idx--) {
|
||||
|
@ -217,10 +215,8 @@ int handle_raw_hazard_internal(Program *program, Block *block,
|
|||
}
|
||||
}
|
||||
|
||||
bool is_hazard = writemask != 0 &&
|
||||
((pred->isVALU() && Valu) ||
|
||||
(pred->isVINTRP() && Vintrp) ||
|
||||
(pred->isSALU() && Salu));
|
||||
bool is_hazard = writemask != 0 && ((pred->isVALU() && Valu) ||
|
||||
(pred->isVINTRP() && Vintrp) || (pred->isSALU() && Salu));
|
||||
if (is_hazard)
|
||||
return nops_needed;
|
||||
|
||||
|
@ -238,17 +234,19 @@ int handle_raw_hazard_internal(Program *program, Block *block,
|
|||
* huge value. */
|
||||
for (unsigned lin_pred : block->linear_preds) {
|
||||
res = std::max(res, handle_raw_hazard_internal<Valu, Vintrp, Salu>(
|
||||
program, &program->blocks[lin_pred], nops_needed, reg, mask));
|
||||
program, &program->blocks[lin_pred], nops_needed, reg, mask));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <bool Valu, bool Vintrp, bool Salu>
|
||||
void handle_raw_hazard(Program *program, Block *cur_block, int *NOPs, int min_states, Operand op)
|
||||
void
|
||||
handle_raw_hazard(Program* program, Block* cur_block, int* NOPs, int min_states, Operand op)
|
||||
{
|
||||
if (*NOPs >= min_states)
|
||||
return;
|
||||
int res = handle_raw_hazard_internal<Valu, Vintrp, Salu>(program, cur_block, min_states, op.physReg(), u_bit_consecutive(0, op.size()));
|
||||
int res = handle_raw_hazard_internal<Valu, Vintrp, Salu>(
|
||||
program, cur_block, min_states, op.physReg(), u_bit_consecutive(0, op.size()));
|
||||
*NOPs = MAX2(*NOPs, res);
|
||||
}
|
||||
|
||||
|
@ -256,7 +254,9 @@ static auto handle_valu_then_read_hazard = handle_raw_hazard<true, true, false>;
|
|||
static auto handle_vintrp_then_read_hazard = handle_raw_hazard<false, true, false>;
|
||||
static auto handle_valu_salu_then_read_hazard = handle_raw_hazard<true, true, true>;
|
||||
|
||||
void set_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
|
||||
void
|
||||
set_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
|
||||
{
|
||||
unsigned end = start + size - 1;
|
||||
unsigned start_mod = start % BITSET_WORDBITS;
|
||||
if (start_mod + size <= BITSET_WORDBITS) {
|
||||
|
@ -268,7 +268,9 @@ void set_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
|
|||
}
|
||||
}
|
||||
|
||||
bool test_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
|
||||
bool
|
||||
test_bitset_range(BITSET_WORD* words, unsigned start, unsigned size)
|
||||
{
|
||||
unsigned end = start + size - 1;
|
||||
unsigned start_mod = start % BITSET_WORDBITS;
|
||||
if (start_mod + size <= BITSET_WORDBITS) {
|
||||
|
@ -291,18 +293,21 @@ bool test_bitset_range(BITSET_WORD *words, unsigned start, unsigned size) {
|
|||
*
|
||||
* SMEM clauses are only present on GFX8+, and only matter when XNACK is set.
|
||||
*/
|
||||
void handle_smem_clause_hazards(Program *program, NOP_ctx_gfx6 &ctx,
|
||||
aco_ptr<Instruction>& instr, int *NOPs)
|
||||
void
|
||||
handle_smem_clause_hazards(Program* program, NOP_ctx_gfx6& ctx, aco_ptr<Instruction>& instr,
|
||||
int* NOPs)
|
||||
{
|
||||
/* break off from previous SMEM clause if needed */
|
||||
if (!*NOPs & (ctx.smem_clause || ctx.smem_write)) {
|
||||
/* Don't allow clauses with store instructions since the clause's
|
||||
* instructions may use the same address. */
|
||||
if (ctx.smem_write || instr->definitions.empty() || instr_info.is_atomic[(unsigned)instr->opcode]) {
|
||||
if (ctx.smem_write || instr->definitions.empty() ||
|
||||
instr_info.is_atomic[(unsigned)instr->opcode]) {
|
||||
*NOPs = 1;
|
||||
} else if (program->dev.xnack_enabled) {
|
||||
for (Operand op : instr->operands) {
|
||||
if (!op.isConstant() && test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
|
||||
if (!op.isConstant() &&
|
||||
test_bitset_range(ctx.smem_clause_write, op.physReg(), op.size())) {
|
||||
*NOPs = 1;
|
||||
break;
|
||||
}
|
||||
|
@ -316,8 +321,10 @@ void handle_smem_clause_hazards(Program *program, NOP_ctx_gfx6 &ctx,
|
|||
}
|
||||
|
||||
/* TODO: we don't handle accessing VCC using the actual SGPR instead of using the alias */
|
||||
void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &ctx,
|
||||
aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& new_instructions)
|
||||
void
|
||||
handle_instruction_gfx6(Program* program, Block* cur_block, NOP_ctx_gfx6& ctx,
|
||||
aco_ptr<Instruction>& instr,
|
||||
std::vector<aco_ptr<Instruction>>& new_instructions)
|
||||
{
|
||||
/* check hazards */
|
||||
int NOPs = 0;
|
||||
|
@ -343,14 +350,17 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
|
|||
|
||||
handle_smem_clause_hazards(program, ctx, instr, &NOPs);
|
||||
} else if (instr->isSALU()) {
|
||||
if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
|
||||
if (instr->opcode == aco_opcode::s_setreg_b32 ||
|
||||
instr->opcode == aco_opcode::s_setreg_imm32_b32 ||
|
||||
instr->opcode == aco_opcode::s_getreg_b32) {
|
||||
NOPs = MAX2(NOPs, ctx.setreg_then_getsetreg);
|
||||
}
|
||||
|
||||
if (program->chip_class == GFX9) {
|
||||
if (instr->opcode == aco_opcode::s_movrels_b32 || instr->opcode == aco_opcode::s_movrels_b64 ||
|
||||
instr->opcode == aco_opcode::s_movreld_b32 || instr->opcode == aco_opcode::s_movreld_b64) {
|
||||
if (instr->opcode == aco_opcode::s_movrels_b32 ||
|
||||
instr->opcode == aco_opcode::s_movrels_b64 ||
|
||||
instr->opcode == aco_opcode::s_movreld_b32 ||
|
||||
instr->opcode == aco_opcode::s_movreld_b64) {
|
||||
NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_moverel);
|
||||
}
|
||||
}
|
||||
|
@ -398,7 +408,8 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
|
|||
handle_vintrp_then_read_hazard(program, cur_block, &NOPs, 1, instr->operands[0]);
|
||||
}
|
||||
|
||||
if (instr->opcode == aco_opcode::v_div_fmas_f32 || instr->opcode == aco_opcode::v_div_fmas_f64)
|
||||
if (instr->opcode == aco_opcode::v_div_fmas_f32 ||
|
||||
instr->opcode == aco_opcode::v_div_fmas_f64)
|
||||
NOPs = MAX2(NOPs, ctx.valu_wr_vcc_then_div_fmas);
|
||||
} else if (instr->isVMEM() || instr->isFlatLike()) {
|
||||
/* If the VALU writes the SGPR that is used by a VMEM, the user must add five wait states. */
|
||||
|
@ -412,13 +423,11 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
|
|||
NOPs = MAX2(NOPs, ctx.set_vskip_mode_then_vector);
|
||||
|
||||
if (program->chip_class == GFX9) {
|
||||
bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) &&
|
||||
instr->flatlike().lds;
|
||||
if (instr->isVINTRP() ||
|
||||
bool lds_scratch_global = (instr->isScratch() || instr->isGlobal()) && instr->flatlike().lds;
|
||||
if (instr->isVINTRP() || lds_scratch_global ||
|
||||
instr->opcode == aco_opcode::ds_read_addtid_b32 ||
|
||||
instr->opcode == aco_opcode::ds_write_addtid_b32 ||
|
||||
instr->opcode == aco_opcode::buffer_store_lds_dword ||
|
||||
lds_scratch_global) {
|
||||
instr->opcode == aco_opcode::buffer_store_lds_dword) {
|
||||
NOPs = MAX2(NOPs, ctx.salu_wr_m0_then_lds);
|
||||
}
|
||||
}
|
||||
|
@ -428,7 +437,8 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
|
|||
// TODO: try to schedule the NOP-causing instruction up to reduce the number of stall cycles
|
||||
if (NOPs) {
|
||||
/* create NOP */
|
||||
aco_ptr<SOPP_instruction> nop{create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
|
||||
aco_ptr<SOPP_instruction> nop{
|
||||
create_instruction<SOPP_instruction>(aco_opcode::s_nop, Format::SOPP, 0, 0)};
|
||||
nop->imm = NOPs - 1;
|
||||
nop->block = -1;
|
||||
new_instructions.emplace_back(std::move(nop));
|
||||
|
@ -485,7 +495,8 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
|
|||
ctx.salu_wr_m0_then_lds = 1;
|
||||
ctx.salu_wr_m0_then_moverel = 1;
|
||||
}
|
||||
} else if (instr->opcode == aco_opcode::s_setreg_b32 || instr->opcode == aco_opcode::s_setreg_imm32_b32) {
|
||||
} else if (instr->opcode == aco_opcode::s_setreg_b32 ||
|
||||
instr->opcode == aco_opcode::s_setreg_imm32_b32) {
|
||||
SOPK_instruction& sopk = instr->sopk();
|
||||
unsigned offset = (sopk.imm >> 6) & 0x1f;
|
||||
unsigned size = ((sopk.imm >> 11) & 0x1f) + 1;
|
||||
|
@ -497,19 +508,16 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
|
|||
}
|
||||
} else if (instr->isVMEM() || instr->isFlatLike()) {
|
||||
/* >64-bit MUBUF/MTBUF store with a constant in SOFFSET */
|
||||
bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) &&
|
||||
instr->operands.size() == 4 &&
|
||||
instr->operands[3].size() > 2 &&
|
||||
instr->operands[2].physReg() >= 128;
|
||||
/* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit store) */
|
||||
bool consider_buf = (instr->isMUBUF() || instr->isMTBUF()) && instr->operands.size() == 4 &&
|
||||
instr->operands[3].size() > 2 && instr->operands[2].physReg() >= 128;
|
||||
/* MIMG store with a 128-bit T# with more than two bits set in dmask (making it a >64-bit
|
||||
* store) */
|
||||
bool consider_mimg = instr->isMIMG() &&
|
||||
instr->operands[1].regClass().type() == RegType::vgpr &&
|
||||
instr->operands[1].size() > 2 &&
|
||||
instr->operands[0].size() == 4;
|
||||
instr->operands[1].size() > 2 && instr->operands[0].size() == 4;
|
||||
/* FLAT/GLOBAL/SCRATCH store with >64-bit data */
|
||||
bool consider_flat = instr->isFlatLike() &&
|
||||
instr->operands.size() == 3 &&
|
||||
instr->operands[2].size() > 2;
|
||||
bool consider_flat =
|
||||
instr->isFlatLike() && instr->operands.size() == 3 && instr->operands[2].size() > 2;
|
||||
if (consider_buf || consider_mimg || consider_flat) {
|
||||
PhysReg wrdata = instr->operands[consider_flat ? 2 : 3].physReg();
|
||||
unsigned size = instr->operands[consider_flat ? 2 : 3].size();
|
||||
|
@ -520,22 +528,26 @@ void handle_instruction_gfx6(Program *program, Block *cur_block, NOP_ctx_gfx6 &c
|
|||
}
|
||||
|
||||
template <std::size_t N>
|
||||
bool check_written_regs(const aco_ptr<Instruction> &instr, const std::bitset<N> &check_regs)
|
||||
bool
|
||||
check_written_regs(const aco_ptr<Instruction>& instr, const std::bitset<N>& check_regs)
|
||||
{
|
||||
return std::any_of(instr->definitions.begin(), instr->definitions.end(), [&check_regs](const Definition &def) -> bool {
|
||||
bool writes_any = false;
|
||||
for (unsigned i = 0; i < def.size(); i++) {
|
||||
unsigned def_reg = def.physReg() + i;
|
||||
writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
|
||||
}
|
||||
return writes_any;
|
||||
});
|
||||
return std::any_of(instr->definitions.begin(), instr->definitions.end(),
|
||||
[&check_regs](const Definition& def) -> bool
|
||||
{
|
||||
bool writes_any = false;
|
||||
for (unsigned i = 0; i < def.size(); i++) {
|
||||
unsigned def_reg = def.physReg() + i;
|
||||
writes_any |= def_reg < check_regs.size() && check_regs[def_reg];
|
||||
}
|
||||
return writes_any;
|
||||
});
|
||||
}
|
||||
|
||||
template <std::size_t N>
|
||||
void mark_read_regs(const aco_ptr<Instruction> &instr, std::bitset<N> &reg_reads)
|
||||
void
|
||||
mark_read_regs(const aco_ptr<Instruction>& instr, std::bitset<N>& reg_reads)
|
||||
{
|
||||
for (const Operand &op : instr->operands) {
|
||||
for (const Operand& op : instr->operands) {
|
||||
for (unsigned i = 0; i < op.size(); i++) {
|
||||
unsigned reg = op.physReg() + i;
|
||||
if (reg < reg_reads.size())
|
||||
|
@ -544,7 +556,8 @@ void mark_read_regs(const aco_ptr<Instruction> &instr, std::bitset<N> &reg_reads
|
|||
}
|
||||
}
|
||||
|
||||
bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
|
||||
bool
|
||||
VALU_writes_sgpr(aco_ptr<Instruction>& instr)
|
||||
{
|
||||
if (instr->isVOPC())
|
||||
return true;
|
||||
|
@ -557,24 +570,26 @@ bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
|
|||
return false;
|
||||
}
|
||||
|
||||
bool instr_writes_exec(const aco_ptr<Instruction>& instr)
|
||||
bool
|
||||
instr_writes_exec(const aco_ptr<Instruction>& instr)
|
||||
{
|
||||
return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool {
|
||||
return def.physReg() == exec_lo || def.physReg() == exec_hi;
|
||||
});
|
||||
return std::any_of(instr->definitions.begin(), instr->definitions.end(),
|
||||
[](const Definition& def) -> bool
|
||||
{ return def.physReg() == exec_lo || def.physReg() == exec_hi; });
|
||||
}
|
||||
|
||||
bool instr_writes_sgpr(const aco_ptr<Instruction>& instr)
|
||||
bool
|
||||
instr_writes_sgpr(const aco_ptr<Instruction>& instr)
|
||||
{
|
||||
return std::any_of(instr->definitions.begin(), instr->definitions.end(), [](const Definition &def) -> bool {
|
||||
return def.getTemp().type() == RegType::sgpr;
|
||||
});
|
||||
return std::any_of(instr->definitions.begin(), instr->definitions.end(),
|
||||
[](const Definition& def) -> bool
|
||||
{ return def.getTemp().type() == RegType::sgpr; });
|
||||
}
|
||||
|
||||
inline bool instr_is_branch(const aco_ptr<Instruction>& instr)
|
||||
inline bool
|
||||
instr_is_branch(const aco_ptr<Instruction>& instr)
|
||||
{
|
||||
return instr->opcode == aco_opcode::s_branch ||
|
||||
instr->opcode == aco_opcode::s_cbranch_scc0 ||
|
||||
return instr->opcode == aco_opcode::s_branch || instr->opcode == aco_opcode::s_cbranch_scc0 ||
|
||||
instr->opcode == aco_opcode::s_cbranch_scc1 ||
|
||||
instr->opcode == aco_opcode::s_cbranch_vccz ||
|
||||
instr->opcode == aco_opcode::s_cbranch_vccnz ||
|
||||
|
@ -586,19 +601,20 @@ inline bool instr_is_branch(const aco_ptr<Instruction>& instr)
|
|||
instr->opcode == aco_opcode::s_cbranch_cdbgsys_and_user ||
|
||||
instr->opcode == aco_opcode::s_subvector_loop_begin ||
|
||||
instr->opcode == aco_opcode::s_subvector_loop_end ||
|
||||
instr->opcode == aco_opcode::s_setpc_b64 ||
|
||||
instr->opcode == aco_opcode::s_swappc_b64 ||
|
||||
instr->opcode == aco_opcode::s_getpc_b64 ||
|
||||
instr->opcode == aco_opcode::s_call_b64;
|
||||
instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 ||
|
||||
instr->opcode == aco_opcode::s_getpc_b64 || instr->opcode == aco_opcode::s_call_b64;
|
||||
}
|
||||
|
||||
void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10 &ctx,
|
||||
aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& new_instructions)
|
||||
void
|
||||
handle_instruction_gfx10(Program* program, Block* cur_block, NOP_ctx_gfx10& ctx,
|
||||
aco_ptr<Instruction>& instr,
|
||||
std::vector<aco_ptr<Instruction>>& new_instructions)
|
||||
{
|
||||
//TODO: s_dcache_inv needs to be in it's own group on GFX10
|
||||
// TODO: s_dcache_inv needs to be in it's own group on GFX10
|
||||
|
||||
/* VMEMtoScalarWriteHazard
|
||||
* Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)" in-between.
|
||||
* Handle EXEC/M0/SGPR write following a VMEM instruction without a VALU or "waitcnt vmcnt(0)"
|
||||
* in-between.
|
||||
*/
|
||||
if (instr->isVMEM() || instr->isFlatLike() || instr->isDS()) {
|
||||
/* Remember all SGPRs that are read by the VMEM instruction */
|
||||
|
@ -624,7 +640,8 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
|
|||
ctx.sgprs_read_by_VMEM.reset();
|
||||
|
||||
/* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
|
||||
aco_ptr<SOPP_instruction> depctr{create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
|
||||
aco_ptr<SOPP_instruction> depctr{
|
||||
create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
|
||||
depctr->imm = 0xffe3;
|
||||
depctr->block = -1;
|
||||
new_instructions.emplace_back(std::move(depctr));
|
||||
|
@ -639,13 +656,13 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
|
|||
*/
|
||||
if (instr->isVOPC()) {
|
||||
ctx.has_VOPC = true;
|
||||
} else if (ctx.has_VOPC &&
|
||||
(instr->opcode == aco_opcode::v_permlane16_b32 ||
|
||||
instr->opcode == aco_opcode::v_permlanex16_b32)) {
|
||||
} else if (ctx.has_VOPC && (instr->opcode == aco_opcode::v_permlane16_b32 ||
|
||||
instr->opcode == aco_opcode::v_permlanex16_b32)) {
|
||||
ctx.has_VOPC = false;
|
||||
|
||||
/* v_nop would be discarded by SQ, so use v_mov with the first operand of the permlane */
|
||||
aco_ptr<VOP1_instruction> v_mov{create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)};
|
||||
aco_ptr<VOP1_instruction> v_mov{
|
||||
create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, Format::VOP1, 1, 1)};
|
||||
v_mov->definitions[0] = Definition(instr->operands[0].physReg(), v1);
|
||||
v_mov->operands[0] = Operand(instr->operands[0].physReg(), v1);
|
||||
new_instructions.emplace_back(std::move(v_mov));
|
||||
|
@ -663,7 +680,8 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
|
|||
ctx.has_nonVALU_exec_read = false;
|
||||
|
||||
/* Insert s_waitcnt_depctr instruction with magic imm to mitigate the problem */
|
||||
aco_ptr<SOPP_instruction> depctr{create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
|
||||
aco_ptr<SOPP_instruction> depctr{
|
||||
create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt_depctr, Format::SOPP, 0, 0)};
|
||||
depctr->imm = 0xfffe;
|
||||
depctr->block = -1;
|
||||
new_instructions.emplace_back(std::move(depctr));
|
||||
|
@ -689,7 +707,8 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
|
|||
ctx.sgprs_read_by_SMEM.reset();
|
||||
|
||||
/* Insert s_mov to mitigate the problem */
|
||||
aco_ptr<SOP1_instruction> s_mov{create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)};
|
||||
aco_ptr<SOP1_instruction> s_mov{
|
||||
create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, Format::SOP1, 1, 1)};
|
||||
s_mov->definitions[0] = Definition(sgpr_null, s1);
|
||||
s_mov->operands[0] = Operand(0u);
|
||||
new_instructions.emplace_back(std::move(s_mov));
|
||||
|
@ -738,14 +757,16 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
|
|||
ctx.has_VMEM = ctx.has_branch_after_VMEM = ctx.has_DS = ctx.has_branch_after_DS = false;
|
||||
|
||||
/* Insert s_waitcnt_vscnt to mitigate the problem */
|
||||
aco_ptr<SOPK_instruction> wait{create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)};
|
||||
aco_ptr<SOPK_instruction> wait{
|
||||
create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1)};
|
||||
wait->definitions[0] = Definition(sgpr_null, s1);
|
||||
wait->imm = 0;
|
||||
new_instructions.emplace_back(std::move(wait));
|
||||
}
|
||||
|
||||
/* NSAToVMEMBug
|
||||
* Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] != 0).
|
||||
* Handles NSA MIMG (4 or more dwords) immediately followed by MUBUF/MTBUF (with offset[2:1] !=
|
||||
* 0).
|
||||
*/
|
||||
if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1) {
|
||||
ctx.has_NSA_MIMG = true;
|
||||
|
@ -772,11 +793,12 @@ void handle_instruction_gfx10(Program *program, Block *cur_block, NOP_ctx_gfx10
|
|||
}
|
||||
|
||||
template <typename Ctx>
|
||||
using HandleInstr = void (*)(Program *, Block *block, Ctx&, aco_ptr<Instruction>&,
|
||||
using HandleInstr = void (*)(Program*, Block* block, Ctx&, aco_ptr<Instruction>&,
|
||||
std::vector<aco_ptr<Instruction>>&);
|
||||
|
||||
template <typename Ctx, HandleInstr<Ctx> Handle>
|
||||
void handle_block(Program *program, Ctx& ctx, Block& block)
|
||||
void
|
||||
handle_block(Program* program, Ctx& ctx, Block& block)
|
||||
{
|
||||
if (block.instructions.empty())
|
||||
return;
|
||||
|
@ -793,14 +815,15 @@ void handle_block(Program *program, Ctx& ctx, Block& block)
|
|||
}
|
||||
|
||||
template <typename Ctx, HandleInstr<Ctx> Handle>
|
||||
void mitigate_hazards(Program *program)
|
||||
void
|
||||
mitigate_hazards(Program* program)
|
||||
{
|
||||
std::vector<Ctx> all_ctx(program->blocks.size());
|
||||
std::stack<unsigned> loop_header_indices;
|
||||
|
||||
for (unsigned i = 0; i < program->blocks.size(); i++) {
|
||||
Block& block = program->blocks[i];
|
||||
Ctx &ctx = all_ctx[i];
|
||||
Ctx& ctx = all_ctx[i];
|
||||
|
||||
if (block.kind & block_kind_loop_header) {
|
||||
loop_header_indices.push(i);
|
||||
|
@ -832,7 +855,8 @@ void mitigate_hazards(Program *program)
|
|||
|
||||
} /* end namespace */
|
||||
|
||||
void insert_NOPs(Program* program)
|
||||
void
|
||||
insert_NOPs(Program* program)
|
||||
{
|
||||
if (program->chip_class >= GFX10_3)
|
||||
; /* no hazards/bugs to mitigate */
|
||||
|
@ -842,4 +866,4 @@ void insert_NOPs(Program* program)
|
|||
mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6>(program);
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace aco
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
#include "aco_builder.h"
|
||||
#include "aco_ir.h"
|
||||
|
||||
#include "util/u_math.h"
|
||||
|
||||
#include <set>
|
||||
|
@ -55,10 +56,9 @@ struct wqm_ctx {
|
|||
std::vector<uint16_t> defined_in;
|
||||
std::vector<bool> needs_wqm;
|
||||
std::vector<bool> branch_wqm; /* true if the branch condition in this block should be in wqm */
|
||||
wqm_ctx(Program* program_) : program(program_),
|
||||
defined_in(program->peekAllocationId(), 0xFFFF),
|
||||
needs_wqm(program->peekAllocationId()),
|
||||
branch_wqm(program->blocks.size())
|
||||
wqm_ctx(Program* program_)
|
||||
: program(program_), defined_in(program->peekAllocationId(), 0xFFFF),
|
||||
needs_wqm(program->peekAllocationId()), branch_wqm(program->blocks.size())
|
||||
{
|
||||
for (unsigned i = 0; i < program->blocks.size(); i++)
|
||||
worklist.insert(i);
|
||||
|
@ -72,13 +72,15 @@ struct loop_info {
|
|||
bool has_divergent_break;
|
||||
bool has_divergent_continue;
|
||||
bool has_discard; /* has a discard or demote */
|
||||
loop_info(Block* b, uint16_t num, uint8_t needs_, bool breaks, bool cont, bool discard) :
|
||||
loop_header(b), num_exec_masks(num), needs(needs_), has_divergent_break(breaks),
|
||||
has_divergent_continue(cont), has_discard(discard) {}
|
||||
loop_info(Block* b, uint16_t num, uint8_t needs_, bool breaks, bool cont, bool discard)
|
||||
: loop_header(b), num_exec_masks(num), needs(needs_), has_divergent_break(breaks),
|
||||
has_divergent_continue(cont), has_discard(discard)
|
||||
{}
|
||||
};
|
||||
|
||||
struct block_info {
|
||||
std::vector<std::pair<Operand, uint8_t>> exec; /* Vector of exec masks. Either a temporary or const -1. */
|
||||
std::vector<std::pair<Operand, uint8_t>>
|
||||
exec; /* Vector of exec masks. Either a temporary or const -1. */
|
||||
std::vector<WQMState> instr_needs;
|
||||
uint8_t block_needs;
|
||||
uint8_t ever_again_needs;
|
||||
|
@ -87,14 +89,16 @@ struct block_info {
|
|||
};
|
||||
|
||||
struct exec_ctx {
|
||||
Program *program;
|
||||
Program* program;
|
||||
std::vector<block_info> info;
|
||||
std::vector<loop_info> loop;
|
||||
bool handle_wqm = false;
|
||||
exec_ctx(Program *program_) : program(program_), info(program->blocks.size()) {}
|
||||
exec_ctx(Program* program_) : program(program_), info(program->blocks.size()) {}
|
||||
};
|
||||
|
||||
bool needs_exact(aco_ptr<Instruction>& instr) {
|
||||
bool
|
||||
needs_exact(aco_ptr<Instruction>& instr)
|
||||
{
|
||||
if (instr->isMUBUF()) {
|
||||
return instr->mubuf().disable_wqm;
|
||||
} else if (instr->isMTBUF()) {
|
||||
|
@ -108,7 +112,8 @@ bool needs_exact(aco_ptr<Instruction>& instr) {
|
|||
}
|
||||
}
|
||||
|
||||
void set_needs_wqm(wqm_ctx &ctx, Temp tmp)
|
||||
void
|
||||
set_needs_wqm(wqm_ctx& ctx, Temp tmp)
|
||||
{
|
||||
if (!ctx.needs_wqm[tmp.id()]) {
|
||||
ctx.needs_wqm[tmp.id()] = true;
|
||||
|
@ -117,7 +122,8 @@ void set_needs_wqm(wqm_ctx &ctx, Temp tmp)
|
|||
}
|
||||
}
|
||||
|
||||
void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx)
|
||||
void
|
||||
mark_block_wqm(wqm_ctx& ctx, unsigned block_idx)
|
||||
{
|
||||
if (ctx.branch_wqm[block_idx])
|
||||
return;
|
||||
|
@ -136,7 +142,8 @@ void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx)
|
|||
mark_block_wqm(ctx, pred_idx);
|
||||
}
|
||||
|
||||
void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
|
||||
void
|
||||
get_block_needs(wqm_ctx& ctx, exec_ctx& exec_ctx, Block* block)
|
||||
{
|
||||
block_info& info = exec_ctx.info[block->index];
|
||||
|
||||
|
@ -146,8 +153,8 @@ void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
|
|||
aco_ptr<Instruction>& instr = block->instructions[i];
|
||||
|
||||
WQMState needs = needs_exact(instr) ? Exact : Unspecified;
|
||||
bool propagate_wqm = instr->opcode == aco_opcode::p_wqm ||
|
||||
instr->opcode == aco_opcode::p_as_uniform;
|
||||
bool propagate_wqm =
|
||||
instr->opcode == aco_opcode::p_wqm || instr->opcode == aco_opcode::p_as_uniform;
|
||||
bool preserve_wqm = instr->opcode == aco_opcode::p_discard_if;
|
||||
bool pred_by_exec = needs_exec_mask(instr.get());
|
||||
for (const Definition& definition : instr->definitions) {
|
||||
|
@ -214,7 +221,8 @@ void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
|
|||
* breaks, which might benefit from being in exact) by adding Exact_Branch to a
|
||||
* divergent branch surrounding the nested loop, if such a branch exists.
|
||||
*/
|
||||
void handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
|
||||
void
|
||||
handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
|
||||
{
|
||||
for (unsigned idx = preheader + 1; idx < exec_ctx.program->blocks.size(); idx++) {
|
||||
Block& block = exec_ctx.program->blocks[idx];
|
||||
|
@ -231,7 +239,8 @@ void handle_wqm_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
|
|||
* ensure that the exact exec mask is not empty by adding Exact_Branch to
|
||||
* the outer divergent branch.
|
||||
*/
|
||||
void handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
|
||||
void
|
||||
handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
|
||||
{
|
||||
assert(exec_ctx.program->blocks[preheader + 1].kind & block_kind_loop_header);
|
||||
|
||||
|
@ -265,7 +274,8 @@ void handle_exact_loops(wqm_ctx& ctx, exec_ctx& exec_ctx, unsigned preheader)
|
|||
}
|
||||
}
|
||||
|
||||
void calculate_wqm_needs(exec_ctx& exec_ctx)
|
||||
void
|
||||
calculate_wqm_needs(exec_ctx& exec_ctx)
|
||||
{
|
||||
wqm_ctx ctx(exec_ctx.program);
|
||||
|
||||
|
@ -307,14 +317,12 @@ void calculate_wqm_needs(exec_ctx& exec_ctx)
|
|||
exec_ctx.info[i].block_needs |= Exact;
|
||||
|
||||
/* if discard is used somewhere in nested CF, we need to preserve the WQM mask */
|
||||
if ((block.kind & block_kind_discard ||
|
||||
block.kind & block_kind_uses_discard_if) &&
|
||||
if ((block.kind & block_kind_discard || block.kind & block_kind_uses_discard_if) &&
|
||||
ever_again_needs & WQM)
|
||||
exec_ctx.info[i].block_needs |= Preserve_WQM;
|
||||
|
||||
ever_again_needs |= exec_ctx.info[i].block_needs & ~Exact_Branch;
|
||||
if (block.kind & block_kind_discard ||
|
||||
block.kind & block_kind_uses_discard_if ||
|
||||
if (block.kind & block_kind_discard || block.kind & block_kind_uses_discard_if ||
|
||||
block.kind & block_kind_uses_demote)
|
||||
ever_again_needs |= Exact;
|
||||
|
||||
|
@ -327,7 +335,8 @@ void calculate_wqm_needs(exec_ctx& exec_ctx)
|
|||
exec_ctx.handle_wqm = true;
|
||||
}
|
||||
|
||||
Operand get_exec_op(Operand t)
|
||||
Operand
|
||||
get_exec_op(Operand t)
|
||||
{
|
||||
if (t.isUndefined())
|
||||
return Operand(exec, t.regClass());
|
||||
|
@ -335,7 +344,8 @@ Operand get_exec_op(Operand t)
|
|||
return t;
|
||||
}
|
||||
|
||||
void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
|
||||
void
|
||||
transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
|
||||
{
|
||||
if (ctx.info[idx].exec.back().second & mask_type_wqm)
|
||||
return;
|
||||
|
@ -346,7 +356,8 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
|
|||
ctx.info[idx].exec.back().first = exec_mask;
|
||||
}
|
||||
|
||||
exec_mask = bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), get_exec_op(exec_mask));
|
||||
exec_mask = bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc),
|
||||
get_exec_op(exec_mask));
|
||||
ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
|
||||
return;
|
||||
}
|
||||
|
@ -355,11 +366,12 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
|
|||
assert(ctx.info[idx].exec.back().second & mask_type_wqm);
|
||||
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
|
||||
assert(ctx.info[idx].exec.back().first.isTemp());
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
|
||||
ctx.info[idx].exec.back().first);
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(
|
||||
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
|
||||
}
|
||||
|
||||
void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
|
||||
void
|
||||
transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
|
||||
{
|
||||
if (ctx.info[idx].exec.back().second & mask_type_exact)
|
||||
return;
|
||||
|
@ -372,8 +384,8 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
|
|||
assert(ctx.info[idx].exec.back().second & mask_type_exact);
|
||||
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
|
||||
assert(ctx.info[idx].exec.back().first.isTemp());
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
|
||||
ctx.info[idx].exec.back().first);
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(
|
||||
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
|
||||
return;
|
||||
}
|
||||
/* otherwise, we create an exact mask and push to the stack */
|
||||
|
@ -382,14 +394,15 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
|
|||
wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
|
||||
Definition(exec, bld.lm), ctx.info[idx].exec[0].first, Operand(exec, bld.lm));
|
||||
} else {
|
||||
bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc), ctx.info[idx].exec[0].first, wqm);
|
||||
bld.sop2(Builder::s_and, Definition(exec, bld.lm), bld.def(s1, scc),
|
||||
ctx.info[idx].exec[0].first, wqm);
|
||||
}
|
||||
ctx.info[idx].exec.back().first = Operand(wqm);
|
||||
ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type_exact);
|
||||
}
|
||||
|
||||
unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
||||
std::vector<aco_ptr<Instruction>>& instructions)
|
||||
unsigned
|
||||
add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>& instructions)
|
||||
{
|
||||
unsigned idx = block->index;
|
||||
Builder bld(ctx.program, &instructions);
|
||||
|
@ -417,7 +430,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
|||
} else {
|
||||
uint8_t mask = mask_type_global;
|
||||
if (ctx.program->needs_wqm) {
|
||||
bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc), Operand(exec, bld.lm));
|
||||
bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), bld.def(s1, scc),
|
||||
Operand(exec, bld.lm));
|
||||
mask |= mask_type_wqm;
|
||||
} else {
|
||||
mask |= mask_type_exact;
|
||||
|
@ -440,7 +454,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
|||
if (info.has_discard) {
|
||||
aco_ptr<Pseudo_instruction> phi;
|
||||
for (int i = 0; i < info.num_exec_masks - 1; i++) {
|
||||
phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
|
||||
phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi,
|
||||
Format::PSEUDO, preds.size(), 1));
|
||||
phi->definitions[0] = bld.def(bld.lm);
|
||||
phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[i].first);
|
||||
ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
|
||||
|
@ -450,14 +465,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
|||
/* create ssa name for restore mask */
|
||||
if (info.has_divergent_break) {
|
||||
/* this phi might be trivial but ensures a parallelcopy on the loop header */
|
||||
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
|
||||
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
|
||||
aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
|
||||
phi->definitions[0] = bld.def(bld.lm);
|
||||
phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
|
||||
ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
|
||||
}
|
||||
|
||||
/* create ssa name for loop active mask */
|
||||
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
|
||||
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
|
||||
aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
|
||||
if (info.has_divergent_continue)
|
||||
phi->definitions[0] = bld.def(bld.lm);
|
||||
else
|
||||
|
@ -466,7 +483,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
|||
Temp loop_active = bld.insert(std::move(phi));
|
||||
|
||||
if (info.has_divergent_break) {
|
||||
uint8_t mask_type = (ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop;
|
||||
uint8_t mask_type =
|
||||
(ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop;
|
||||
ctx.info[idx].exec.emplace_back(loop_active, mask_type);
|
||||
} else {
|
||||
ctx.info[idx].exec.back().first = Operand(loop_active);
|
||||
|
@ -482,8 +500,10 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
|||
}
|
||||
uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
|
||||
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
|
||||
ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
|
||||
ctx.info[idx].exec.back().first), mask_type);
|
||||
ctx.info[idx].exec.emplace_back(
|
||||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
|
||||
ctx.info[idx].exec.back().first),
|
||||
mask_type);
|
||||
}
|
||||
|
||||
return i;
|
||||
|
@ -514,14 +534,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
|||
aco_ptr<Instruction>& phi = header->instructions[instr_idx++];
|
||||
assert(phi->opcode == aco_opcode::p_linear_phi);
|
||||
for (unsigned i = 1; i < phi->operands.size(); i++)
|
||||
phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first);
|
||||
phi->operands[i] =
|
||||
get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks - 1].first);
|
||||
}
|
||||
|
||||
if (info.has_divergent_break) {
|
||||
aco_ptr<Instruction>& phi = header->instructions[instr_idx];
|
||||
assert(phi->opcode == aco_opcode::p_linear_phi);
|
||||
for (unsigned i = 1; i < phi->operands.size(); i++)
|
||||
phi->operands[i] = get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
|
||||
phi->operands[i] =
|
||||
get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
|
||||
}
|
||||
|
||||
assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2);
|
||||
|
@ -541,7 +563,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
|||
ctx.info[idx].exec.emplace_back(same, type);
|
||||
} else {
|
||||
/* create phi for loop footer */
|
||||
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
|
||||
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
|
||||
aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
|
||||
phi->definitions[0] = bld.def(bld.lm);
|
||||
if (exec_idx == info.num_exec_masks - 1u) {
|
||||
phi->definitions[0] = Definition(exec, bld.lm);
|
||||
|
@ -578,8 +601,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
|||
assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
|
||||
if (get_exec_op(ctx.info[idx].exec.back().first).isTemp()) {
|
||||
/* move current exec mask into exec register */
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
|
||||
ctx.info[idx].exec.back().first);
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(
|
||||
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
|
||||
}
|
||||
|
||||
ctx.loop.pop_back();
|
||||
|
@ -591,8 +614,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
|||
} else {
|
||||
assert(preds.size() == 2);
|
||||
/* if one of the predecessors ends in exact mask, we pop it from stack */
|
||||
unsigned num_exec_masks = std::min(ctx.info[preds[0]].exec.size(),
|
||||
ctx.info[preds[1]].exec.size());
|
||||
unsigned num_exec_masks =
|
||||
std::min(ctx.info[preds[0]].exec.size(), ctx.info[preds[1]].exec.size());
|
||||
|
||||
if (block->kind & block_kind_merge)
|
||||
num_exec_masks--;
|
||||
|
@ -605,14 +628,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
|||
if (ctx.info[preds[0]].exec[i].first == ctx.info[preds[1]].exec[i].first) {
|
||||
Operand t = ctx.info[preds[0]].exec[i].first;
|
||||
/* discard/demote can change the state of the current exec mask */
|
||||
assert(!t.isTemp() || ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second);
|
||||
assert(!t.isTemp() ||
|
||||
ctx.info[preds[0]].exec[i].second == ctx.info[preds[1]].exec[i].second);
|
||||
uint8_t mask = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
|
||||
ctx.info[idx].exec.emplace_back(t, mask);
|
||||
continue;
|
||||
}
|
||||
|
||||
bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge);
|
||||
Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? Definition(exec, bld.lm) : bld.def(bld.lm),
|
||||
Temp phi = bld.pseudo(aco_opcode::p_linear_phi,
|
||||
in_exec ? Definition(exec, bld.lm) : bld.def(bld.lm),
|
||||
get_exec_op(ctx.info[preds[0]].exec[i].first),
|
||||
get_exec_op(ctx.info[preds[1]].exec[i].first));
|
||||
uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
|
||||
|
@ -654,9 +679,9 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
|
|||
return i;
|
||||
}
|
||||
|
||||
void process_instructions(exec_ctx& ctx, Block* block,
|
||||
std::vector<aco_ptr<Instruction>>& instructions,
|
||||
unsigned idx)
|
||||
void
|
||||
process_instructions(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>& instructions,
|
||||
unsigned idx)
|
||||
{
|
||||
WQMState state;
|
||||
if (ctx.info[block->index].exec.back().second & mask_type_wqm)
|
||||
|
@ -667,17 +692,16 @@ void process_instructions(exec_ctx& ctx, Block* block,
|
|||
}
|
||||
|
||||
/* if the block doesn't need both, WQM and Exact, we can skip processing the instructions */
|
||||
bool process = (ctx.handle_wqm &&
|
||||
(ctx.info[block->index].block_needs & state) !=
|
||||
(ctx.info[block->index].block_needs & (WQM | Exact))) ||
|
||||
bool process = (ctx.handle_wqm && (ctx.info[block->index].block_needs & state) !=
|
||||
(ctx.info[block->index].block_needs & (WQM | Exact))) ||
|
||||
block->kind & block_kind_uses_discard_if ||
|
||||
block->kind & block_kind_uses_demote ||
|
||||
block->kind & block_kind_needs_lowering;
|
||||
block->kind & block_kind_uses_demote || block->kind & block_kind_needs_lowering;
|
||||
if (!process) {
|
||||
std::vector<aco_ptr<Instruction>>::iterator it = std::next(block->instructions.begin(), idx);
|
||||
instructions.insert(instructions.end(),
|
||||
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(it),
|
||||
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(block->instructions.end()));
|
||||
std::move_iterator<std::vector<aco_ptr<Instruction>>::iterator>(
|
||||
block->instructions.end()));
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -700,11 +724,13 @@ void process_instructions(exec_ctx& ctx, Block* block,
|
|||
/* discard from current exec */
|
||||
const Operand cond = instr->operands[0];
|
||||
Temp exit_cond = bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc),
|
||||
Operand(exec, bld.lm), cond).def(1).getTemp();
|
||||
Operand(exec, bld.lm), cond)
|
||||
.def(1)
|
||||
.getTemp();
|
||||
|
||||
/* discard from inner to outer exec mask on stack */
|
||||
for (int i = num - 2; i >= 0; i--) {
|
||||
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
|
||||
Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
|
||||
ctx.info[block->index].exec[i].first, cond);
|
||||
ctx.info[block->index].exec[i].first = Operand(andn2->definitions[0].getTemp());
|
||||
exit_cond = andn2->definitions[1].getTemp();
|
||||
|
@ -726,14 +752,16 @@ void process_instructions(exec_ctx& ctx, Block* block,
|
|||
Definition dst = instr->definitions[0];
|
||||
assert(dst.size() == bld.lm.size());
|
||||
if (state == Exact) {
|
||||
instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1));
|
||||
instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov),
|
||||
Format::SOP1, 1, 1));
|
||||
instr->operands[0] = Operand(0u);
|
||||
instr->definitions[0] = dst;
|
||||
} else {
|
||||
std::pair<Operand, uint8_t>& exact_mask = ctx.info[block->index].exec[0];
|
||||
assert(exact_mask.second & mask_type_exact);
|
||||
|
||||
instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2));
|
||||
instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2),
|
||||
Format::SOP2, 2, 2));
|
||||
instr->operands[0] = Operand(exec, bld.lm); /* current exec */
|
||||
instr->operands[1] = Operand(exact_mask.first);
|
||||
instr->definitions[0] = dst;
|
||||
|
@ -741,7 +769,8 @@ void process_instructions(exec_ctx& ctx, Block* block,
|
|||
}
|
||||
} else if (instr->opcode == aco_opcode::p_demote_to_helper) {
|
||||
/* turn demote into discard_if with only exact masks */
|
||||
assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) == (mask_type_exact | mask_type_global));
|
||||
assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) ==
|
||||
(mask_type_exact | mask_type_global));
|
||||
|
||||
int num;
|
||||
Temp cond, exit_cond;
|
||||
|
@ -749,8 +778,9 @@ void process_instructions(exec_ctx& ctx, Block* block,
|
|||
assert(instr->operands[0].constantValue() == -1u);
|
||||
/* transition to exact and set exec to zero */
|
||||
exit_cond = bld.tmp(s1);
|
||||
cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)),
|
||||
Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm));
|
||||
cond =
|
||||
bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)),
|
||||
Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm));
|
||||
|
||||
num = ctx.info[block->index].exec.size() - 2;
|
||||
if (!(ctx.info[block->index].exec.back().second & mask_type_exact)) {
|
||||
|
@ -767,7 +797,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
|
|||
|
||||
for (int i = num; i >= 0; i--) {
|
||||
if (ctx.info[block->index].exec[i].second & mask_type_exact) {
|
||||
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
|
||||
Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
|
||||
ctx.info[block->index].exec[i].first, cond);
|
||||
if (i == (int)ctx.info[block->index].exec.size() - 1) {
|
||||
andn2->operands[0] = Operand(exec, bld.lm);
|
||||
|
@ -783,14 +813,14 @@ void process_instructions(exec_ctx& ctx, Block* block,
|
|||
instr->opcode = aco_opcode::p_exit_early_if;
|
||||
instr->operands[0] = bld.scc(exit_cond);
|
||||
state = Exact;
|
||||
|
||||
}
|
||||
|
||||
bld.insert(std::move(instr));
|
||||
}
|
||||
}
|
||||
|
||||
void add_branch_code(exec_ctx& ctx, Block* block)
|
||||
void
|
||||
add_branch_code(exec_ctx& ctx, Block* block)
|
||||
{
|
||||
unsigned idx = block->index;
|
||||
Builder bld(ctx.program, block);
|
||||
|
@ -806,8 +836,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
|||
}
|
||||
assert(ctx.info[idx].exec.size() <= 2);
|
||||
|
||||
if (ctx.info[idx].ever_again_needs == 0 ||
|
||||
ctx.info[idx].ever_again_needs == Exact) {
|
||||
if (ctx.info[idx].ever_again_needs == 0 || ctx.info[idx].ever_again_needs == Exact) {
|
||||
/* transition to Exact */
|
||||
aco_ptr<Instruction> branch = std::move(block->instructions.back());
|
||||
block->instructions.pop_back();
|
||||
|
@ -838,8 +867,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
|||
Block& loop_block = ctx.program->blocks[i];
|
||||
needs |= ctx.info[i].block_needs;
|
||||
|
||||
if (loop_block.kind & block_kind_uses_discard_if ||
|
||||
loop_block.kind & block_kind_discard ||
|
||||
if (loop_block.kind & block_kind_uses_discard_if || loop_block.kind & block_kind_discard ||
|
||||
loop_block.kind & block_kind_uses_demote)
|
||||
has_discard = true;
|
||||
if (loop_block.loop_nest_depth != loop_nest_depth)
|
||||
|
@ -871,12 +899,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
|||
if (block->kind & block_kind_top_level)
|
||||
num_exec_masks = std::min(num_exec_masks, 2u);
|
||||
|
||||
ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]],
|
||||
num_exec_masks,
|
||||
needs,
|
||||
has_divergent_break,
|
||||
has_divergent_continue,
|
||||
has_discard);
|
||||
ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], num_exec_masks, needs,
|
||||
has_divergent_break, has_divergent_continue, has_discard);
|
||||
}
|
||||
|
||||
/* For normal breaks, this is the exec mask. For discard+break, it's the
|
||||
|
@ -903,7 +927,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
|||
Definition(exec, bld.lm), Operand(0u), Operand(exec, bld.lm));
|
||||
|
||||
for (int i = num - 1; i >= 0; i--) {
|
||||
Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
|
||||
Instruction* andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
|
||||
get_exec_op(ctx.info[block->index].exec[i].first), cond);
|
||||
if (i == (int)ctx.info[idx].exec.size() - 1)
|
||||
andn2->definitions[0] = Definition(exec, bld.lm);
|
||||
|
@ -919,8 +943,10 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
|||
}
|
||||
|
||||
if (block->kind & block_kind_continue_or_break) {
|
||||
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind & block_kind_loop_header);
|
||||
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind & block_kind_loop_exit);
|
||||
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[1]].linear_succs[0]].kind &
|
||||
block_kind_loop_header);
|
||||
assert(ctx.program->blocks[ctx.program->blocks[block->linear_succs[0]].linear_succs[0]].kind &
|
||||
block_kind_loop_exit);
|
||||
assert(block->instructions.back()->opcode == aco_opcode::p_branch);
|
||||
block->instructions.pop_back();
|
||||
|
||||
|
@ -931,8 +957,10 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
|||
}
|
||||
|
||||
if (need_parallelcopy)
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
|
||||
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]);
|
||||
ctx.info[idx].exec.back().first = bld.pseudo(
|
||||
aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
|
||||
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
|
||||
block->linear_succs[1], block->linear_succs[0]);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -949,8 +977,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
|||
|
||||
if (block->kind & block_kind_branch) {
|
||||
|
||||
if (ctx.handle_wqm &&
|
||||
ctx.info[idx].exec.size() >= 2 &&
|
||||
if (ctx.handle_wqm && ctx.info[idx].exec.size() >= 2 &&
|
||||
ctx.info[idx].exec.back().second == mask_type_exact &&
|
||||
!(ctx.info[idx].block_needs & Exact_Branch) &&
|
||||
ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].second & mask_type_wqm) {
|
||||
|
@ -972,7 +999,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
|||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), cond);
|
||||
} else {
|
||||
Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
|
||||
Definition(exec, bld.lm), cond, Operand(exec, bld.lm));
|
||||
Definition(exec, bld.lm), cond, Operand(exec, bld.lm));
|
||||
|
||||
ctx.info[idx].exec.back().first = Operand(old_exec);
|
||||
}
|
||||
|
@ -980,7 +1007,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
|||
/* add next current exec to the stack */
|
||||
ctx.info[idx].exec.emplace_back(Operand(bld.lm), mask_type);
|
||||
|
||||
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]);
|
||||
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
|
||||
block->linear_succs[1], block->linear_succs[0]);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -990,9 +1018,11 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
|||
block->instructions.pop_back();
|
||||
assert(ctx.info[idx].exec.size() >= 2);
|
||||
Operand orig_exec = ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].first;
|
||||
bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec, Operand(exec, bld.lm));
|
||||
bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), orig_exec,
|
||||
Operand(exec, bld.lm));
|
||||
|
||||
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]);
|
||||
bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
|
||||
block->linear_succs[1], block->linear_succs[0]);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1020,7 +1050,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
|||
bld.copy(Definition(exec, bld.lm), Operand(0u, bld.lm == s2));
|
||||
}
|
||||
|
||||
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
|
||||
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond),
|
||||
block->linear_succs[1], block->linear_succs[0]);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1048,12 +1079,14 @@ void add_branch_code(exec_ctx& ctx, Block* block)
|
|||
bld.copy(Definition(exec, bld.lm), Operand(0u, bld.lm == s2));
|
||||
}
|
||||
|
||||
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
|
||||
bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond),
|
||||
block->linear_succs[1], block->linear_succs[0]);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void process_block(exec_ctx& ctx, Block* block)
|
||||
void
|
||||
process_block(exec_ctx& ctx, Block* block)
|
||||
{
|
||||
std::vector<aco_ptr<Instruction>> instructions;
|
||||
instructions.reserve(block->instructions.size());
|
||||
|
@ -1072,8 +1105,8 @@ void process_block(exec_ctx& ctx, Block* block)
|
|||
|
||||
} /* end namespace */
|
||||
|
||||
|
||||
void insert_exec_mask(Program *program)
|
||||
void
|
||||
insert_exec_mask(Program* program)
|
||||
{
|
||||
exec_ctx ctx(program);
|
||||
|
||||
|
@ -1082,8 +1115,6 @@ void insert_exec_mask(Program *program)
|
|||
|
||||
for (Block& block : program->blocks)
|
||||
process_block(ctx, &block);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} // namespace aco
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
*/
|
||||
|
||||
#include "aco_ir.h"
|
||||
|
||||
#include "common/sid.h"
|
||||
|
||||
#include <map>
|
||||
|
@ -49,7 +50,8 @@ namespace {
|
|||
* - or erase gprs with counters higher than to be waited for.
|
||||
*/
|
||||
|
||||
// TODO: do a more clever insertion of wait_cnt (lgkm_cnt) when there is a load followed by a use of a previous load
|
||||
// TODO: do a more clever insertion of wait_cnt (lgkm_cnt)
|
||||
// when there is a load followed by a use of a previous load
|
||||
|
||||
/* Instructions of the same event will finish in-order except for smem
|
||||
* and maybe flat. Instructions of different events may not finish in-order. */
|
||||
|
@ -77,54 +79,50 @@ enum counter_type : uint8_t {
|
|||
num_counters = 4,
|
||||
};
|
||||
|
||||
static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
|
||||
static const uint16_t exp_events =
|
||||
event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
|
||||
static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg;
|
||||
static const uint16_t vm_events = event_vmem | event_flat;
|
||||
static const uint16_t vs_events = event_vmem_store;
|
||||
|
||||
uint8_t get_counters_for_event(wait_event ev)
|
||||
uint8_t
|
||||
get_counters_for_event(wait_event ev)
|
||||
{
|
||||
switch (ev) {
|
||||
case event_smem:
|
||||
case event_lds:
|
||||
case event_gds:
|
||||
case event_sendmsg:
|
||||
return counter_lgkm;
|
||||
case event_vmem:
|
||||
return counter_vm;
|
||||
case event_vmem_store:
|
||||
return counter_vs;
|
||||
case event_flat:
|
||||
return counter_vm | counter_lgkm;
|
||||
case event_sendmsg: return counter_lgkm;
|
||||
case event_vmem: return counter_vm;
|
||||
case event_vmem_store: return counter_vs;
|
||||
case event_flat: return counter_vm | counter_lgkm;
|
||||
case event_exp_pos:
|
||||
case event_exp_param:
|
||||
case event_exp_mrt_null:
|
||||
case event_gds_gpr_lock:
|
||||
case event_vmem_gpr_lock:
|
||||
return counter_exp;
|
||||
default:
|
||||
return 0;
|
||||
case event_vmem_gpr_lock: return counter_exp;
|
||||
default: return 0;
|
||||
}
|
||||
}
|
||||
|
||||
struct wait_entry {
|
||||
wait_imm imm;
|
||||
uint16_t events; /* use wait_event notion */
|
||||
uint16_t events; /* use wait_event notion */
|
||||
uint8_t counters; /* use counter_type notion */
|
||||
bool wait_on_read:1;
|
||||
bool logical:1;
|
||||
bool has_vmem_nosampler:1;
|
||||
bool has_vmem_sampler:1;
|
||||
bool wait_on_read : 1;
|
||||
bool logical : 1;
|
||||
bool has_vmem_nosampler : 1;
|
||||
bool has_vmem_sampler : 1;
|
||||
|
||||
wait_entry(wait_event event_, wait_imm imm_, bool logical_, bool wait_on_read_)
|
||||
: imm(imm_), events(event_), counters(get_counters_for_event(event_)),
|
||||
wait_on_read(wait_on_read_), logical(logical_),
|
||||
has_vmem_nosampler(false), has_vmem_sampler(false) {}
|
||||
: imm(imm_), events(event_), counters(get_counters_for_event(event_)),
|
||||
wait_on_read(wait_on_read_), logical(logical_), has_vmem_nosampler(false),
|
||||
has_vmem_sampler(false)
|
||||
{}
|
||||
|
||||
bool join(const wait_entry& other)
|
||||
{
|
||||
bool changed = (other.events & ~events) ||
|
||||
(other.counters & ~counters) ||
|
||||
bool changed = (other.events & ~events) || (other.counters & ~counters) ||
|
||||
(other.wait_on_read && !wait_on_read) ||
|
||||
(other.has_vmem_nosampler && !has_vmem_nosampler) ||
|
||||
(other.has_vmem_sampler && !has_vmem_sampler);
|
||||
|
@ -156,7 +154,8 @@ struct wait_entry {
|
|||
|
||||
if (counter == counter_exp) {
|
||||
imm.exp = wait_imm::unset_counter;
|
||||
events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock);
|
||||
events &= ~(event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock |
|
||||
event_vmem_gpr_lock);
|
||||
}
|
||||
|
||||
if (counter == counter_vs) {
|
||||
|
@ -170,7 +169,7 @@ struct wait_entry {
|
|||
};
|
||||
|
||||
struct wait_ctx {
|
||||
Program *program;
|
||||
Program* program;
|
||||
enum chip_class chip_class;
|
||||
uint16_t max_vm_cnt;
|
||||
uint16_t max_exp_cnt;
|
||||
|
@ -189,24 +188,21 @@ struct wait_ctx {
|
|||
wait_imm barrier_imm[storage_count];
|
||||
uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */
|
||||
|
||||
std::map<PhysReg,wait_entry> gpr_map;
|
||||
std::map<PhysReg, wait_entry> gpr_map;
|
||||
|
||||
wait_ctx() {}
|
||||
wait_ctx(Program *program_)
|
||||
: program(program_),
|
||||
chip_class(program_->chip_class),
|
||||
max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14),
|
||||
max_exp_cnt(6),
|
||||
max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14),
|
||||
max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
|
||||
unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) {}
|
||||
wait_ctx(Program* program_)
|
||||
: program(program_), chip_class(program_->chip_class),
|
||||
max_vm_cnt(program_->chip_class >= GFX9 ? 62 : 14), max_exp_cnt(6),
|
||||
max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14),
|
||||
max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
|
||||
unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0))
|
||||
{}
|
||||
|
||||
bool join(const wait_ctx* other, bool logical)
|
||||
{
|
||||
bool changed = other->exp_cnt > exp_cnt ||
|
||||
other->vm_cnt > vm_cnt ||
|
||||
other->lgkm_cnt > lgkm_cnt ||
|
||||
other->vs_cnt > vs_cnt ||
|
||||
bool changed = other->exp_cnt > exp_cnt || other->vm_cnt > vm_cnt ||
|
||||
other->lgkm_cnt > lgkm_cnt || other->vs_cnt > vs_cnt ||
|
||||
(other->pending_flat_lgkm && !pending_flat_lgkm) ||
|
||||
(other->pending_flat_vm && !pending_flat_vm);
|
||||
|
||||
|
@ -218,12 +214,11 @@ struct wait_ctx {
|
|||
pending_flat_vm |= other->pending_flat_vm;
|
||||
pending_s_buffer_store |= other->pending_s_buffer_store;
|
||||
|
||||
for (const auto& entry : other->gpr_map)
|
||||
{
|
||||
for (const auto& entry : other->gpr_map) {
|
||||
if (entry.second.logical != logical)
|
||||
continue;
|
||||
|
||||
using iterator = std::map<PhysReg,wait_entry>::iterator;
|
||||
using iterator = std::map<PhysReg, wait_entry>::iterator;
|
||||
const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
|
||||
if (insert_pair.second) {
|
||||
changed = true;
|
||||
|
@ -241,12 +236,14 @@ struct wait_ctx {
|
|||
return changed;
|
||||
}
|
||||
|
||||
void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter) {
|
||||
void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter)
|
||||
{
|
||||
entry.remove_counter(counter);
|
||||
}
|
||||
};
|
||||
|
||||
wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
|
||||
wait_imm
|
||||
check_instr(Instruction* instr, wait_ctx& ctx)
|
||||
{
|
||||
wait_imm wait;
|
||||
|
||||
|
@ -257,7 +254,7 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
|
|||
/* check consecutively read gprs */
|
||||
for (unsigned j = 0; j < op.size(); j++) {
|
||||
PhysReg reg{op.physReg() + j};
|
||||
std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg);
|
||||
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
|
||||
if (it == ctx.gpr_map.end() || !it->second.wait_on_read)
|
||||
continue;
|
||||
|
||||
|
@ -267,22 +264,24 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
|
|||
|
||||
for (const Definition& def : instr->definitions) {
|
||||
/* check consecutively written gprs */
|
||||
for (unsigned j = 0; j < def.getTemp().size(); j++)
|
||||
{
|
||||
for (unsigned j = 0; j < def.getTemp().size(); j++) {
|
||||
PhysReg reg{def.physReg() + j};
|
||||
|
||||
std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.find(reg);
|
||||
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(reg);
|
||||
if (it == ctx.gpr_map.end())
|
||||
continue;
|
||||
|
||||
/* Vector Memory reads and writes return in the order they were issued */
|
||||
bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4;
|
||||
bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() &&
|
||||
instr->operands[1].regClass() == s4;
|
||||
if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem) &&
|
||||
it->second.has_vmem_nosampler == !has_sampler && it->second.has_vmem_sampler == has_sampler)
|
||||
it->second.has_vmem_nosampler == !has_sampler &&
|
||||
it->second.has_vmem_sampler == has_sampler)
|
||||
continue;
|
||||
|
||||
/* LDS reads and writes return in the order they were issued. same for GDS */
|
||||
if (instr->isDS() && (it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds))
|
||||
if (instr->isDS() &&
|
||||
(it->second.events & lgkm_events) == (instr->ds().gds ? event_gds : event_lds))
|
||||
continue;
|
||||
|
||||
wait.combine(it->second.imm);
|
||||
|
@ -292,7 +291,8 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
|
|||
return wait;
|
||||
}
|
||||
|
||||
wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr)
|
||||
wait_imm
|
||||
parse_wait_instr(wait_ctx& ctx, Instruction* instr)
|
||||
{
|
||||
if (instr->opcode == aco_opcode::s_waitcnt_vscnt &&
|
||||
instr->definitions[0].physReg() == sgpr_null) {
|
||||
|
@ -305,10 +305,12 @@ wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr)
|
|||
return wait_imm();
|
||||
}
|
||||
|
||||
wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics)
|
||||
wait_imm
|
||||
perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics)
|
||||
{
|
||||
wait_imm imm;
|
||||
sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
|
||||
sync_scope subgroup_scope =
|
||||
ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
|
||||
if ((sync.semantics & semantics) && sync.scope > subgroup_scope) {
|
||||
unsigned storage = sync.storage;
|
||||
while (storage) {
|
||||
|
@ -321,7 +323,8 @@ wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantic
|
|||
if (bar_scope_lds <= subgroup_scope)
|
||||
events &= ~event_lds;
|
||||
|
||||
/* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations in-order for the same workgroup */
|
||||
/* in non-WGP, the L1 (L0 on GFX10+) cache keeps all memory operations
|
||||
* in-order for the same workgroup */
|
||||
if (!ctx.program->wgp_mode && sync.scope <= scope_workgroup)
|
||||
events &= ~(event_vmem | event_vmem_store | event_smem);
|
||||
|
||||
|
@ -333,7 +336,8 @@ wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantic
|
|||
return imm;
|
||||
}
|
||||
|
||||
void force_waitcnt(wait_ctx& ctx, wait_imm& imm)
|
||||
void
|
||||
force_waitcnt(wait_ctx& ctx, wait_imm& imm)
|
||||
{
|
||||
if (ctx.vm_cnt)
|
||||
imm.vm = 0;
|
||||
|
@ -348,7 +352,8 @@ void force_waitcnt(wait_ctx& ctx, wait_imm& imm)
|
|||
}
|
||||
}
|
||||
|
||||
wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
|
||||
wait_imm
|
||||
kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
|
||||
{
|
||||
wait_imm imm;
|
||||
|
||||
|
@ -364,7 +369,6 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
|
|||
|
||||
imm.combine(parse_wait_instr(ctx, instr));
|
||||
|
||||
|
||||
/* It's required to wait for scalar stores before "writing back" data.
|
||||
* It shouldn't cost anything anyways since we're about to do s_endpgm.
|
||||
*/
|
||||
|
@ -380,20 +384,19 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
|
|||
*
|
||||
* TODO: Refine this when we have proper alias analysis.
|
||||
*/
|
||||
if (ctx.pending_s_buffer_store &&
|
||||
!instr->smem().definitions.empty() &&
|
||||
if (ctx.pending_s_buffer_store && !instr->smem().definitions.empty() &&
|
||||
!instr->smem().sync.can_reorder()) {
|
||||
imm.lgkm = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (ctx.program->early_rast && instr->opcode == aco_opcode::exp) {
|
||||
if (instr->exp().dest >= V_008DFC_SQ_EXP_POS &&
|
||||
instr->exp().dest < V_008DFC_SQ_EXP_PRIM) {
|
||||
if (instr->exp().dest >= V_008DFC_SQ_EXP_POS && instr->exp().dest < V_008DFC_SQ_EXP_PRIM) {
|
||||
|
||||
/* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos export.
|
||||
* Wait for all stores (and atomics) to complete, so PS can read them.
|
||||
* TODO: This only really applies to DONE pos exports. Consider setting the DONE bit earlier.
|
||||
/* With early_rast, the HW will start clipping and rasterization after the 1st DONE pos
|
||||
* export. Wait for all stores (and atomics) to complete, so PS can read them.
|
||||
* TODO: This only really applies to DONE pos exports.
|
||||
* Consider setting the DONE bit earlier.
|
||||
*/
|
||||
if (ctx.vs_cnt > 0)
|
||||
imm.vs = 0;
|
||||
|
@ -444,9 +447,8 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
|
|||
}
|
||||
|
||||
/* remove all gprs with higher counter from map */
|
||||
std::map<PhysReg,wait_entry>::iterator it = ctx.gpr_map.begin();
|
||||
while (it != ctx.gpr_map.end())
|
||||
{
|
||||
std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.begin();
|
||||
while (it != ctx.gpr_map.end()) {
|
||||
if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
|
||||
ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp);
|
||||
if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
|
||||
|
@ -472,13 +474,15 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
|
|||
return imm;
|
||||
}
|
||||
|
||||
void update_barrier_counter(uint8_t *ctr, unsigned max)
|
||||
void
|
||||
update_barrier_counter(uint8_t* ctr, unsigned max)
|
||||
{
|
||||
if (*ctr != wait_imm::unset_counter && *ctr < max)
|
||||
(*ctr)++;
|
||||
}
|
||||
|
||||
void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
|
||||
void
|
||||
update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
|
||||
{
|
||||
for (unsigned i = 0; i < storage_count; i++) {
|
||||
wait_imm& bar = ctx.barrier_imm[i];
|
||||
|
@ -506,7 +510,8 @@ void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memor
|
|||
}
|
||||
}
|
||||
|
||||
void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memory_sync_info())
|
||||
void
|
||||
update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync = memory_sync_info())
|
||||
{
|
||||
uint8_t counters = get_counters_for_event(event);
|
||||
|
||||
|
@ -529,7 +534,7 @@ void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memo
|
|||
if (ctx.pending_flat_vm)
|
||||
counters &= ~counter_vm;
|
||||
|
||||
for (std::pair<const PhysReg,wait_entry>& e : ctx.gpr_map) {
|
||||
for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
|
||||
wait_entry& entry = e.second;
|
||||
|
||||
if (entry.events & ctx.unordered_events)
|
||||
|
@ -537,18 +542,23 @@ void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memo
|
|||
|
||||
assert(entry.events);
|
||||
|
||||
if ((counters & counter_exp) && (entry.events & exp_events) == event && entry.imm.exp < ctx.max_exp_cnt)
|
||||
if ((counters & counter_exp) && (entry.events & exp_events) == event &&
|
||||
entry.imm.exp < ctx.max_exp_cnt)
|
||||
entry.imm.exp++;
|
||||
if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event && entry.imm.lgkm < ctx.max_lgkm_cnt)
|
||||
if ((counters & counter_lgkm) && (entry.events & lgkm_events) == event &&
|
||||
entry.imm.lgkm < ctx.max_lgkm_cnt)
|
||||
entry.imm.lgkm++;
|
||||
if ((counters & counter_vm) && (entry.events & vm_events) == event && entry.imm.vm < ctx.max_vm_cnt)
|
||||
if ((counters & counter_vm) && (entry.events & vm_events) == event &&
|
||||
entry.imm.vm < ctx.max_vm_cnt)
|
||||
entry.imm.vm++;
|
||||
if ((counters & counter_vs) && (entry.events & vs_events) == event && entry.imm.vs < ctx.max_vs_cnt)
|
||||
if ((counters & counter_vs) && (entry.events & vs_events) == event &&
|
||||
entry.imm.vs < ctx.max_vs_cnt)
|
||||
entry.imm.vs++;
|
||||
}
|
||||
}
|
||||
|
||||
void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_sync_info())
|
||||
void
|
||||
update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync_info())
|
||||
{
|
||||
assert(ctx.chip_class < GFX10);
|
||||
|
||||
|
@ -559,8 +569,7 @@ void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_s
|
|||
|
||||
update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);
|
||||
|
||||
for (std::pair<PhysReg,wait_entry> e : ctx.gpr_map)
|
||||
{
|
||||
for (std::pair<PhysReg, wait_entry> e : ctx.gpr_map) {
|
||||
if (e.second.counters & counter_vm)
|
||||
e.second.imm.vm = 0;
|
||||
if (e.second.counters & counter_lgkm)
|
||||
|
@ -570,8 +579,9 @@ void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_s
|
|||
ctx.pending_flat_vm = true;
|
||||
}
|
||||
|
||||
void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
|
||||
bool has_sampler=false)
|
||||
void
|
||||
insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
|
||||
bool has_sampler = false)
|
||||
{
|
||||
uint16_t counters = get_counters_for_event(event);
|
||||
wait_imm imm;
|
||||
|
@ -589,24 +599,27 @@ void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event
|
|||
new_entry.has_vmem_sampler = (event & event_vmem) && has_sampler;
|
||||
|
||||
for (unsigned i = 0; i < rc.size(); i++) {
|
||||
auto it = ctx.gpr_map.emplace(PhysReg{reg.reg()+i}, new_entry);
|
||||
auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry);
|
||||
if (!it.second)
|
||||
it.first->second.join(new_entry);
|
||||
}
|
||||
}
|
||||
|
||||
void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, bool has_sampler=false)
|
||||
void
|
||||
insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, bool has_sampler = false)
|
||||
{
|
||||
if (!op.isConstant() && !op.isUndefined())
|
||||
insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, has_sampler);
|
||||
}
|
||||
|
||||
void insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, bool has_sampler=false)
|
||||
void
|
||||
insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, bool has_sampler = false)
|
||||
{
|
||||
insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, has_sampler);
|
||||
}
|
||||
|
||||
void gen(Instruction* instr, wait_ctx& ctx)
|
||||
void
|
||||
gen(Instruction* instr, wait_ctx& ctx)
|
||||
{
|
||||
switch (instr->format) {
|
||||
case Format::EXP: {
|
||||
|
@ -622,13 +635,11 @@ void gen(Instruction* instr, wait_ctx& ctx)
|
|||
update_counters(ctx, ev);
|
||||
|
||||
/* insert new entries for exported vgprs */
|
||||
for (unsigned i = 0; i < 4; i++)
|
||||
{
|
||||
for (unsigned i = 0; i < 4; i++) {
|
||||
if (exp_instr.enabled_mask & (1 << i)) {
|
||||
unsigned idx = exp_instr.compressed ? i >> 1 : i;
|
||||
assert(idx < exp_instr.operands.size());
|
||||
insert_wait_entry(ctx, exp_instr.operands[idx], ev);
|
||||
|
||||
}
|
||||
}
|
||||
insert_wait_entry(ctx, exec, s2, ev, false);
|
||||
|
@ -651,8 +662,7 @@ void gen(Instruction* instr, wait_ctx& ctx)
|
|||
|
||||
if (!instr->definitions.empty())
|
||||
insert_wait_entry(ctx, instr->definitions[0], event_smem);
|
||||
else if (ctx.chip_class >= GFX10 &&
|
||||
!smem.sync.can_reorder())
|
||||
else if (ctx.chip_class >= GFX10 && !smem.sync.can_reorder())
|
||||
ctx.pending_s_buffer_store = true;
|
||||
|
||||
break;
|
||||
|
@ -677,23 +687,21 @@ void gen(Instruction* instr, wait_ctx& ctx)
|
|||
case Format::MTBUF:
|
||||
case Format::MIMG:
|
||||
case Format::GLOBAL: {
|
||||
wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
|
||||
wait_event ev =
|
||||
!instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
|
||||
update_counters(ctx, ev, get_sync_info(instr));
|
||||
|
||||
bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4;
|
||||
bool has_sampler = instr->isMIMG() && !instr->operands[1].isUndefined() &&
|
||||
instr->operands[1].regClass() == s4;
|
||||
|
||||
if (!instr->definitions.empty())
|
||||
insert_wait_entry(ctx, instr->definitions[0], ev, has_sampler);
|
||||
|
||||
if (ctx.chip_class == GFX6 &&
|
||||
instr->format != Format::MIMG &&
|
||||
instr->operands.size() == 4) {
|
||||
if (ctx.chip_class == GFX6 && instr->format != Format::MIMG && instr->operands.size() == 4) {
|
||||
ctx.exp_cnt++;
|
||||
update_counters(ctx, event_vmem_gpr_lock);
|
||||
insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock);
|
||||
} else if (ctx.chip_class == GFX6 &&
|
||||
instr->isMIMG() &&
|
||||
!instr->operands[2].isUndefined()) {
|
||||
} else if (ctx.chip_class == GFX6 && instr->isMIMG() && !instr->operands[2].isUndefined()) {
|
||||
ctx.exp_cnt++;
|
||||
update_counters(ctx, event_vmem_gpr_lock);
|
||||
insert_wait_entry(ctx, instr->operands[2], event_vmem_gpr_lock);
|
||||
|
@ -702,35 +710,37 @@ void gen(Instruction* instr, wait_ctx& ctx)
|
|||
break;
|
||||
}
|
||||
case Format::SOPP: {
|
||||
if (instr->opcode == aco_opcode::s_sendmsg ||
|
||||
instr->opcode == aco_opcode::s_sendmsghalt)
|
||||
if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_sendmsghalt)
|
||||
update_counters(ctx, event_sendmsg);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
|
||||
void emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm imm)
|
||||
void
|
||||
emit_waitcnt(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, wait_imm imm)
|
||||
{
|
||||
if (imm.vs != wait_imm::unset_counter) {
|
||||
assert(ctx.chip_class >= GFX10);
|
||||
SOPK_instruction* waitcnt_vs = create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1);
|
||||
SOPK_instruction* waitcnt_vs =
|
||||
create_instruction<SOPK_instruction>(aco_opcode::s_waitcnt_vscnt, Format::SOPK, 0, 1);
|
||||
waitcnt_vs->definitions[0] = Definition(sgpr_null, s1);
|
||||
waitcnt_vs->imm = imm.vs;
|
||||
instructions.emplace_back(waitcnt_vs);
|
||||
imm.vs = wait_imm::unset_counter;
|
||||
}
|
||||
if (!imm.empty()) {
|
||||
SOPP_instruction* waitcnt = create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
|
||||
SOPP_instruction* waitcnt =
|
||||
create_instruction<SOPP_instruction>(aco_opcode::s_waitcnt, Format::SOPP, 0, 0);
|
||||
waitcnt->imm = imm.pack(ctx.chip_class);
|
||||
waitcnt->block = -1;
|
||||
instructions.emplace_back(waitcnt);
|
||||
}
|
||||
}
|
||||
|
||||
void handle_block(Program *program, Block& block, wait_ctx& ctx)
|
||||
void
|
||||
handle_block(Program* program, Block& block, wait_ctx& ctx)
|
||||
{
|
||||
std::vector<aco_ptr<Instruction>> new_instructions;
|
||||
|
||||
|
@ -763,7 +773,8 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
|
|||
|
||||
} /* end namespace */
|
||||
|
||||
void insert_wait_states(Program* program)
|
||||
void
|
||||
insert_wait_states(Program* program)
|
||||
{
|
||||
/* per BB ctx */
|
||||
std::vector<bool> done(program->blocks.size());
|
||||
|
@ -818,5 +829,4 @@ void insert_wait_states(Program* program)
|
|||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} // namespace aco
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -39,21 +39,22 @@ struct shader_io_state {
|
|||
uint8_t mask[VARYING_SLOT_MAX];
|
||||
Temp temps[VARYING_SLOT_MAX * 4u];
|
||||
|
||||
shader_io_state() {
|
||||
shader_io_state()
|
||||
{
|
||||
memset(mask, 0, sizeof(mask));
|
||||
std::fill_n(temps, VARYING_SLOT_MAX * 4u, Temp(0, RegClass::v1));
|
||||
}
|
||||
};
|
||||
|
||||
struct isel_context {
|
||||
const struct radv_nir_compiler_options *options;
|
||||
struct radv_shader_args *args;
|
||||
Program *program;
|
||||
nir_shader *shader;
|
||||
const struct radv_nir_compiler_options* options;
|
||||
struct radv_shader_args* args;
|
||||
Program* program;
|
||||
nir_shader* shader;
|
||||
uint32_t constant_data_offset;
|
||||
Block *block;
|
||||
Block* block;
|
||||
uint32_t first_temp_id;
|
||||
std::unordered_map<unsigned, std::array<Temp,NIR_MAX_VEC_COMPONENTS>> allocated_vec;
|
||||
std::unordered_map<unsigned, std::array<Temp, NIR_MAX_VEC_COMPONENTS>> allocated_vec;
|
||||
Stage stage;
|
||||
struct {
|
||||
bool has_branch;
|
||||
|
@ -66,7 +67,8 @@ struct isel_context {
|
|||
struct {
|
||||
bool is_divergent = false;
|
||||
} parent_if;
|
||||
bool exec_potentially_empty_discard = false; /* set to false when loop_nest_depth==0 && parent_if.is_divergent==false */
|
||||
bool exec_potentially_empty_discard =
|
||||
false; /* set to false when loop_nest_depth==0 && parent_if.is_divergent==false */
|
||||
uint16_t exec_potentially_empty_break_depth = UINT16_MAX;
|
||||
/* Set to false when loop_nest_depth==exec_potentially_empty_break_depth
|
||||
* and parent_if.is_divergent==false. Called _break but it's also used for
|
||||
|
@ -76,7 +78,7 @@ struct isel_context {
|
|||
} cf_info;
|
||||
|
||||
/* NIR range analysis. */
|
||||
struct hash_table *range_ht;
|
||||
struct hash_table* range_ht;
|
||||
nir_unsigned_upper_bound_config ub_config;
|
||||
|
||||
Temp arg_temps[AC_MAX_ARGS];
|
||||
|
@ -102,22 +104,19 @@ struct isel_context {
|
|||
shader_io_state outputs;
|
||||
};
|
||||
|
||||
inline Temp get_arg(isel_context *ctx, struct ac_arg arg)
|
||||
inline Temp
|
||||
get_arg(isel_context* ctx, struct ac_arg arg)
|
||||
{
|
||||
assert(arg.used);
|
||||
return ctx->arg_temps[arg.arg_index];
|
||||
}
|
||||
|
||||
void init_context(isel_context *ctx, nir_shader *shader);
|
||||
void cleanup_context(isel_context *ctx);
|
||||
void init_context(isel_context* ctx, nir_shader* shader);
|
||||
void cleanup_context(isel_context* ctx);
|
||||
|
||||
isel_context
|
||||
setup_isel_context(Program* program,
|
||||
unsigned shader_count,
|
||||
struct nir_shader *const *shaders,
|
||||
ac_shader_config* config,
|
||||
struct radv_shader_args *args,
|
||||
bool is_gs_copy_shader);
|
||||
isel_context setup_isel_context(Program* program, unsigned shader_count,
|
||||
struct nir_shader* const* shaders, ac_shader_config* config,
|
||||
struct radv_shader_args* args, bool is_gs_copy_shader);
|
||||
|
||||
} // namespace aco
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -23,6 +23,7 @@
|
|||
*/
|
||||
|
||||
#include "aco_interface.h"
|
||||
|
||||
#include "aco_ir.h"
|
||||
|
||||
#include "vulkan/radv_shader.h"
|
||||
|
@ -37,23 +38,33 @@
|
|||
static const std::array<aco_compiler_statistic_info, aco::num_statistics> statistic_infos = []()
|
||||
{
|
||||
std::array<aco_compiler_statistic_info, aco::num_statistics> ret{};
|
||||
ret[aco::statistic_hash] = aco_compiler_statistic_info{"Hash", "CRC32 hash of code and constant data"};
|
||||
ret[aco::statistic_instructions] = aco_compiler_statistic_info{"Instructions", "Instruction count"};
|
||||
ret[aco::statistic_copies] = aco_compiler_statistic_info{"Copies", "Copy instructions created for pseudo-instructions"};
|
||||
ret[aco::statistic_hash] =
|
||||
aco_compiler_statistic_info{"Hash", "CRC32 hash of code and constant data"};
|
||||
ret[aco::statistic_instructions] =
|
||||
aco_compiler_statistic_info{"Instructions", "Instruction count"};
|
||||
ret[aco::statistic_copies] =
|
||||
aco_compiler_statistic_info{"Copies", "Copy instructions created for pseudo-instructions"};
|
||||
ret[aco::statistic_branches] = aco_compiler_statistic_info{"Branches", "Branch instructions"};
|
||||
ret[aco::statistic_latency] = aco_compiler_statistic_info{"Latency", "Issue cycles plus stall cycles"};
|
||||
ret[aco::statistic_inv_throughput] = aco_compiler_statistic_info{"Inverse Throughput", "Estimated busy cycles to execute one wave"};
|
||||
ret[aco::statistic_vmem_clauses] = aco_compiler_statistic_info{"VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"};
|
||||
ret[aco::statistic_smem_clauses] = aco_compiler_statistic_info{"SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"};
|
||||
ret[aco::statistic_sgpr_presched] = aco_compiler_statistic_info{"Pre-Sched SGPRs", "SGPR usage before scheduling"};
|
||||
ret[aco::statistic_vgpr_presched] = aco_compiler_statistic_info{"Pre-Sched VGPRs", "VGPR usage before scheduling"};
|
||||
ret[aco::statistic_latency] =
|
||||
aco_compiler_statistic_info{"Latency", "Issue cycles plus stall cycles"};
|
||||
ret[aco::statistic_inv_throughput] = aco_compiler_statistic_info{
|
||||
"Inverse Throughput", "Estimated busy cycles to execute one wave"};
|
||||
ret[aco::statistic_vmem_clauses] = aco_compiler_statistic_info{
|
||||
"VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"};
|
||||
ret[aco::statistic_smem_clauses] = aco_compiler_statistic_info{
|
||||
"SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"};
|
||||
ret[aco::statistic_sgpr_presched] =
|
||||
aco_compiler_statistic_info{"Pre-Sched SGPRs", "SGPR usage before scheduling"};
|
||||
ret[aco::statistic_vgpr_presched] =
|
||||
aco_compiler_statistic_info{"Pre-Sched VGPRs", "VGPR usage before scheduling"};
|
||||
return ret;
|
||||
}();
|
||||
|
||||
const unsigned aco_num_statistics = aco::num_statistics;
|
||||
const aco_compiler_statistic_info *aco_statistic_infos = statistic_infos.data();
|
||||
const aco_compiler_statistic_info* aco_statistic_infos = statistic_infos.data();
|
||||
|
||||
static void validate(aco::Program *program)
|
||||
static void
|
||||
validate(aco::Program* program)
|
||||
{
|
||||
if (!(aco::debug_flags & aco::DEBUG_VALIDATE_IR))
|
||||
return;
|
||||
|
@ -62,10 +73,9 @@ static void validate(aco::Program *program)
|
|||
assert(is_valid);
|
||||
}
|
||||
|
||||
void aco_compile_shader(unsigned shader_count,
|
||||
struct nir_shader *const *shaders,
|
||||
struct radv_shader_binary **binary,
|
||||
struct radv_shader_args *args)
|
||||
void
|
||||
aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders,
|
||||
struct radv_shader_binary** binary, struct radv_shader_args* args)
|
||||
{
|
||||
aco::init();
|
||||
|
||||
|
@ -116,11 +126,11 @@ void aco_compile_shader(unsigned shader_count,
|
|||
|
||||
std::string llvm_ir;
|
||||
if (args->options->record_ir) {
|
||||
char *data = NULL;
|
||||
char* data = NULL;
|
||||
size_t size = 0;
|
||||
u_memstream mem;
|
||||
if (u_memstream_open(&mem, &data, &size)) {
|
||||
FILE *const memf = u_memstream_get(&mem);
|
||||
FILE* const memf = u_memstream_get(&mem);
|
||||
aco_print_program(program.get(), memf);
|
||||
fputc(0, memf);
|
||||
u_memstream_close(&mem);
|
||||
|
@ -137,8 +147,7 @@ void aco_compile_shader(unsigned shader_count,
|
|||
aco_print_program(program.get(), stderr, live_vars, aco::print_live_vars | aco::print_kill);
|
||||
|
||||
if (!args->is_trap_handler_shader) {
|
||||
if (!args->options->disable_optimizations &&
|
||||
!(aco::debug_flags & aco::DEBUG_NO_SCHED))
|
||||
if (!args->options->disable_optimizations && !(aco::debug_flags & aco::DEBUG_NO_SCHED))
|
||||
aco::schedule_program(program.get(), live_vars);
|
||||
validate(program.get());
|
||||
|
||||
|
@ -189,11 +198,11 @@ void aco_compile_shader(unsigned shader_count,
|
|||
|
||||
std::string disasm;
|
||||
if (get_disasm) {
|
||||
char *data = NULL;
|
||||
char* data = NULL;
|
||||
size_t disasm_size = 0;
|
||||
struct u_memstream mem;
|
||||
if (u_memstream_open(&mem, &data, &disasm_size)) {
|
||||
FILE *const memf = u_memstream_get(&mem);
|
||||
FILE* const memf = u_memstream_get(&mem);
|
||||
aco::print_asm(program.get(), code, exec_size / 4u, memf);
|
||||
fputc(0, memf);
|
||||
u_memstream_close(&mem);
|
||||
|
@ -214,10 +223,10 @@ void aco_compile_shader(unsigned shader_count,
|
|||
* directly for the disk cache. Uninitialized data can appear because of
|
||||
* padding in the struct or because legacy_binary->data can be at an offset
|
||||
* from the start less than sizeof(radv_shader_binary_legacy). */
|
||||
radv_shader_binary_legacy* legacy_binary = (radv_shader_binary_legacy*) calloc(size, 1);
|
||||
radv_shader_binary_legacy* legacy_binary = (radv_shader_binary_legacy*)calloc(size, 1);
|
||||
|
||||
legacy_binary->base.type = RADV_BINARY_TYPE_LEGACY;
|
||||
legacy_binary->base.stage = shaders[shader_count-1]->info.stage;
|
||||
legacy_binary->base.stage = shaders[shader_count - 1]->info.stage;
|
||||
legacy_binary->base.is_gs_copy_shader = args->is_gs_copy_shader;
|
||||
legacy_binary->base.total_size = size;
|
||||
|
||||
|
@ -225,7 +234,8 @@ void aco_compile_shader(unsigned shader_count,
|
|||
memcpy(legacy_binary->data, program->statistics, aco::num_statistics * sizeof(uint32_t));
|
||||
legacy_binary->stats_size = stats_size;
|
||||
|
||||
memcpy(legacy_binary->data + legacy_binary->stats_size, code.data(), code.size() * sizeof(uint32_t));
|
||||
memcpy(legacy_binary->data + legacy_binary->stats_size, code.data(),
|
||||
code.size() * sizeof(uint32_t));
|
||||
legacy_binary->exec_size = exec_size;
|
||||
legacy_binary->code_size = code.size() * sizeof(uint32_t);
|
||||
|
||||
|
@ -233,12 +243,15 @@ void aco_compile_shader(unsigned shader_count,
|
|||
legacy_binary->disasm_size = 0;
|
||||
legacy_binary->ir_size = llvm_ir.size();
|
||||
|
||||
llvm_ir.copy((char*) legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size, llvm_ir.size());
|
||||
llvm_ir.copy((char*)legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size,
|
||||
llvm_ir.size());
|
||||
|
||||
if (get_disasm) {
|
||||
disasm.copy((char*) legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size + llvm_ir.size(), disasm.size());
|
||||
disasm.copy((char*)legacy_binary->data + legacy_binary->stats_size +
|
||||
legacy_binary->code_size + llvm_ir.size(),
|
||||
disasm.size());
|
||||
legacy_binary->disasm_size = disasm.size();
|
||||
}
|
||||
|
||||
*binary = (radv_shader_binary*) legacy_binary;
|
||||
*binary = (radv_shader_binary*)legacy_binary;
|
||||
}
|
||||
|
|
|
@ -39,12 +39,10 @@ struct aco_compiler_statistic_info {
|
|||
};
|
||||
|
||||
extern const unsigned aco_num_statistics;
|
||||
extern const struct aco_compiler_statistic_info *aco_statistic_infos;
|
||||
extern const struct aco_compiler_statistic_info* aco_statistic_infos;
|
||||
|
||||
void aco_compile_shader(unsigned shader_count,
|
||||
struct nir_shader *const *shaders,
|
||||
struct radv_shader_binary** binary,
|
||||
struct radv_shader_args *args);
|
||||
void aco_compile_shader(unsigned shader_count, struct nir_shader* const* shaders,
|
||||
struct radv_shader_binary** binary, struct radv_shader_args* args);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -32,39 +32,40 @@ namespace aco {
|
|||
|
||||
uint64_t debug_flags = 0;
|
||||
|
||||
static const struct debug_control aco_debug_options[] = {
|
||||
{"validateir", DEBUG_VALIDATE_IR},
|
||||
{"validatera", DEBUG_VALIDATE_RA},
|
||||
{"perfwarn", DEBUG_PERFWARN},
|
||||
{"force-waitcnt", DEBUG_FORCE_WAITCNT},
|
||||
{"novn", DEBUG_NO_VN},
|
||||
{"noopt", DEBUG_NO_OPT},
|
||||
{"nosched", DEBUG_NO_SCHED},
|
||||
{"perfinfo", DEBUG_PERF_INFO},
|
||||
{"liveinfo", DEBUG_LIVE_INFO},
|
||||
{NULL, 0}
|
||||
};
|
||||
static const struct debug_control aco_debug_options[] = {{"validateir", DEBUG_VALIDATE_IR},
|
||||
{"validatera", DEBUG_VALIDATE_RA},
|
||||
{"perfwarn", DEBUG_PERFWARN},
|
||||
{"force-waitcnt", DEBUG_FORCE_WAITCNT},
|
||||
{"novn", DEBUG_NO_VN},
|
||||
{"noopt", DEBUG_NO_OPT},
|
||||
{"nosched", DEBUG_NO_SCHED},
|
||||
{"perfinfo", DEBUG_PERF_INFO},
|
||||
{"liveinfo", DEBUG_LIVE_INFO},
|
||||
{NULL, 0}};
|
||||
|
||||
static once_flag init_once_flag = ONCE_FLAG_INIT;
|
||||
|
||||
static void init_once()
|
||||
static void
|
||||
init_once()
|
||||
{
|
||||
debug_flags = parse_debug_string(getenv("ACO_DEBUG"), aco_debug_options);
|
||||
|
||||
#ifndef NDEBUG
|
||||
#ifndef NDEBUG
|
||||
/* enable some flags by default on debug builds */
|
||||
debug_flags |= aco::DEBUG_VALIDATE_IR;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
void init()
|
||||
void
|
||||
init()
|
||||
{
|
||||
call_once(&init_once_flag, init_once);
|
||||
}
|
||||
|
||||
void init_program(Program *program, Stage stage, struct radv_shader_info *info,
|
||||
enum chip_class chip_class, enum radeon_family family,
|
||||
bool wgp_mode, ac_shader_config *config)
|
||||
void
|
||||
init_program(Program* program, Stage stage, struct radv_shader_info* info,
|
||||
enum chip_class chip_class, enum radeon_family family, bool wgp_mode,
|
||||
ac_shader_config* config)
|
||||
{
|
||||
program->stage = stage;
|
||||
program->config = config;
|
||||
|
@ -72,24 +73,12 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
|
|||
program->chip_class = chip_class;
|
||||
if (family == CHIP_UNKNOWN) {
|
||||
switch (chip_class) {
|
||||
case GFX6:
|
||||
program->family = CHIP_TAHITI;
|
||||
break;
|
||||
case GFX7:
|
||||
program->family = CHIP_BONAIRE;
|
||||
break;
|
||||
case GFX8:
|
||||
program->family = CHIP_POLARIS10;
|
||||
break;
|
||||
case GFX9:
|
||||
program->family = CHIP_VEGA10;
|
||||
break;
|
||||
case GFX10:
|
||||
program->family = CHIP_NAVI10;
|
||||
break;
|
||||
default:
|
||||
program->family = CHIP_UNKNOWN;
|
||||
break;
|
||||
case GFX6: program->family = CHIP_TAHITI; break;
|
||||
case GFX7: program->family = CHIP_BONAIRE; break;
|
||||
case GFX8: program->family = CHIP_POLARIS10; break;
|
||||
case GFX9: program->family = CHIP_VEGA10; break;
|
||||
case GFX10: program->family = CHIP_NAVI10; break;
|
||||
default: program->family = CHIP_UNKNOWN; break;
|
||||
}
|
||||
} else {
|
||||
program->family = family;
|
||||
|
@ -98,7 +87,8 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
|
|||
program->lane_mask = program->wave_size == 32 ? s1 : s2;
|
||||
|
||||
program->dev.lds_encoding_granule = chip_class >= GFX7 ? 512 : 256;
|
||||
program->dev.lds_alloc_granule = chip_class >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
|
||||
program->dev.lds_alloc_granule =
|
||||
chip_class >= GFX10_3 ? 1024 : program->dev.lds_encoding_granule;
|
||||
program->dev.lds_limit = chip_class >= GFX7 ? 65536 : 32768;
|
||||
/* apparently gfx702 also has 16-bank LDS but I can't find a family for that */
|
||||
program->dev.has_16bank_lds = family == CHIP_KABINI || family == CHIP_STONEY;
|
||||
|
@ -111,7 +101,8 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
|
|||
program->dev.physical_sgprs = 5120; /* doesn't matter as long as it's at least 128 * 40 */
|
||||
program->dev.physical_vgprs = program->wave_size == 32 ? 1024 : 512;
|
||||
program->dev.sgpr_alloc_granule = 128;
|
||||
program->dev.sgpr_limit = 108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
|
||||
program->dev.sgpr_limit =
|
||||
108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */
|
||||
if (chip_class >= GFX10_3)
|
||||
program->dev.vgpr_alloc_granule = program->wave_size == 32 ? 16 : 8;
|
||||
else
|
||||
|
@ -145,18 +136,14 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
|
|||
/* GFX9 APUS */
|
||||
case CHIP_RAVEN:
|
||||
case CHIP_RAVEN2:
|
||||
case CHIP_RENOIR:
|
||||
program->dev.xnack_enabled = true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
case CHIP_RENOIR: program->dev.xnack_enabled = true; break;
|
||||
default: break;
|
||||
}
|
||||
|
||||
program->dev.sram_ecc_enabled = program->family == CHIP_ARCTURUS;
|
||||
/* apparently gfx702 also has fast v_fma_f32 but I can't find a family for that */
|
||||
program->dev.has_fast_fma32 = program->chip_class >= GFX9;
|
||||
if (program->family == CHIP_TAHITI ||
|
||||
program->family == CHIP_CARRIZO ||
|
||||
if (program->family == CHIP_TAHITI || program->family == CHIP_CARRIZO ||
|
||||
program->family == CHIP_HAWAII)
|
||||
program->dev.has_fast_fma32 = true;
|
||||
|
||||
|
@ -176,29 +163,24 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info,
|
|||
program->next_fp_mode.round32 = fp_round_ne;
|
||||
}
|
||||
|
||||
memory_sync_info get_sync_info(const Instruction* instr)
|
||||
memory_sync_info
|
||||
get_sync_info(const Instruction* instr)
|
||||
{
|
||||
switch (instr->format) {
|
||||
case Format::SMEM:
|
||||
return instr->smem().sync;
|
||||
case Format::MUBUF:
|
||||
return instr->mubuf().sync;
|
||||
case Format::MIMG:
|
||||
return instr->mimg().sync;
|
||||
case Format::MTBUF:
|
||||
return instr->mtbuf().sync;
|
||||
case Format::SMEM: return instr->smem().sync;
|
||||
case Format::MUBUF: return instr->mubuf().sync;
|
||||
case Format::MIMG: return instr->mimg().sync;
|
||||
case Format::MTBUF: return instr->mtbuf().sync;
|
||||
case Format::FLAT:
|
||||
case Format::GLOBAL:
|
||||
case Format::SCRATCH:
|
||||
return instr->flatlike().sync;
|
||||
case Format::DS:
|
||||
return instr->ds().sync;
|
||||
default:
|
||||
return memory_sync_info();
|
||||
case Format::SCRATCH: return instr->flatlike().sync;
|
||||
case Format::DS: return instr->ds().sync;
|
||||
default: return memory_sync_info();
|
||||
}
|
||||
}
|
||||
|
||||
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra)
|
||||
bool
|
||||
can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_ra)
|
||||
{
|
||||
if (!instr->isVALU())
|
||||
return false;
|
||||
|
@ -218,7 +200,7 @@ bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_r
|
|||
if (vop3.omod && chip < GFX9)
|
||||
return false;
|
||||
|
||||
//TODO: return true if we know we will use vcc
|
||||
// TODO: return true if we know we will use vcc
|
||||
if (!pre_ra && instr->definitions.size() >= 2)
|
||||
return false;
|
||||
|
||||
|
@ -244,38 +226,36 @@ bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr, bool pre_r
|
|||
return false;
|
||||
}
|
||||
|
||||
bool is_mac = instr->opcode == aco_opcode::v_mac_f32 ||
|
||||
instr->opcode == aco_opcode::v_mac_f16 ||
|
||||
instr->opcode == aco_opcode::v_fmac_f32 ||
|
||||
instr->opcode == aco_opcode::v_fmac_f16;
|
||||
bool is_mac = instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_mac_f16 ||
|
||||
instr->opcode == aco_opcode::v_fmac_f32 || instr->opcode == aco_opcode::v_fmac_f16;
|
||||
|
||||
if (chip != GFX8 && is_mac)
|
||||
return false;
|
||||
|
||||
//TODO: return true if we know we will use vcc
|
||||
// TODO: return true if we know we will use vcc
|
||||
if (!pre_ra && instr->isVOPC())
|
||||
return false;
|
||||
if (!pre_ra && instr->operands.size() >= 3 && !is_mac)
|
||||
return false;
|
||||
|
||||
return instr->opcode != aco_opcode::v_madmk_f32 &&
|
||||
instr->opcode != aco_opcode::v_madak_f32 &&
|
||||
instr->opcode != aco_opcode::v_madmk_f16 &&
|
||||
instr->opcode != aco_opcode::v_madak_f16 &&
|
||||
return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
|
||||
instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
|
||||
instr->opcode != aco_opcode::v_readfirstlane_b32 &&
|
||||
instr->opcode != aco_opcode::v_clrexcp &&
|
||||
instr->opcode != aco_opcode::v_swap_b32;
|
||||
instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
|
||||
}
|
||||
|
||||
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
|
||||
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr)
|
||||
aco_ptr<Instruction>
|
||||
convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr)
|
||||
{
|
||||
if (instr->isSDWA())
|
||||
return NULL;
|
||||
|
||||
aco_ptr<Instruction> tmp = std::move(instr);
|
||||
Format format = (Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA);
|
||||
instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
|
||||
Format format =
|
||||
(Format)(((uint16_t)tmp->format & ~(uint16_t)Format::VOP3) | (uint16_t)Format::SDWA);
|
||||
instr.reset(create_instruction<SDWA_instruction>(tmp->opcode, format, tmp->operands.size(),
|
||||
tmp->definitions.size()));
|
||||
std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
|
||||
std::copy(tmp->definitions.cbegin(), tmp->definitions.cend(), instr->definitions.begin());
|
||||
|
||||
|
@ -295,15 +275,9 @@ aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& inst
|
|||
break;
|
||||
|
||||
switch (instr->operands[i].bytes()) {
|
||||
case 1:
|
||||
sdwa.sel[i] = sdwa_ubyte;
|
||||
break;
|
||||
case 2:
|
||||
sdwa.sel[i] = sdwa_uword;
|
||||
break;
|
||||
case 4:
|
||||
sdwa.sel[i] = sdwa_udword;
|
||||
break;
|
||||
case 1: sdwa.sel[i] = sdwa_ubyte; break;
|
||||
case 2: sdwa.sel[i] = sdwa_uword; break;
|
||||
case 4: sdwa.sel[i] = sdwa_udword; break;
|
||||
}
|
||||
}
|
||||
switch (instr->definitions[0].bytes()) {
|
||||
|
@ -315,9 +289,7 @@ aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& inst
|
|||
sdwa.dst_sel = sdwa_uword;
|
||||
sdwa.dst_preserve = true;
|
||||
break;
|
||||
case 4:
|
||||
sdwa.dst_sel = sdwa_udword;
|
||||
break;
|
||||
case 4: sdwa.dst_sel = sdwa_udword; break;
|
||||
}
|
||||
|
||||
if (instr->definitions[0].getTemp().type() == RegType::sgpr && chip == GFX8)
|
||||
|
@ -330,7 +302,8 @@ aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& inst
|
|||
return tmp;
|
||||
}
|
||||
|
||||
bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high)
|
||||
bool
|
||||
can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high)
|
||||
{
|
||||
/* opsel is only GFX9+ */
|
||||
if ((high || idx == -1) && chip < GFX9)
|
||||
|
@ -362,21 +335,18 @@ bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high)
|
|||
case aco_opcode::v_lshlrev_b16_e64:
|
||||
case aco_opcode::v_lshrrev_b16_e64:
|
||||
case aco_opcode::v_ashrrev_i16_e64:
|
||||
case aco_opcode::v_mul_lo_u16_e64:
|
||||
return true;
|
||||
case aco_opcode::v_mul_lo_u16_e64: return true;
|
||||
case aco_opcode::v_pack_b32_f16:
|
||||
case aco_opcode::v_cvt_pknorm_i16_f16:
|
||||
case aco_opcode::v_cvt_pknorm_u16_f16:
|
||||
return idx != -1;
|
||||
case aco_opcode::v_cvt_pknorm_u16_f16: return idx != -1;
|
||||
case aco_opcode::v_mad_u32_u16:
|
||||
case aco_opcode::v_mad_i32_i16:
|
||||
return idx >= 0 && idx < 2;
|
||||
default:
|
||||
return false;
|
||||
case aco_opcode::v_mad_i32_i16: return idx >= 0 && idx < 2;
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
|
||||
uint32_t
|
||||
get_reduction_identity(ReduceOp op, unsigned idx)
|
||||
{
|
||||
switch (op) {
|
||||
case iadd8:
|
||||
|
@ -397,65 +367,44 @@ uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
|
|||
case umax8:
|
||||
case umax16:
|
||||
case umax32:
|
||||
case umax64:
|
||||
return 0;
|
||||
case umax64: return 0;
|
||||
case imul8:
|
||||
case imul16:
|
||||
case imul32:
|
||||
case imul64:
|
||||
return idx ? 0 : 1;
|
||||
case fmul16:
|
||||
return 0x3c00u; /* 1.0 */
|
||||
case fmul32:
|
||||
return 0x3f800000u; /* 1.0 */
|
||||
case fmul64:
|
||||
return idx ? 0x3ff00000u : 0u; /* 1.0 */
|
||||
case imin8:
|
||||
return INT8_MAX;
|
||||
case imin16:
|
||||
return INT16_MAX;
|
||||
case imin32:
|
||||
return INT32_MAX;
|
||||
case imin64:
|
||||
return idx ? 0x7fffffffu : 0xffffffffu;
|
||||
case imax8:
|
||||
return INT8_MIN;
|
||||
case imax16:
|
||||
return INT16_MIN;
|
||||
case imax32:
|
||||
return INT32_MIN;
|
||||
case imax64:
|
||||
return idx ? 0x80000000u : 0;
|
||||
case imul64: return idx ? 0 : 1;
|
||||
case fmul16: return 0x3c00u; /* 1.0 */
|
||||
case fmul32: return 0x3f800000u; /* 1.0 */
|
||||
case fmul64: return idx ? 0x3ff00000u : 0u; /* 1.0 */
|
||||
case imin8: return INT8_MAX;
|
||||
case imin16: return INT16_MAX;
|
||||
case imin32: return INT32_MAX;
|
||||
case imin64: return idx ? 0x7fffffffu : 0xffffffffu;
|
||||
case imax8: return INT8_MIN;
|
||||
case imax16: return INT16_MIN;
|
||||
case imax32: return INT32_MIN;
|
||||
case imax64: return idx ? 0x80000000u : 0;
|
||||
case umin8:
|
||||
case umin16:
|
||||
case iand8:
|
||||
case iand16:
|
||||
return 0xffffffffu;
|
||||
case iand16: return 0xffffffffu;
|
||||
case umin32:
|
||||
case umin64:
|
||||
case iand32:
|
||||
case iand64:
|
||||
return 0xffffffffu;
|
||||
case fmin16:
|
||||
return 0x7c00u; /* infinity */
|
||||
case fmin32:
|
||||
return 0x7f800000u; /* infinity */
|
||||
case fmin64:
|
||||
return idx ? 0x7ff00000u : 0u; /* infinity */
|
||||
case fmax16:
|
||||
return 0xfc00u; /* negative infinity */
|
||||
case fmax32:
|
||||
return 0xff800000u; /* negative infinity */
|
||||
case fmax64:
|
||||
return idx ? 0xfff00000u : 0u; /* negative infinity */
|
||||
default:
|
||||
unreachable("Invalid reduction operation");
|
||||
break;
|
||||
case iand64: return 0xffffffffu;
|
||||
case fmin16: return 0x7c00u; /* infinity */
|
||||
case fmin32: return 0x7f800000u; /* infinity */
|
||||
case fmin64: return idx ? 0x7ff00000u : 0u; /* infinity */
|
||||
case fmax16: return 0xfc00u; /* negative infinity */
|
||||
case fmax32: return 0xff800000u; /* negative infinity */
|
||||
case fmax64: return idx ? 0xfff00000u : 0u; /* negative infinity */
|
||||
default: unreachable("Invalid reduction operation"); break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool needs_exec_mask(const Instruction* instr) {
|
||||
bool
|
||||
needs_exec_mask(const Instruction* instr)
|
||||
{
|
||||
if (instr->isSALU() || instr->isBranch())
|
||||
return instr->reads_exec();
|
||||
if (instr->isSMEM())
|
||||
|
@ -479,10 +428,8 @@ bool needs_exec_mask(const Instruction* instr) {
|
|||
case aco_opcode::p_reload:
|
||||
case aco_opcode::p_logical_start:
|
||||
case aco_opcode::p_logical_end:
|
||||
case aco_opcode::p_startpgm:
|
||||
return false;
|
||||
default:
|
||||
break;
|
||||
case aco_opcode::p_startpgm: return false;
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -495,10 +442,11 @@ bool needs_exec_mask(const Instruction* instr) {
|
|||
return true;
|
||||
}
|
||||
|
||||
wait_imm::wait_imm() :
|
||||
vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter) {}
|
||||
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) :
|
||||
vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {}
|
||||
wait_imm::wait_imm() : vm(unset_counter), exp(unset_counter), lgkm(unset_counter), vs(unset_counter)
|
||||
{}
|
||||
wait_imm::wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_)
|
||||
: vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
|
||||
{}
|
||||
|
||||
wait_imm::wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter)
|
||||
{
|
||||
|
@ -513,7 +461,8 @@ wait_imm::wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter)
|
|||
lgkm |= (packed >> 8) & 0x30;
|
||||
}
|
||||
|
||||
uint16_t wait_imm::pack(enum chip_class chip) const
|
||||
uint16_t
|
||||
wait_imm::pack(enum chip_class chip) const
|
||||
{
|
||||
uint16_t imm = 0;
|
||||
assert(exp == unset_counter || exp <= 0x7);
|
||||
|
@ -536,13 +485,16 @@ uint16_t wait_imm::pack(enum chip_class chip) const
|
|||
break;
|
||||
}
|
||||
if (chip < GFX9 && vm == wait_imm::unset_counter)
|
||||
imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the architecture when interpreting the immediate */
|
||||
imm |= 0xc000; /* should have no effect on pre-GFX9 and now we won't have to worry about the
|
||||
architecture when interpreting the immediate */
|
||||
if (chip < GFX10 && lgkm == wait_imm::unset_counter)
|
||||
imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the architecture when interpreting the immediate */
|
||||
imm |= 0x3000; /* should have no effect on pre-GFX10 and now we won't have to worry about the
|
||||
architecture when interpreting the immediate */
|
||||
return imm;
|
||||
}
|
||||
|
||||
bool wait_imm::combine(const wait_imm& other)
|
||||
bool
|
||||
wait_imm::combine(const wait_imm& other)
|
||||
{
|
||||
bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
|
||||
vm = std::min(vm, other.vm);
|
||||
|
@ -552,17 +504,21 @@ bool wait_imm::combine(const wait_imm& other)
|
|||
return changed;
|
||||
}
|
||||
|
||||
bool wait_imm::empty() const
|
||||
bool
|
||||
wait_imm::empty() const
|
||||
{
|
||||
return vm == unset_counter && exp == unset_counter &&
|
||||
lgkm == unset_counter && vs == unset_counter;
|
||||
return vm == unset_counter && exp == unset_counter && lgkm == unset_counter &&
|
||||
vs == unset_counter;
|
||||
}
|
||||
|
||||
bool should_form_clause(const Instruction *a, const Instruction *b)
|
||||
bool
|
||||
should_form_clause(const Instruction* a, const Instruction* b)
|
||||
{
|
||||
/* Vertex attribute loads from the same binding likely load from similar addresses */
|
||||
unsigned a_vtx_binding = a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0);
|
||||
unsigned b_vtx_binding = b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0);
|
||||
unsigned a_vtx_binding =
|
||||
a->isMUBUF() ? a->mubuf().vtx_binding : (a->isMTBUF() ? a->mtbuf().vtx_binding : 0);
|
||||
unsigned b_vtx_binding =
|
||||
b->isMUBUF() ? b->mubuf().vtx_binding : (b->isMTBUF() ? b->mtbuf().vtx_binding : 0);
|
||||
if (a_vtx_binding && a_vtx_binding == b_vtx_binding)
|
||||
return true;
|
||||
|
||||
|
@ -584,4 +540,4 @@ bool should_form_clause(const Instruction *a, const Instruction *b)
|
|||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace aco
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -24,13 +24,15 @@
|
|||
*/
|
||||
|
||||
#include "aco_ir.h"
|
||||
|
||||
#include "util/u_math.h"
|
||||
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
namespace aco {
|
||||
RegisterDemand get_live_changes(aco_ptr<Instruction>& instr)
|
||||
RegisterDemand
|
||||
get_live_changes(aco_ptr<Instruction>& instr)
|
||||
{
|
||||
RegisterDemand changes;
|
||||
for (const Definition& def : instr->definitions) {
|
||||
|
@ -48,7 +50,8 @@ RegisterDemand get_live_changes(aco_ptr<Instruction>& instr)
|
|||
return changes;
|
||||
}
|
||||
|
||||
RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr)
|
||||
RegisterDemand
|
||||
get_temp_registers(aco_ptr<Instruction>& instr)
|
||||
{
|
||||
RegisterDemand temp_registers;
|
||||
|
||||
|
@ -67,7 +70,9 @@ RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr)
|
|||
return temp_registers;
|
||||
}
|
||||
|
||||
RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before)
|
||||
RegisterDemand
|
||||
get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr,
|
||||
aco_ptr<Instruction>& instr_before)
|
||||
{
|
||||
demand -= get_live_changes(instr);
|
||||
demand -= get_temp_registers(instr);
|
||||
|
@ -77,8 +82,9 @@ RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& in
|
|||
}
|
||||
|
||||
namespace {
|
||||
void process_live_temps_per_block(Program *program, live& lives, Block* block,
|
||||
std::set<unsigned>& worklist, std::vector<uint16_t>& phi_sgpr_ops)
|
||||
void
|
||||
process_live_temps_per_block(Program* program, live& lives, Block* block,
|
||||
std::set<unsigned>& worklist, std::vector<uint16_t>& phi_sgpr_ops)
|
||||
{
|
||||
std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index];
|
||||
RegisterDemand new_demand;
|
||||
|
@ -94,8 +100,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
|
|||
|
||||
/* traverse the instructions backwards */
|
||||
int idx;
|
||||
for (idx = block->instructions.size() -1; idx >= 0; idx--) {
|
||||
Instruction *insn = block->instructions[idx].get();
|
||||
for (idx = block->instructions.size() - 1; idx >= 0; idx--) {
|
||||
Instruction* insn = block->instructions[idx].get();
|
||||
if (is_phi(insn))
|
||||
break;
|
||||
|
||||
|
@ -131,8 +137,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
|
|||
for (Operand& op : insn->operands)
|
||||
op.setKill(false);
|
||||
|
||||
for (unsigned i = 0; i < insn->operands.size(); ++i)
|
||||
{
|
||||
for (unsigned i = 0; i < insn->operands.size(); ++i) {
|
||||
Operand& operand = insn->operands[i];
|
||||
if (!operand.isTemp())
|
||||
continue;
|
||||
|
@ -143,7 +148,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
|
|||
if (inserted) {
|
||||
operand.setFirstKill(true);
|
||||
for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
|
||||
if (insn->operands[j].isTemp() && insn->operands[j].tempId() == operand.tempId()) {
|
||||
if (insn->operands[j].isTemp() &&
|
||||
insn->operands[j].tempId() == operand.tempId()) {
|
||||
insn->operands[j].setFirstKill(false);
|
||||
insn->operands[j].setKill(true);
|
||||
}
|
||||
|
@ -167,7 +173,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
|
|||
int phi_idx = idx;
|
||||
while (phi_idx >= 0) {
|
||||
register_demand[phi_idx] = new_demand;
|
||||
Instruction *insn = block->instructions[phi_idx].get();
|
||||
Instruction* insn = block->instructions[phi_idx].get();
|
||||
|
||||
assert(is_phi(insn) && insn->definitions.size() == 1);
|
||||
if (!insn->definitions[0].isTemp()) {
|
||||
|
@ -196,7 +202,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
|
|||
|
||||
#ifndef NDEBUG
|
||||
if (preds.empty())
|
||||
aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t, block->index);
|
||||
aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t,
|
||||
block->index);
|
||||
#endif
|
||||
|
||||
for (unsigned pred_idx : preds) {
|
||||
|
@ -209,14 +216,13 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
|
|||
/* handle phi operands */
|
||||
phi_idx = idx;
|
||||
while (phi_idx >= 0) {
|
||||
Instruction *insn = block->instructions[phi_idx].get();
|
||||
Instruction* insn = block->instructions[phi_idx].get();
|
||||
assert(is_phi(insn));
|
||||
/* directly insert into the predecessors live-out set */
|
||||
std::vector<unsigned>& preds = insn->opcode == aco_opcode::p_phi
|
||||
? block->logical_preds
|
||||
: block->linear_preds;
|
||||
std::vector<unsigned>& preds =
|
||||
insn->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds;
|
||||
for (unsigned i = 0; i < preds.size(); ++i) {
|
||||
Operand &operand = insn->operands[i];
|
||||
Operand& operand = insn->operands[i];
|
||||
if (!operand.isTemp())
|
||||
continue;
|
||||
if (operand.isFixed() && operand.physReg() == vcc)
|
||||
|
@ -238,18 +244,19 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
|
|||
assert(block->index != 0 || (new_demand == RegisterDemand() && live.empty()));
|
||||
}
|
||||
|
||||
unsigned calc_waves_per_workgroup(Program *program)
|
||||
unsigned
|
||||
calc_waves_per_workgroup(Program* program)
|
||||
{
|
||||
/* When workgroup size is not known, just go with wave_size */
|
||||
unsigned workgroup_size = program->workgroup_size == UINT_MAX
|
||||
? program->wave_size
|
||||
: program->workgroup_size;
|
||||
unsigned workgroup_size =
|
||||
program->workgroup_size == UINT_MAX ? program->wave_size : program->workgroup_size;
|
||||
|
||||
return align(workgroup_size, program->wave_size) / program->wave_size;
|
||||
}
|
||||
} /* end namespace */
|
||||
|
||||
uint16_t get_extra_sgprs(Program *program)
|
||||
uint16_t
|
||||
get_extra_sgprs(Program* program)
|
||||
{
|
||||
if (program->chip_class >= GFX10) {
|
||||
assert(!program->needs_flat_scr);
|
||||
|
@ -275,26 +282,30 @@ uint16_t get_extra_sgprs(Program *program)
|
|||
}
|
||||
}
|
||||
|
||||
uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs)
|
||||
uint16_t
|
||||
get_sgpr_alloc(Program* program, uint16_t addressable_sgprs)
|
||||
{
|
||||
uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
|
||||
uint16_t granule = program->dev.sgpr_alloc_granule;
|
||||
return ALIGN_NPOT(std::max(sgprs, granule), granule);
|
||||
}
|
||||
|
||||
uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs)
|
||||
uint16_t
|
||||
get_vgpr_alloc(Program* program, uint16_t addressable_vgprs)
|
||||
{
|
||||
assert(addressable_vgprs <= program->dev.vgpr_limit);
|
||||
uint16_t granule = program->dev.vgpr_alloc_granule;
|
||||
return align(std::max(addressable_vgprs, granule), granule);
|
||||
}
|
||||
|
||||
unsigned round_down(unsigned a, unsigned b)
|
||||
unsigned
|
||||
round_down(unsigned a, unsigned b)
|
||||
{
|
||||
return a - (a % b);
|
||||
}
|
||||
|
||||
uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves)
|
||||
uint16_t
|
||||
get_addr_sgpr_from_waves(Program* program, uint16_t waves)
|
||||
{
|
||||
/* it's not possible to allocate more than 128 SGPRs */
|
||||
uint16_t sgprs = std::min(program->dev.physical_sgprs / waves, 128);
|
||||
|
@ -303,21 +314,24 @@ uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t waves)
|
|||
return std::min(sgprs, program->dev.sgpr_limit);
|
||||
}
|
||||
|
||||
uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t waves)
|
||||
uint16_t
|
||||
get_addr_vgpr_from_waves(Program* program, uint16_t waves)
|
||||
{
|
||||
uint16_t vgprs = program->dev.physical_vgprs / waves & ~(program->dev.vgpr_alloc_granule - 1);
|
||||
vgprs -= program->config->num_shared_vgprs / 2;
|
||||
return std::min(vgprs, program->dev.vgpr_limit);
|
||||
}
|
||||
|
||||
void calc_min_waves(Program* program)
|
||||
void
|
||||
calc_min_waves(Program* program)
|
||||
{
|
||||
unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
|
||||
unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
|
||||
program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
|
||||
}
|
||||
|
||||
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
|
||||
void
|
||||
update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
|
||||
{
|
||||
unsigned max_waves_per_simd = program->dev.max_wave64_per_simd * (64 / program->wave_size);
|
||||
unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1);
|
||||
|
@ -333,8 +347,10 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
|
|||
program->max_reg_demand = new_demand;
|
||||
} else {
|
||||
program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
|
||||
uint16_t vgpr_demand = get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
|
||||
program->num_waves = std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);
|
||||
uint16_t vgpr_demand =
|
||||
get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
|
||||
program->num_waves =
|
||||
std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);
|
||||
program->max_waves = max_waves_per_simd;
|
||||
|
||||
/* adjust max_waves for workgroup and LDS limits */
|
||||
|
@ -346,12 +362,15 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
|
|||
workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds);
|
||||
}
|
||||
if (waves_per_workgroup > 1 && program->chip_class < GFX10)
|
||||
workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */
|
||||
workgroups_per_cu_wgp = std::min(
|
||||
workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */
|
||||
|
||||
/* in cases like waves_per_workgroup=3 or lds=65536 and
|
||||
* waves_per_workgroup=1, we want the maximum possible number of waves per
|
||||
* SIMD and not the minimum. so DIV_ROUND_UP is used */
|
||||
program->max_waves = std::min<uint16_t>(program->max_waves, DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp));
|
||||
program->max_waves = std::min<uint16_t>(
|
||||
program->max_waves,
|
||||
DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp));
|
||||
|
||||
/* incorporate max_waves and calculate max_reg_demand */
|
||||
program->num_waves = std::min<uint16_t>(program->num_waves, program->max_waves);
|
||||
|
@ -360,7 +379,8 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
|
|||
}
|
||||
}
|
||||
|
||||
live live_var_analysis(Program* program)
|
||||
live
|
||||
live_var_analysis(Program* program)
|
||||
{
|
||||
live result;
|
||||
result.live_out.resize(program->blocks.size());
|
||||
|
@ -371,14 +391,16 @@ live live_var_analysis(Program* program)
|
|||
|
||||
program->needs_vcc = false;
|
||||
|
||||
/* this implementation assumes that the block idx corresponds to the block's position in program->blocks vector */
|
||||
/* this implementation assumes that the block idx corresponds to the block's position in
|
||||
* program->blocks vector */
|
||||
for (Block& block : program->blocks)
|
||||
worklist.insert(block.index);
|
||||
while (!worklist.empty()) {
|
||||
std::set<unsigned>::reverse_iterator b_it = worklist.rbegin();
|
||||
unsigned block_idx = *b_it;
|
||||
worklist.erase(block_idx);
|
||||
process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, phi_sgpr_ops);
|
||||
process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist,
|
||||
phi_sgpr_ops);
|
||||
new_demand.update(program->blocks[block_idx].register_demand);
|
||||
}
|
||||
|
||||
|
@ -389,5 +411,4 @@ live live_var_analysis(Program* program)
|
|||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} // namespace aco
|
||||
|
|
|
@ -47,7 +47,8 @@ struct ssa_state {
|
|||
std::vector<bool> visited;
|
||||
};
|
||||
|
||||
Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool before_write)
|
||||
Operand
|
||||
get_ssa(Program* program, unsigned block_idx, ssa_state* state, bool before_write)
|
||||
{
|
||||
if (!before_write) {
|
||||
auto it = state->writes.find(block_idx);
|
||||
|
@ -79,7 +80,8 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool bef
|
|||
Temp res = Temp(program->allocateTmp(program->lane_mask));
|
||||
state->latest[block_idx] = Operand(res);
|
||||
|
||||
aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)};
|
||||
aco_ptr<Pseudo_instruction> phi{
|
||||
create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)};
|
||||
for (unsigned i = 0; i < pred; i++)
|
||||
phi->operands[i] = get_ssa(program, block.linear_preds[i], state, false);
|
||||
phi->definitions[0] = Definition(res);
|
||||
|
@ -89,11 +91,11 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool bef
|
|||
}
|
||||
}
|
||||
|
||||
void insert_before_logical_end(Block *block, aco_ptr<Instruction> instr)
|
||||
void
|
||||
insert_before_logical_end(Block* block, aco_ptr<Instruction> instr)
|
||||
{
|
||||
auto IsLogicalEnd = [] (const aco_ptr<Instruction>& inst) -> bool {
|
||||
return inst->opcode == aco_opcode::p_logical_end;
|
||||
};
|
||||
auto IsLogicalEnd = [](const aco_ptr<Instruction>& inst) -> bool
|
||||
{ return inst->opcode == aco_opcode::p_logical_end; };
|
||||
auto it = std::find_if(block->instructions.crbegin(), block->instructions.crend(), IsLogicalEnd);
|
||||
|
||||
if (it == block->instructions.crend()) {
|
||||
|
@ -104,13 +106,13 @@ void insert_before_logical_end(Block *block, aco_ptr<Instruction> instr)
|
|||
}
|
||||
}
|
||||
|
||||
void build_merge_code(Program *program, Block *block, Definition dst, Operand prev, Operand cur)
|
||||
void
|
||||
build_merge_code(Program* program, Block* block, Definition dst, Operand prev, Operand cur)
|
||||
{
|
||||
Builder bld(program);
|
||||
|
||||
auto IsLogicalEnd = [] (const aco_ptr<Instruction>& instr) -> bool {
|
||||
return instr->opcode == aco_opcode::p_logical_end;
|
||||
};
|
||||
auto IsLogicalEnd = [](const aco_ptr<Instruction>& instr) -> bool
|
||||
{ return instr->opcode == aco_opcode::p_logical_end; };
|
||||
auto it = std::find_if(block->instructions.rbegin(), block->instructions.rend(), IsLogicalEnd);
|
||||
assert(it != block->instructions.rend());
|
||||
bld.reset(&block->instructions, std::prev(it.base()));
|
||||
|
@ -126,7 +128,8 @@ void build_merge_code(Program *program, Block *block, Definition dst, Operand pr
|
|||
if (!prev_is_constant) {
|
||||
if (!cur_is_constant) {
|
||||
Temp tmp1 = bld.tmp(bld.lm), tmp2 = bld.tmp(bld.lm);
|
||||
bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc), prev, Operand(exec, bld.lm));
|
||||
bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc), prev,
|
||||
Operand(exec, bld.lm));
|
||||
bld.sop2(Builder::s_and, Definition(tmp2), bld.def(s1, scc), cur, Operand(exec, bld.lm));
|
||||
bld.sop2(Builder::s_or, dst, bld.def(s1, scc), tmp1, tmp2);
|
||||
} else if (cur.constantValue()) {
|
||||
|
@ -151,7 +154,8 @@ void build_merge_code(Program *program, Block *block, Definition dst, Operand pr
|
|||
}
|
||||
}
|
||||
|
||||
void init_any_pred_defined(Program *program, ssa_state *state, Block *block, aco_ptr<Instruction>& phi)
|
||||
void
|
||||
init_any_pred_defined(Program* program, ssa_state* state, Block* block, aco_ptr<Instruction>& phi)
|
||||
{
|
||||
std::fill(state->any_pred_defined.begin(), state->any_pred_defined.end(), false);
|
||||
for (unsigned i = 0; i < block->logical_preds.size(); i++) {
|
||||
|
@ -178,7 +182,9 @@ void init_any_pred_defined(Program *program, ssa_state *state, Block *block, aco
|
|||
}
|
||||
}
|
||||
|
||||
void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block, aco_ptr<Instruction>& phi)
|
||||
void
|
||||
lower_divergent_bool_phi(Program* program, ssa_state* state, Block* block,
|
||||
aco_ptr<Instruction>& phi)
|
||||
{
|
||||
Builder bld(program);
|
||||
|
||||
|
@ -186,7 +192,8 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block,
|
|||
state->all_preds_uniform = !(block->kind & block_kind_merge) &&
|
||||
block->linear_preds.size() == block->logical_preds.size();
|
||||
for (unsigned pred : block->logical_preds)
|
||||
state->all_preds_uniform = state->all_preds_uniform && (program->blocks[pred].kind & block_kind_uniform);
|
||||
state->all_preds_uniform =
|
||||
state->all_preds_uniform && (program->blocks[pred].kind & block_kind_uniform);
|
||||
state->checked_preds_for_uniform = true;
|
||||
}
|
||||
|
||||
|
@ -230,7 +237,7 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block,
|
|||
bool uniform_merge = block->kind & block_kind_loop_header;
|
||||
|
||||
for (unsigned i = 0; i < phi->operands.size(); i++) {
|
||||
Block *pred = &program->blocks[block->logical_preds[i]];
|
||||
Block* pred = &program->blocks[block->logical_preds[i]];
|
||||
|
||||
bool need_get_ssa = !uniform_merge;
|
||||
if (block->kind & block_kind_loop_header && !(pred->kind & block_kind_uniform))
|
||||
|
@ -254,7 +261,8 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block,
|
|||
|
||||
unsigned num_preds = block->linear_preds.size();
|
||||
if (phi->operands.size() != num_preds) {
|
||||
Pseudo_instruction* new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, num_preds, 1)};
|
||||
Pseudo_instruction* new_phi{create_instruction<Pseudo_instruction>(
|
||||
aco_opcode::p_linear_phi, Format::PSEUDO, num_preds, 1)};
|
||||
new_phi->definitions[0] = phi->definitions[0];
|
||||
phi.reset(new_phi);
|
||||
} else {
|
||||
|
@ -268,7 +276,8 @@ void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block,
|
|||
return;
|
||||
}
|
||||
|
||||
void lower_subdword_phis(Program *program, Block *block, aco_ptr<Instruction>& phi)
|
||||
void
|
||||
lower_subdword_phis(Program* program, Block* block, aco_ptr<Instruction>& phi)
|
||||
{
|
||||
Builder bld(program);
|
||||
for (unsigned i = 0; i < phi->operands.size(); i++) {
|
||||
|
@ -278,21 +287,24 @@ void lower_subdword_phis(Program *program, Block *block, aco_ptr<Instruction>& p
|
|||
continue;
|
||||
|
||||
assert(phi->operands[i].isTemp());
|
||||
Block *pred = &program->blocks[block->logical_preds[i]];
|
||||
Block* pred = &program->blocks[block->logical_preds[i]];
|
||||
Temp phi_src = phi->operands[i].getTemp();
|
||||
|
||||
assert(phi_src.regClass().type() == RegType::sgpr);
|
||||
Temp tmp = bld.tmp(RegClass(RegType::vgpr, phi_src.size()));
|
||||
insert_before_logical_end(pred, bld.copy(Definition(tmp), phi_src).get_ptr());
|
||||
Temp new_phi_src = bld.tmp(phi->definitions[0].regClass());
|
||||
insert_before_logical_end(pred, bld.pseudo(aco_opcode::p_extract_vector, Definition(new_phi_src), tmp, Operand(0u)).get_ptr());
|
||||
insert_before_logical_end(
|
||||
pred, bld.pseudo(aco_opcode::p_extract_vector, Definition(new_phi_src), tmp, Operand(0u))
|
||||
.get_ptr());
|
||||
|
||||
phi->operands[i].setTemp(new_phi_src);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void lower_phis(Program* program)
|
||||
void
|
||||
lower_phis(Program* program)
|
||||
{
|
||||
ssa_state state;
|
||||
|
||||
|
@ -301,7 +313,8 @@ void lower_phis(Program* program)
|
|||
state.needs_init = true;
|
||||
for (aco_ptr<Instruction>& phi : block.instructions) {
|
||||
if (phi->opcode == aco_opcode::p_phi) {
|
||||
assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1 : phi->definitions[0].regClass() != s2);
|
||||
assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1
|
||||
: phi->definitions[0].regClass() != s2);
|
||||
if (phi->definitions[0].regClass() == program->lane_mask)
|
||||
lower_divergent_bool_phi(program, &state, &block, phi);
|
||||
else if (phi->definitions[0].regClass().is_subdword())
|
||||
|
@ -313,4 +326,4 @@ void lower_phis(Program* program)
|
|||
}
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace aco
|
||||
|
|
|
@ -53,32 +53,32 @@ struct copy {
|
|||
|
||||
struct merge_node {
|
||||
Operand value = Operand(); /* original value: can be an SSA-def or constant value */
|
||||
uint32_t index = -1u; /* index into the vector of merge sets */
|
||||
uint32_t index = -1u; /* index into the vector of merge sets */
|
||||
uint32_t defined_at = -1u; /* defining block */
|
||||
|
||||
/* we also remember two dominating defs with the same value: */
|
||||
Temp equal_anc_in = Temp(); /* within the same merge set */
|
||||
Temp equal_anc_in = Temp(); /* within the same merge set */
|
||||
Temp equal_anc_out = Temp(); /* from a different set */
|
||||
};
|
||||
|
||||
struct cssa_ctx {
|
||||
Program* program;
|
||||
std::vector<IDSet>& live_out; /* live-out sets per block */
|
||||
std::vector<IDSet>& live_out; /* live-out sets per block */
|
||||
std::vector<std::vector<copy>> parallelcopies; /* copies per block */
|
||||
std::vector<merge_set> merge_sets; /* each vector is one (ordered) merge set */
|
||||
std::vector<merge_set> merge_sets; /* each vector is one (ordered) merge set */
|
||||
std::unordered_map<uint32_t, merge_node> merge_node_table; /* tempid -> merge node */
|
||||
};
|
||||
|
||||
/* create (virtual) parallelcopies for each phi instruction and
|
||||
* already merge copy-definitions with phi-defs into merge sets */
|
||||
void collect_parallelcopies(cssa_ctx& ctx)
|
||||
void
|
||||
collect_parallelcopies(cssa_ctx& ctx)
|
||||
{
|
||||
ctx.parallelcopies.resize(ctx.program->blocks.size());
|
||||
Builder bld(ctx.program);
|
||||
for (Block& block : ctx.program->blocks) {
|
||||
for (aco_ptr<Instruction>& phi : block.instructions) {
|
||||
if (phi->opcode != aco_opcode::p_phi &&
|
||||
phi->opcode != aco_opcode::p_linear_phi)
|
||||
if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
|
||||
break;
|
||||
|
||||
const Definition& def = phi->definitions[0];
|
||||
|
@ -89,9 +89,8 @@ void collect_parallelcopies(cssa_ctx& ctx)
|
|||
if (!def.isTemp())
|
||||
continue;
|
||||
|
||||
std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ?
|
||||
block.logical_preds :
|
||||
block.linear_preds;
|
||||
std::vector<unsigned>& preds =
|
||||
phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
|
||||
uint32_t index = ctx.merge_sets.size();
|
||||
merge_set set;
|
||||
|
||||
|
@ -151,8 +150,8 @@ void collect_parallelcopies(cssa_ctx& ctx)
|
|||
}
|
||||
|
||||
/* check whether the definition of a comes after b. */
|
||||
inline
|
||||
bool defined_after(cssa_ctx& ctx, Temp a, Temp b)
|
||||
inline bool
|
||||
defined_after(cssa_ctx& ctx, Temp a, Temp b)
|
||||
{
|
||||
merge_node& node_a = ctx.merge_node_table[a.id()];
|
||||
merge_node& node_b = ctx.merge_node_table[b.id()];
|
||||
|
@ -163,25 +162,24 @@ bool defined_after(cssa_ctx& ctx, Temp a, Temp b)
|
|||
}
|
||||
|
||||
/* check whether a dominates b where b is defined after a */
|
||||
inline
|
||||
bool dominates(cssa_ctx& ctx, Temp a, Temp b)
|
||||
inline bool
|
||||
dominates(cssa_ctx& ctx, Temp a, Temp b)
|
||||
{
|
||||
assert(defined_after(ctx, b, a));
|
||||
merge_node& node_a = ctx.merge_node_table[a.id()];
|
||||
merge_node& node_b = ctx.merge_node_table[b.id()];
|
||||
unsigned idom = node_b.defined_at;
|
||||
while (idom > node_a.defined_at)
|
||||
idom = b.regClass().type() == RegType::vgpr ?
|
||||
ctx.program->blocks[idom].logical_idom :
|
||||
ctx.program->blocks[idom].linear_idom;
|
||||
idom = b.regClass().type() == RegType::vgpr ? ctx.program->blocks[idom].logical_idom
|
||||
: ctx.program->blocks[idom].linear_idom;
|
||||
|
||||
return idom == node_a.defined_at;
|
||||
}
|
||||
|
||||
/* check intersection between var and parent:
|
||||
* We already know that parent dominates var. */
|
||||
inline
|
||||
bool intersects(cssa_ctx& ctx, Temp var, Temp parent)
|
||||
inline bool
|
||||
intersects(cssa_ctx& ctx, Temp var, Temp parent)
|
||||
{
|
||||
merge_node& node_var = ctx.merge_node_table[var.id()];
|
||||
merge_node& node_parent = ctx.merge_node_table[parent.id()];
|
||||
|
@ -196,9 +194,9 @@ bool intersects(cssa_ctx& ctx, Temp var, Temp parent)
|
|||
/* parent is defined in a different block than var */
|
||||
if (node_parent.defined_at < node_var.defined_at) {
|
||||
/* if the parent is not live-in, they don't interfere */
|
||||
std::vector<uint32_t>& preds = var.type() == RegType::vgpr ?
|
||||
ctx.program->blocks[block_idx].logical_preds :
|
||||
ctx.program->blocks[block_idx].linear_preds;
|
||||
std::vector<uint32_t>& preds = var.type() == RegType::vgpr
|
||||
? ctx.program->blocks[block_idx].logical_preds
|
||||
: ctx.program->blocks[block_idx].linear_preds;
|
||||
for (uint32_t pred : preds) {
|
||||
if (!ctx.live_out[pred].count(parent.id()))
|
||||
return false;
|
||||
|
@ -246,8 +244,8 @@ bool intersects(cssa_ctx& ctx, Temp var, Temp parent)
|
|||
/* check interference between var and parent:
|
||||
* i.e. they have different values and intersect.
|
||||
* If parent and var share the same value, also updates the equal ancestor. */
|
||||
inline
|
||||
bool interference(cssa_ctx& ctx, Temp var, Temp parent)
|
||||
inline bool
|
||||
interference(cssa_ctx& ctx, Temp var, Temp parent)
|
||||
{
|
||||
assert(var != parent);
|
||||
merge_node& node_var = ctx.merge_node_table[var.id()];
|
||||
|
@ -281,13 +279,14 @@ bool interference(cssa_ctx& ctx, Temp var, Temp parent)
|
|||
|
||||
/* tries to merge set_b into set_a of given temporary and
|
||||
* drops that temporary as it is being coalesced */
|
||||
bool try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)
|
||||
bool
|
||||
try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)
|
||||
{
|
||||
auto def_node_it = ctx.merge_node_table.find(dst.id());
|
||||
uint32_t index = def_node_it->second.index;
|
||||
merge_set& set_a = ctx.merge_sets[index];
|
||||
std::vector<Temp> dom; /* stack of the traversal */
|
||||
merge_set union_set; /* the new merged merge-set */
|
||||
merge_set union_set; /* the new merged merge-set */
|
||||
uint32_t i_a = 0;
|
||||
uint32_t i_b = 0;
|
||||
|
||||
|
@ -335,7 +334,8 @@ bool try_merge_merge_set(cssa_ctx& ctx, Temp dst, merge_set& set_b)
|
|||
}
|
||||
|
||||
/* returns true if the copy can safely be omitted */
|
||||
bool try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)
|
||||
bool
|
||||
try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)
|
||||
{
|
||||
/* we can only coalesce temporaries */
|
||||
if (!copy.op.isTemp())
|
||||
|
@ -348,11 +348,9 @@ bool try_coalesce_copy(cssa_ctx& ctx, copy copy, uint32_t block_idx)
|
|||
uint32_t pred = block_idx;
|
||||
do {
|
||||
block_idx = pred;
|
||||
pred = copy.op.regClass().type() == RegType::vgpr ?
|
||||
ctx.program->blocks[pred].logical_idom :
|
||||
ctx.program->blocks[pred].linear_idom;
|
||||
} while (block_idx != pred &&
|
||||
ctx.live_out[pred].count(copy.op.tempId()));
|
||||
pred = copy.op.regClass().type() == RegType::vgpr ? ctx.program->blocks[pred].logical_idom
|
||||
: ctx.program->blocks[pred].linear_idom;
|
||||
} while (block_idx != pred && ctx.live_out[pred].count(copy.op.tempId()));
|
||||
op_node.defined_at = block_idx;
|
||||
op_node.value = copy.op;
|
||||
}
|
||||
|
@ -385,7 +383,8 @@ struct ltg_node {
|
|||
|
||||
/* emit the copies in an order that does not
|
||||
* create interferences within a merge-set */
|
||||
void emit_copies_block(Builder bld, std::map<uint32_t, ltg_node>& ltg, RegType type)
|
||||
void
|
||||
emit_copies_block(Builder bld, std::map<uint32_t, ltg_node>& ltg, RegType type)
|
||||
{
|
||||
auto&& it = ltg.begin();
|
||||
while (it != ltg.end()) {
|
||||
|
@ -410,16 +409,16 @@ void emit_copies_block(Builder bld, std::map<uint32_t, ltg_node>& ltg, RegType t
|
|||
}
|
||||
|
||||
/* count the number of remaining circular dependencies */
|
||||
unsigned num = std::count_if(ltg.begin(), ltg.end(), [&] (auto& n){
|
||||
return n.second.cp.def.regClass().type() == type;
|
||||
});
|
||||
unsigned num = std::count_if(ltg.begin(), ltg.end(),
|
||||
[&](auto& n) { return n.second.cp.def.regClass().type() == type; });
|
||||
|
||||
/* if there are circular dependencies, we just emit them as single parallelcopy */
|
||||
if (num) {
|
||||
// TODO: this should be restricted to a feasible number of registers
|
||||
// and otherwise use a temporary to avoid having to reload more (spilled)
|
||||
// variables than we have registers.
|
||||
aco_ptr<Pseudo_instruction> copy{create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)};
|
||||
aco_ptr<Pseudo_instruction> copy{create_instruction<Pseudo_instruction>(
|
||||
aco_opcode::p_parallelcopy, Format::PSEUDO, num, num)};
|
||||
it = ltg.begin();
|
||||
for (unsigned i = 0; i < num; i++) {
|
||||
while (it->second.cp.def.regClass().type() != type)
|
||||
|
@ -435,7 +434,8 @@ void emit_copies_block(Builder bld, std::map<uint32_t, ltg_node>& ltg, RegType t
|
|||
|
||||
/* either emits or coalesces all parallelcopies and
|
||||
* renames the phi-operands accordingly. */
|
||||
void emit_parallelcopies(cssa_ctx& ctx)
|
||||
void
|
||||
emit_parallelcopies(cssa_ctx& ctx)
|
||||
{
|
||||
std::unordered_map<uint32_t, Operand> renames;
|
||||
|
||||
|
@ -476,9 +476,8 @@ void emit_parallelcopies(cssa_ctx& ctx)
|
|||
Block& block = ctx.program->blocks[i];
|
||||
|
||||
/* emit VGPR copies */
|
||||
auto IsLogicalEnd = [] (const aco_ptr<Instruction>& inst) -> bool {
|
||||
return inst->opcode == aco_opcode::p_logical_end;
|
||||
};
|
||||
auto IsLogicalEnd = [](const aco_ptr<Instruction>& inst) -> bool
|
||||
{ return inst->opcode == aco_opcode::p_logical_end; };
|
||||
auto it = std::find_if(block.instructions.rbegin(), block.instructions.rend(), IsLogicalEnd);
|
||||
bld.reset(&block.instructions, std::prev(it.base()));
|
||||
emit_copies_block(bld, ltg, RegType::vgpr);
|
||||
|
@ -494,8 +493,7 @@ void emit_parallelcopies(cssa_ctx& ctx)
|
|||
/* finally, rename coalesced phi operands */
|
||||
for (Block& block : ctx.program->blocks) {
|
||||
for (aco_ptr<Instruction>& phi : block.instructions) {
|
||||
if (phi->opcode != aco_opcode::p_phi &&
|
||||
phi->opcode != aco_opcode::p_linear_phi)
|
||||
if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
|
||||
break;
|
||||
|
||||
for (Operand& op : phi->operands) {
|
||||
|
@ -514,8 +512,8 @@ void emit_parallelcopies(cssa_ctx& ctx)
|
|||
|
||||
} /* end namespace */
|
||||
|
||||
|
||||
void lower_to_cssa(Program* program, live& live_vars)
|
||||
void
|
||||
lower_to_cssa(Program* program, live& live_vars)
|
||||
{
|
||||
reindex_ssa(program, live_vars.live_out);
|
||||
cssa_ctx ctx = {program, live_vars.live_out};
|
||||
|
@ -525,5 +523,4 @@ void lower_to_cssa(Program* program, live& live_vars)
|
|||
/* update live variable information */
|
||||
live_vars = live_var_analysis(program);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace aco
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -36,8 +36,9 @@
|
|||
namespace aco {
|
||||
namespace {
|
||||
|
||||
inline
|
||||
uint32_t murmur_32_scramble(uint32_t h, uint32_t k) {
|
||||
inline uint32_t
|
||||
murmur_32_scramble(uint32_t h, uint32_t k)
|
||||
{
|
||||
k *= 0xcc9e2d51;
|
||||
k = (k << 15) | (k >> 17);
|
||||
h ^= k * 0x1b873593;
|
||||
|
@ -46,8 +47,9 @@ uint32_t murmur_32_scramble(uint32_t h, uint32_t k) {
|
|||
return h;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
uint32_t hash_murmur_32(Instruction* instr)
|
||||
template <typename T>
|
||||
uint32_t
|
||||
hash_murmur_32(Instruction* instr)
|
||||
{
|
||||
uint32_t hash = uint32_t(instr->format) << 16 | uint32_t(instr->opcode);
|
||||
|
||||
|
@ -58,7 +60,7 @@ uint32_t hash_murmur_32(Instruction* instr)
|
|||
for (unsigned i = 2; i < (sizeof(T) >> 2); i++) {
|
||||
uint32_t u;
|
||||
/* Accesses it though a byte array, so doesn't violate the strict aliasing rule */
|
||||
memcpy(&u, reinterpret_cast<uint8_t *>(instr) + i * 4, 4);
|
||||
memcpy(&u, reinterpret_cast<uint8_t*>(instr) + i * 4, 4);
|
||||
hash = murmur_32_scramble(hash, u);
|
||||
}
|
||||
|
||||
|
@ -92,32 +94,19 @@ struct InstrHash {
|
|||
return hash_murmur_32<SDWA_instruction>(instr);
|
||||
|
||||
switch (instr->format) {
|
||||
case Format::SMEM:
|
||||
return hash_murmur_32<SMEM_instruction>(instr);
|
||||
case Format::VINTRP:
|
||||
return hash_murmur_32<Interp_instruction>(instr);
|
||||
case Format::DS:
|
||||
return hash_murmur_32<DS_instruction>(instr);
|
||||
case Format::SOPP:
|
||||
return hash_murmur_32<SOPP_instruction>(instr);
|
||||
case Format::SOPK:
|
||||
return hash_murmur_32<SOPK_instruction>(instr);
|
||||
case Format::EXP:
|
||||
return hash_murmur_32<Export_instruction>(instr);
|
||||
case Format::MUBUF:
|
||||
return hash_murmur_32<MUBUF_instruction>(instr);
|
||||
case Format::MIMG:
|
||||
return hash_murmur_32<MIMG_instruction>(instr);
|
||||
case Format::MTBUF:
|
||||
return hash_murmur_32<MTBUF_instruction>(instr);
|
||||
case Format::FLAT:
|
||||
return hash_murmur_32<FLAT_instruction>(instr);
|
||||
case Format::PSEUDO_BRANCH:
|
||||
return hash_murmur_32<Pseudo_branch_instruction>(instr);
|
||||
case Format::PSEUDO_REDUCTION:
|
||||
return hash_murmur_32<Pseudo_reduction_instruction>(instr);
|
||||
default:
|
||||
return hash_murmur_32<Instruction>(instr);
|
||||
case Format::SMEM: return hash_murmur_32<SMEM_instruction>(instr);
|
||||
case Format::VINTRP: return hash_murmur_32<Interp_instruction>(instr);
|
||||
case Format::DS: return hash_murmur_32<DS_instruction>(instr);
|
||||
case Format::SOPP: return hash_murmur_32<SOPP_instruction>(instr);
|
||||
case Format::SOPK: return hash_murmur_32<SOPK_instruction>(instr);
|
||||
case Format::EXP: return hash_murmur_32<Export_instruction>(instr);
|
||||
case Format::MUBUF: return hash_murmur_32<MUBUF_instruction>(instr);
|
||||
case Format::MIMG: return hash_murmur_32<MIMG_instruction>(instr);
|
||||
case Format::MTBUF: return hash_murmur_32<MTBUF_instruction>(instr);
|
||||
case Format::FLAT: return hash_murmur_32<FLAT_instruction>(instr);
|
||||
case Format::PSEUDO_BRANCH: return hash_murmur_32<Pseudo_branch_instruction>(instr);
|
||||
case Format::PSEUDO_REDUCTION: return hash_murmur_32<Pseudo_reduction_instruction>(instr);
|
||||
default: return hash_murmur_32<Instruction>(instr);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -129,7 +118,8 @@ struct InstrPred {
|
|||
return false;
|
||||
if (a->opcode != b->opcode)
|
||||
return false;
|
||||
if (a->operands.size() != b->operands.size() || a->definitions.size() != b->definitions.size())
|
||||
if (a->operands.size() != b->operands.size() ||
|
||||
a->definitions.size() != b->definitions.size())
|
||||
return false; /* possible with pseudo-instructions */
|
||||
for (unsigned i = 0; i < a->operands.size(); i++) {
|
||||
if (a->operands[i].isConstant()) {
|
||||
|
@ -137,14 +127,12 @@ struct InstrPred {
|
|||
return false;
|
||||
if (a->operands[i].constantValue() != b->operands[i].constantValue())
|
||||
return false;
|
||||
}
|
||||
else if (a->operands[i].isTemp()) {
|
||||
} else if (a->operands[i].isTemp()) {
|
||||
if (!b->operands[i].isTemp())
|
||||
return false;
|
||||
if (a->operands[i].tempId() != b->operands[i].tempId())
|
||||
return false;
|
||||
}
|
||||
else if (a->operands[i].isUndefined() ^ b->operands[i].isUndefined())
|
||||
} else if (a->operands[i].isUndefined() ^ b->operands[i].isUndefined())
|
||||
return false;
|
||||
if (a->operands[i].isFixed()) {
|
||||
if (!b->operands[i].isFixed())
|
||||
|
@ -179,154 +167,110 @@ struct InstrPred {
|
|||
VOP3_instruction& a3 = a->vop3();
|
||||
VOP3_instruction& b3 = b->vop3();
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
if (a3.abs[i] != b3.abs[i] ||
|
||||
a3.neg[i] != b3.neg[i])
|
||||
if (a3.abs[i] != b3.abs[i] || a3.neg[i] != b3.neg[i])
|
||||
return false;
|
||||
}
|
||||
return a3.clamp == b3.clamp &&
|
||||
a3.omod == b3.omod &&
|
||||
a3.opsel == b3.opsel;
|
||||
return a3.clamp == b3.clamp && a3.omod == b3.omod && a3.opsel == b3.opsel;
|
||||
}
|
||||
if (a->isDPP()) {
|
||||
DPP_instruction& aDPP = a->dpp();
|
||||
DPP_instruction& bDPP = b->dpp();
|
||||
return aDPP.pass_flags == bDPP.pass_flags &&
|
||||
aDPP.dpp_ctrl == bDPP.dpp_ctrl &&
|
||||
aDPP.bank_mask == bDPP.bank_mask &&
|
||||
aDPP.row_mask == bDPP.row_mask &&
|
||||
aDPP.bound_ctrl == bDPP.bound_ctrl &&
|
||||
aDPP.abs[0] == bDPP.abs[0] &&
|
||||
aDPP.abs[1] == bDPP.abs[1] &&
|
||||
aDPP.neg[0] == bDPP.neg[0] &&
|
||||
return aDPP.pass_flags == bDPP.pass_flags && aDPP.dpp_ctrl == bDPP.dpp_ctrl &&
|
||||
aDPP.bank_mask == bDPP.bank_mask && aDPP.row_mask == bDPP.row_mask &&
|
||||
aDPP.bound_ctrl == bDPP.bound_ctrl && aDPP.abs[0] == bDPP.abs[0] &&
|
||||
aDPP.abs[1] == bDPP.abs[1] && aDPP.neg[0] == bDPP.neg[0] &&
|
||||
aDPP.neg[1] == bDPP.neg[1];
|
||||
}
|
||||
if (a->isSDWA()) {
|
||||
SDWA_instruction& aSDWA = a->sdwa();
|
||||
SDWA_instruction& bSDWA = b->sdwa();
|
||||
return aSDWA.sel[0] == bSDWA.sel[0] &&
|
||||
aSDWA.sel[1] == bSDWA.sel[1] &&
|
||||
aSDWA.dst_sel == bSDWA.dst_sel &&
|
||||
aSDWA.abs[0] == bSDWA.abs[0] &&
|
||||
aSDWA.abs[1] == bSDWA.abs[1] &&
|
||||
aSDWA.neg[0] == bSDWA.neg[0] &&
|
||||
aSDWA.neg[1] == bSDWA.neg[1] &&
|
||||
aSDWA.dst_preserve == bSDWA.dst_preserve &&
|
||||
aSDWA.clamp == bSDWA.clamp &&
|
||||
aSDWA.omod == bSDWA.omod;
|
||||
return aSDWA.sel[0] == bSDWA.sel[0] && aSDWA.sel[1] == bSDWA.sel[1] &&
|
||||
aSDWA.dst_sel == bSDWA.dst_sel && aSDWA.abs[0] == bSDWA.abs[0] &&
|
||||
aSDWA.abs[1] == bSDWA.abs[1] && aSDWA.neg[0] == bSDWA.neg[0] &&
|
||||
aSDWA.neg[1] == bSDWA.neg[1] && aSDWA.dst_preserve == bSDWA.dst_preserve &&
|
||||
aSDWA.clamp == bSDWA.clamp && aSDWA.omod == bSDWA.omod;
|
||||
}
|
||||
|
||||
switch (a->format) {
|
||||
case Format::SOPK: {
|
||||
if (a->opcode == aco_opcode::s_getreg_b32)
|
||||
case Format::SOPK: {
|
||||
if (a->opcode == aco_opcode::s_getreg_b32)
|
||||
return false;
|
||||
SOPK_instruction& aK = a->sopk();
|
||||
SOPK_instruction& bK = b->sopk();
|
||||
return aK.imm == bK.imm;
|
||||
}
|
||||
case Format::SMEM: {
|
||||
SMEM_instruction& aS = a->smem();
|
||||
SMEM_instruction& bS = b->smem();
|
||||
/* isel shouldn't be creating situations where this assertion fails */
|
||||
assert(aS.prevent_overflow == bS.prevent_overflow);
|
||||
return aS.sync == bS.sync && aS.glc == bS.glc && aS.dlc == bS.dlc && aS.nv == bS.nv &&
|
||||
aS.disable_wqm == bS.disable_wqm && aS.prevent_overflow == bS.prevent_overflow;
|
||||
}
|
||||
case Format::VINTRP: {
|
||||
Interp_instruction& aI = a->vintrp();
|
||||
Interp_instruction& bI = b->vintrp();
|
||||
if (aI.attribute != bI.attribute)
|
||||
return false;
|
||||
if (aI.component != bI.component)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
case Format::VOP3P: {
|
||||
VOP3P_instruction& a3P = a->vop3p();
|
||||
VOP3P_instruction& b3P = b->vop3p();
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
if (a3P.neg_lo[i] != b3P.neg_lo[i] || a3P.neg_hi[i] != b3P.neg_hi[i])
|
||||
return false;
|
||||
SOPK_instruction& aK = a->sopk();
|
||||
SOPK_instruction& bK = b->sopk();
|
||||
return aK.imm == bK.imm;
|
||||
}
|
||||
case Format::SMEM: {
|
||||
SMEM_instruction& aS = a->smem();
|
||||
SMEM_instruction& bS = b->smem();
|
||||
/* isel shouldn't be creating situations where this assertion fails */
|
||||
assert(aS.prevent_overflow == bS.prevent_overflow);
|
||||
return aS.sync == bS.sync && aS.glc == bS.glc && aS.dlc == bS.dlc &&
|
||||
aS.nv == bS.nv && aS.disable_wqm == bS.disable_wqm &&
|
||||
aS.prevent_overflow == bS.prevent_overflow;
|
||||
}
|
||||
case Format::VINTRP: {
|
||||
Interp_instruction& aI = a->vintrp();
|
||||
Interp_instruction& bI = b->vintrp();
|
||||
if (aI.attribute != bI.attribute)
|
||||
return false;
|
||||
if (aI.component != bI.component)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
case Format::VOP3P: {
|
||||
VOP3P_instruction& a3P = a->vop3p();
|
||||
VOP3P_instruction& b3P = b->vop3p();
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
if (a3P.neg_lo[i] != b3P.neg_lo[i] ||
|
||||
a3P.neg_hi[i] != b3P.neg_hi[i])
|
||||
return false;
|
||||
}
|
||||
return a3P.opsel_lo == b3P.opsel_lo &&
|
||||
a3P.opsel_hi == b3P.opsel_hi &&
|
||||
a3P.clamp == b3P.clamp;
|
||||
}
|
||||
case Format::PSEUDO_REDUCTION: {
|
||||
Pseudo_reduction_instruction& aR = a->reduction();
|
||||
Pseudo_reduction_instruction& bR = b->reduction();
|
||||
return aR.pass_flags == bR.pass_flags &&
|
||||
aR.reduce_op == bR.reduce_op &&
|
||||
aR.cluster_size == bR.cluster_size;
|
||||
}
|
||||
case Format::DS: {
|
||||
assert(a->opcode == aco_opcode::ds_bpermute_b32 ||
|
||||
a->opcode == aco_opcode::ds_permute_b32 ||
|
||||
a->opcode == aco_opcode::ds_swizzle_b32);
|
||||
DS_instruction& aD = a->ds();
|
||||
DS_instruction& bD = b->ds();
|
||||
return aD.sync == bD.sync &&
|
||||
aD.pass_flags == bD.pass_flags &&
|
||||
aD.gds == bD.gds &&
|
||||
aD.offset0 == bD.offset0 &&
|
||||
aD.offset1 == bD.offset1;
|
||||
}
|
||||
case Format::MTBUF: {
|
||||
MTBUF_instruction& aM = a->mtbuf();
|
||||
MTBUF_instruction& bM = b->mtbuf();
|
||||
return aM.sync == bM.sync &&
|
||||
aM.dfmt == bM.dfmt &&
|
||||
aM.nfmt == bM.nfmt &&
|
||||
aM.offset == bM.offset &&
|
||||
aM.offen == bM.offen &&
|
||||
aM.idxen == bM.idxen &&
|
||||
aM.glc == bM.glc &&
|
||||
aM.dlc == bM.dlc &&
|
||||
aM.slc == bM.slc &&
|
||||
aM.tfe == bM.tfe &&
|
||||
aM.disable_wqm == bM.disable_wqm;
|
||||
}
|
||||
case Format::MUBUF: {
|
||||
MUBUF_instruction& aM = a->mubuf();
|
||||
MUBUF_instruction& bM = b->mubuf();
|
||||
return aM.sync == bM.sync &&
|
||||
aM.offset == bM.offset &&
|
||||
aM.offen == bM.offen &&
|
||||
aM.idxen == bM.idxen &&
|
||||
aM.glc == bM.glc &&
|
||||
aM.dlc == bM.dlc &&
|
||||
aM.slc == bM.slc &&
|
||||
aM.tfe == bM.tfe &&
|
||||
aM.lds == bM.lds &&
|
||||
aM.disable_wqm == bM.disable_wqm;
|
||||
}
|
||||
case Format::MIMG: {
|
||||
MIMG_instruction& aM = a->mimg();
|
||||
MIMG_instruction& bM = b->mimg();
|
||||
return aM.sync == bM.sync &&
|
||||
aM.dmask == bM.dmask &&
|
||||
aM.unrm == bM.unrm &&
|
||||
aM.glc == bM.glc &&
|
||||
aM.slc == bM.slc &&
|
||||
aM.tfe == bM.tfe &&
|
||||
aM.da == bM.da &&
|
||||
aM.lwe == bM.lwe &&
|
||||
aM.r128 == bM.r128 &&
|
||||
aM.a16 == bM.a16 &&
|
||||
aM.d16 == bM.d16 &&
|
||||
aM.disable_wqm == bM.disable_wqm;
|
||||
}
|
||||
case Format::FLAT:
|
||||
case Format::GLOBAL:
|
||||
case Format::SCRATCH:
|
||||
case Format::EXP:
|
||||
case Format::SOPP:
|
||||
case Format::PSEUDO_BRANCH:
|
||||
case Format::PSEUDO_BARRIER:
|
||||
assert(false);
|
||||
default:
|
||||
return true;
|
||||
return a3P.opsel_lo == b3P.opsel_lo && a3P.opsel_hi == b3P.opsel_hi &&
|
||||
a3P.clamp == b3P.clamp;
|
||||
}
|
||||
case Format::PSEUDO_REDUCTION: {
|
||||
Pseudo_reduction_instruction& aR = a->reduction();
|
||||
Pseudo_reduction_instruction& bR = b->reduction();
|
||||
return aR.pass_flags == bR.pass_flags && aR.reduce_op == bR.reduce_op &&
|
||||
aR.cluster_size == bR.cluster_size;
|
||||
}
|
||||
case Format::DS: {
|
||||
assert(a->opcode == aco_opcode::ds_bpermute_b32 ||
|
||||
a->opcode == aco_opcode::ds_permute_b32 || a->opcode == aco_opcode::ds_swizzle_b32);
|
||||
DS_instruction& aD = a->ds();
|
||||
DS_instruction& bD = b->ds();
|
||||
return aD.sync == bD.sync && aD.pass_flags == bD.pass_flags && aD.gds == bD.gds &&
|
||||
aD.offset0 == bD.offset0 && aD.offset1 == bD.offset1;
|
||||
}
|
||||
case Format::MTBUF: {
|
||||
MTBUF_instruction& aM = a->mtbuf();
|
||||
MTBUF_instruction& bM = b->mtbuf();
|
||||
return aM.sync == bM.sync && aM.dfmt == bM.dfmt && aM.nfmt == bM.nfmt &&
|
||||
aM.offset == bM.offset && aM.offen == bM.offen && aM.idxen == bM.idxen &&
|
||||
aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc && aM.tfe == bM.tfe &&
|
||||
aM.disable_wqm == bM.disable_wqm;
|
||||
}
|
||||
case Format::MUBUF: {
|
||||
MUBUF_instruction& aM = a->mubuf();
|
||||
MUBUF_instruction& bM = b->mubuf();
|
||||
return aM.sync == bM.sync && aM.offset == bM.offset && aM.offen == bM.offen &&
|
||||
aM.idxen == bM.idxen && aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc &&
|
||||
aM.tfe == bM.tfe && aM.lds == bM.lds && aM.disable_wqm == bM.disable_wqm;
|
||||
}
|
||||
case Format::MIMG: {
|
||||
MIMG_instruction& aM = a->mimg();
|
||||
MIMG_instruction& bM = b->mimg();
|
||||
return aM.sync == bM.sync && aM.dmask == bM.dmask && aM.unrm == bM.unrm &&
|
||||
aM.glc == bM.glc && aM.slc == bM.slc && aM.tfe == bM.tfe && aM.da == bM.da &&
|
||||
aM.lwe == bM.lwe && aM.r128 == bM.r128 && aM.a16 == bM.a16 && aM.d16 == bM.d16 &&
|
||||
aM.disable_wqm == bM.disable_wqm;
|
||||
}
|
||||
case Format::FLAT:
|
||||
case Format::GLOBAL:
|
||||
case Format::SCRATCH:
|
||||
case Format::EXP:
|
||||
case Format::SOPP:
|
||||
case Format::PSEUDO_BRANCH:
|
||||
case Format::PSEUDO_BARRIER: assert(false);
|
||||
default: return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -345,7 +289,8 @@ struct vn_ctx {
|
|||
*/
|
||||
uint32_t exec_id = 1;
|
||||
|
||||
vn_ctx(Program* program_) : program(program_) {
|
||||
vn_ctx(Program* program_) : program(program_)
|
||||
{
|
||||
static_assert(sizeof(Temp) == 4, "Temp must fit in 32bits");
|
||||
unsigned size = 0;
|
||||
for (Block& block : program->blocks)
|
||||
|
@ -354,11 +299,11 @@ struct vn_ctx {
|
|||
}
|
||||
};
|
||||
|
||||
|
||||
/* dominates() returns true if the parent block dominates the child block and
|
||||
* if the parent block is part of the same loop or has a smaller loop nest depth.
|
||||
*/
|
||||
bool dominates(vn_ctx& ctx, uint32_t parent, uint32_t child)
|
||||
bool
|
||||
dominates(vn_ctx& ctx, uint32_t parent, uint32_t child)
|
||||
{
|
||||
unsigned parent_loop_nest_depth = ctx.program->blocks[parent].loop_nest_depth;
|
||||
while (parent < child && parent_loop_nest_depth <= ctx.program->blocks[child].loop_nest_depth)
|
||||
|
@ -375,42 +320,40 @@ bool dominates(vn_ctx& ctx, uint32_t parent, uint32_t child)
|
|||
* Note that expr_set must not be used with instructions
|
||||
* which cannot be eliminated.
|
||||
*/
|
||||
bool can_eliminate(aco_ptr<Instruction>& instr)
|
||||
bool
|
||||
can_eliminate(aco_ptr<Instruction>& instr)
|
||||
{
|
||||
switch (instr->format) {
|
||||
case Format::FLAT:
|
||||
case Format::GLOBAL:
|
||||
case Format::SCRATCH:
|
||||
case Format::EXP:
|
||||
case Format::SOPP:
|
||||
case Format::PSEUDO_BRANCH:
|
||||
case Format::PSEUDO_BARRIER:
|
||||
case Format::FLAT:
|
||||
case Format::GLOBAL:
|
||||
case Format::SCRATCH:
|
||||
case Format::EXP:
|
||||
case Format::SOPP:
|
||||
case Format::PSEUDO_BRANCH:
|
||||
case Format::PSEUDO_BARRIER: return false;
|
||||
case Format::DS:
|
||||
return instr->opcode == aco_opcode::ds_bpermute_b32 ||
|
||||
instr->opcode == aco_opcode::ds_permute_b32 ||
|
||||
instr->opcode == aco_opcode::ds_swizzle_b32;
|
||||
case Format::SMEM:
|
||||
case Format::MUBUF:
|
||||
case Format::MIMG:
|
||||
case Format::MTBUF:
|
||||
if (!get_sync_info(instr.get()).can_reorder())
|
||||
return false;
|
||||
case Format::DS:
|
||||
return instr->opcode == aco_opcode::ds_bpermute_b32 ||
|
||||
instr->opcode == aco_opcode::ds_permute_b32 ||
|
||||
instr->opcode == aco_opcode::ds_swizzle_b32;
|
||||
case Format::SMEM:
|
||||
case Format::MUBUF:
|
||||
case Format::MIMG:
|
||||
case Format::MTBUF:
|
||||
if (!get_sync_info(instr.get()).can_reorder())
|
||||
return false;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
break;
|
||||
default: break;
|
||||
}
|
||||
|
||||
if (instr->definitions.empty() ||
|
||||
instr->opcode == aco_opcode::p_phi ||
|
||||
instr->opcode == aco_opcode::p_linear_phi ||
|
||||
instr->definitions[0].isNoCSE())
|
||||
if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi ||
|
||||
instr->opcode == aco_opcode::p_linear_phi || instr->definitions[0].isNoCSE())
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void process_block(vn_ctx& ctx, Block& block)
|
||||
void
|
||||
process_block(vn_ctx& ctx, Block& block)
|
||||
{
|
||||
std::vector<aco_ptr<Instruction>> new_instructions;
|
||||
new_instructions.reserve(block.instructions.size());
|
||||
|
@ -435,8 +378,9 @@ void process_block(vn_ctx& ctx, Block& block)
|
|||
}
|
||||
|
||||
/* simple copy-propagation through renaming */
|
||||
bool copy_instr = instr->opcode == aco_opcode::p_parallelcopy ||
|
||||
(instr->opcode == aco_opcode::p_create_vector && instr->operands.size() == 1);
|
||||
bool copy_instr =
|
||||
instr->opcode == aco_opcode::p_parallelcopy ||
|
||||
(instr->opcode == aco_opcode::p_create_vector && instr->operands.size() == 1);
|
||||
if (copy_instr && !instr->definitions[0].isFixed() && instr->operands[0].isTemp() &&
|
||||
instr->operands[0].regClass() == instr->definitions[0].regClass()) {
|
||||
ctx.renames[instr->definitions[0].tempId()] = instr->operands[0].getTemp();
|
||||
|
@ -479,7 +423,8 @@ void process_block(vn_ctx& ctx, Block& block)
|
|||
block.instructions = std::move(new_instructions);
|
||||
}
|
||||
|
||||
void rename_phi_operands(Block& block, std::map<uint32_t, Temp>& renames)
|
||||
void
|
||||
rename_phi_operands(Block& block, std::map<uint32_t, Temp>& renames)
|
||||
{
|
||||
for (aco_ptr<Instruction>& phi : block.instructions) {
|
||||
if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi)
|
||||
|
@ -496,8 +441,8 @@ void rename_phi_operands(Block& block, std::map<uint32_t, Temp>& renames)
|
|||
}
|
||||
} /* end namespace */
|
||||
|
||||
|
||||
void value_numbering(Program* program)
|
||||
void
|
||||
value_numbering(Program* program)
|
||||
{
|
||||
vn_ctx ctx(program);
|
||||
std::vector<unsigned> loop_headers;
|
||||
|
@ -521,10 +466,8 @@ void value_numbering(Program* program)
|
|||
rename_phi_operands(block, ctx.renames);
|
||||
|
||||
/* increment exec_id when entering nested control flow */
|
||||
if (block.kind & block_kind_branch ||
|
||||
block.kind & block_kind_loop_preheader ||
|
||||
block.kind & block_kind_break ||
|
||||
block.kind & block_kind_continue ||
|
||||
if (block.kind & block_kind_branch || block.kind & block_kind_loop_preheader ||
|
||||
block.kind & block_kind_break || block.kind & block_kind_continue ||
|
||||
block.kind & block_kind_discard)
|
||||
ctx.exec_id++;
|
||||
else if (block.kind & block_kind_continue_or_break)
|
||||
|
@ -538,4 +481,4 @@ void value_numbering(Program* program)
|
|||
}
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace aco
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -24,9 +24,9 @@
|
|||
|
||||
#include "aco_ir.h"
|
||||
|
||||
#include <bitset>
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <bitset>
|
||||
#include <vector>
|
||||
|
||||
namespace aco {
|
||||
|
@ -41,15 +41,14 @@ enum {
|
|||
written_by_multiple_instrs = -4,
|
||||
};
|
||||
|
||||
struct pr_opt_ctx
|
||||
{
|
||||
Program *program;
|
||||
Block *current_block;
|
||||
struct pr_opt_ctx {
|
||||
Program* program;
|
||||
Block* current_block;
|
||||
int current_instr_idx;
|
||||
std::vector<uint16_t> uses;
|
||||
std::array<int, max_reg_cnt * 4u> instr_idx_by_regs;
|
||||
|
||||
void reset_block(Block *block)
|
||||
void reset_block(Block* block)
|
||||
{
|
||||
current_block = block;
|
||||
current_instr_idx = -1;
|
||||
|
@ -57,9 +56,10 @@ struct pr_opt_ctx
|
|||
}
|
||||
};
|
||||
|
||||
void save_reg_writes(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
||||
void
|
||||
save_reg_writes(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
{
|
||||
for (const Definition &def : instr->definitions) {
|
||||
for (const Definition& def : instr->definitions) {
|
||||
assert(def.regClass().type() != RegType::sgpr || def.physReg().reg() <= 255);
|
||||
assert(def.regClass().type() != RegType::vgpr || def.physReg().reg() >= 256);
|
||||
|
||||
|
@ -75,20 +75,21 @@ void save_reg_writes(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
|||
}
|
||||
}
|
||||
|
||||
int last_writer_idx(pr_opt_ctx &ctx, PhysReg physReg, RegClass rc)
|
||||
int
|
||||
last_writer_idx(pr_opt_ctx& ctx, PhysReg physReg, RegClass rc)
|
||||
{
|
||||
/* Verify that all of the operand's registers are written by the same instruction. */
|
||||
int instr_idx = ctx.instr_idx_by_regs[physReg.reg()];
|
||||
unsigned dw_size = DIV_ROUND_UP(rc.bytes(), 4u);
|
||||
unsigned r = physReg.reg();
|
||||
bool all_same = std::all_of(
|
||||
&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size],
|
||||
[instr_idx](int i) { return i == instr_idx; });
|
||||
bool all_same = std::all_of(&ctx.instr_idx_by_regs[r], &ctx.instr_idx_by_regs[r + dw_size],
|
||||
[instr_idx](int i) { return i == instr_idx; });
|
||||
|
||||
return all_same ? instr_idx : written_by_multiple_instrs;
|
||||
}
|
||||
|
||||
int last_writer_idx(pr_opt_ctx &ctx, const Operand &op)
|
||||
int
|
||||
last_writer_idx(pr_opt_ctx& ctx, const Operand& op)
|
||||
{
|
||||
if (op.isConstant() || op.isUndefined())
|
||||
return const_or_undef;
|
||||
|
@ -104,7 +105,8 @@ int last_writer_idx(pr_opt_ctx &ctx, const Operand &op)
|
|||
return instr_idx;
|
||||
}
|
||||
|
||||
void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
||||
void
|
||||
try_apply_branch_vcc(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
{
|
||||
/* We are looking for the following pattern:
|
||||
*
|
||||
|
@ -123,8 +125,7 @@ void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
|||
if (ctx.program->chip_class < GFX8)
|
||||
return;
|
||||
|
||||
if (instr->format != Format::PSEUDO_BRANCH ||
|
||||
instr->operands.size() == 0 ||
|
||||
if (instr->format != Format::PSEUDO_BRANCH || instr->operands.size() == 0 ||
|
||||
instr->operands[0].physReg() != scc)
|
||||
return;
|
||||
|
||||
|
@ -141,13 +142,12 @@ void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
|||
last_exec_wr_idx > last_vcc_wr_idx || last_exec_wr_idx < not_written_in_block)
|
||||
return;
|
||||
|
||||
aco_ptr<Instruction> &op0_instr = ctx.current_block->instructions[op0_instr_idx];
|
||||
aco_ptr<Instruction> &last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx];
|
||||
aco_ptr<Instruction>& op0_instr = ctx.current_block->instructions[op0_instr_idx];
|
||||
aco_ptr<Instruction>& last_vcc_wr = ctx.current_block->instructions[last_vcc_wr_idx];
|
||||
|
||||
if ((op0_instr->opcode != aco_opcode::s_and_b64 /* wave64 */ &&
|
||||
op0_instr->opcode != aco_opcode::s_and_b32 /* wave32 */) ||
|
||||
op0_instr->operands[0].physReg() != vcc ||
|
||||
op0_instr->operands[1].physReg() != exec ||
|
||||
op0_instr->operands[0].physReg() != vcc || op0_instr->operands[1].physReg() != exec ||
|
||||
!last_vcc_wr->isVOPC())
|
||||
return;
|
||||
|
||||
|
@ -159,7 +159,8 @@ void try_apply_branch_vcc(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
|||
instr->operands[0] = op0_instr->operands[0];
|
||||
}
|
||||
|
||||
void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
||||
void
|
||||
try_optimize_scc_nocompare(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
{
|
||||
/* We are looking for the following pattern:
|
||||
*
|
||||
|
@ -180,8 +181,7 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
|||
if (instr->isSOPC() &&
|
||||
(instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 ||
|
||||
instr->opcode == aco_opcode::s_cmp_lg_u32 || instr->opcode == aco_opcode::s_cmp_lg_i32 ||
|
||||
instr->opcode == aco_opcode::s_cmp_eq_u64 ||
|
||||
instr->opcode == aco_opcode::s_cmp_lg_u64) &&
|
||||
instr->opcode == aco_opcode::s_cmp_eq_u64 || instr->opcode == aco_opcode::s_cmp_lg_u64) &&
|
||||
(instr->operands[0].constantEquals(0) || instr->operands[1].constantEquals(0)) &&
|
||||
(instr->operands[0].isTemp() || instr->operands[1].isTemp())) {
|
||||
/* Make sure the constant is always in operand 1 */
|
||||
|
@ -197,8 +197,9 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
|||
if (wr_idx < 0 || wr_idx != sccwr_idx)
|
||||
return;
|
||||
|
||||
aco_ptr<Instruction> &wr_instr = ctx.current_block->instructions[wr_idx];
|
||||
if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 || wr_instr->definitions[1].physReg() != scc)
|
||||
aco_ptr<Instruction>& wr_instr = ctx.current_block->instructions[wr_idx];
|
||||
if (!wr_instr->isSALU() || wr_instr->definitions.size() < 2 ||
|
||||
wr_instr->definitions[1].physReg() != scc)
|
||||
return;
|
||||
|
||||
/* Look for instructions which set SCC := (D != 0) */
|
||||
|
@ -232,10 +233,8 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
|||
case aco_opcode::s_ashr_i32:
|
||||
case aco_opcode::s_ashr_i64:
|
||||
case aco_opcode::s_abs_i32:
|
||||
case aco_opcode::s_absdiff_i32:
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
case aco_opcode::s_absdiff_i32: break;
|
||||
default: return;
|
||||
}
|
||||
|
||||
/* Use the SCC def from wr_instr */
|
||||
|
@ -245,13 +244,12 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
|||
|
||||
/* Set the opcode and operand to 32-bit */
|
||||
instr->operands[1] = Operand(0u);
|
||||
instr->opcode = (instr->opcode == aco_opcode::s_cmp_eq_u32 ||
|
||||
instr->opcode == aco_opcode::s_cmp_eq_i32 ||
|
||||
instr->opcode == aco_opcode::s_cmp_eq_u64)
|
||||
? aco_opcode::s_cmp_eq_u32
|
||||
: aco_opcode::s_cmp_lg_u32;
|
||||
} else if ((instr->format == Format::PSEUDO_BRANCH &&
|
||||
instr->operands.size() == 1 &&
|
||||
instr->opcode =
|
||||
(instr->opcode == aco_opcode::s_cmp_eq_u32 || instr->opcode == aco_opcode::s_cmp_eq_i32 ||
|
||||
instr->opcode == aco_opcode::s_cmp_eq_u64)
|
||||
? aco_opcode::s_cmp_eq_u32
|
||||
: aco_opcode::s_cmp_lg_u32;
|
||||
} else if ((instr->format == Format::PSEUDO_BRANCH && instr->operands.size() == 1 &&
|
||||
instr->operands[0].physReg() == scc) ||
|
||||
instr->opcode == aco_opcode::s_cselect_b32) {
|
||||
|
||||
|
@ -265,10 +263,11 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
|||
if (wr_idx < 0)
|
||||
return;
|
||||
|
||||
aco_ptr<Instruction> &wr_instr = ctx.current_block->instructions[wr_idx];
|
||||
aco_ptr<Instruction>& wr_instr = ctx.current_block->instructions[wr_idx];
|
||||
|
||||
/* Check if we found the pattern above. */
|
||||
if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 && wr_instr->opcode != aco_opcode::s_cmp_lg_u32)
|
||||
if (wr_instr->opcode != aco_opcode::s_cmp_eq_u32 &&
|
||||
wr_instr->opcode != aco_opcode::s_cmp_lg_u32)
|
||||
return;
|
||||
if (wr_instr->operands[0].physReg() != scc)
|
||||
return;
|
||||
|
@ -282,11 +281,13 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
|||
if (wr_instr->opcode == aco_opcode::s_cmp_eq_u32) {
|
||||
/* Flip the meaning of the instruction to correctly use the SCC. */
|
||||
if (instr->format == Format::PSEUDO_BRANCH)
|
||||
instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz : aco_opcode::p_cbranch_z;
|
||||
instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz
|
||||
: aco_opcode::p_cbranch_z;
|
||||
else if (instr->opcode == aco_opcode::s_cselect_b32)
|
||||
std::swap(instr->operands[0], instr->operands[1]);
|
||||
else
|
||||
unreachable("scc_nocompare optimization is only implemented for p_cbranch and s_cselect");
|
||||
unreachable(
|
||||
"scc_nocompare optimization is only implemented for p_cbranch and s_cselect");
|
||||
}
|
||||
|
||||
/* Use the SCC def from the original instruction, not the comparison */
|
||||
|
@ -295,7 +296,8 @@ void try_optimize_scc_nocompare(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
|||
}
|
||||
}
|
||||
|
||||
void process_instruction(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
||||
void
|
||||
process_instruction(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
{
|
||||
ctx.current_instr_idx++;
|
||||
|
||||
|
@ -307,9 +309,10 @@ void process_instruction(pr_opt_ctx &ctx, aco_ptr<Instruction> &instr)
|
|||
save_reg_writes(ctx, instr);
|
||||
}
|
||||
|
||||
} /* End of empty namespace */
|
||||
} // namespace
|
||||
|
||||
void optimize_postRA(Program* program)
|
||||
void
|
||||
optimize_postRA(Program* program)
|
||||
{
|
||||
pr_opt_ctx ctx;
|
||||
ctx.program = program;
|
||||
|
@ -319,10 +322,10 @@ void optimize_postRA(Program* program)
|
|||
* Goes through each instruction exactly once, and can transform
|
||||
* instructions or adjust the use counts of temps.
|
||||
*/
|
||||
for (auto &block : program->blocks) {
|
||||
for (auto& block : program->blocks) {
|
||||
ctx.reset_block(&block);
|
||||
|
||||
for (aco_ptr<Instruction> &instr : block.instructions)
|
||||
for (aco_ptr<Instruction>& instr : block.instructions)
|
||||
process_instruction(ctx, instr);
|
||||
}
|
||||
|
||||
|
@ -330,13 +333,12 @@ void optimize_postRA(Program* program)
|
|||
* Gets rid of instructions which are manually deleted or
|
||||
* no longer have any uses.
|
||||
*/
|
||||
for (auto &block : program->blocks) {
|
||||
auto new_end = std::remove_if(
|
||||
block.instructions.begin(), block.instructions.end(),
|
||||
[&ctx](const aco_ptr<Instruction> &instr) { return !instr || is_dead(ctx.uses, instr.get()); });
|
||||
for (auto& block : program->blocks) {
|
||||
auto new_end = std::remove_if(block.instructions.begin(), block.instructions.end(),
|
||||
[&ctx](const aco_ptr<Instruction>& instr)
|
||||
{ return !instr || is_dead(ctx.uses, instr.get()); });
|
||||
block.instructions.resize(new_end - block.instructions.begin());
|
||||
}
|
||||
}
|
||||
|
||||
} /* End of aco namespace */
|
||||
|
||||
} // namespace aco
|
||||
|
|
|
@ -39,17 +39,17 @@ namespace {
|
|||
|
||||
/* LLVM disassembler only supports GFX8+, try to disassemble with CLRXdisasm
|
||||
* for GFX6-GFX7 if found on the system, this is better than nothing.
|
||||
*/
|
||||
bool print_asm_gfx6_gfx7(Program *program, std::vector<uint32_t>& binary,
|
||||
FILE *output)
|
||||
*/
|
||||
bool
|
||||
print_asm_gfx6_gfx7(Program* program, std::vector<uint32_t>& binary, FILE* output)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
return true;
|
||||
#else
|
||||
char path[] = "/tmp/fileXXXXXX";
|
||||
char line[2048], command[128];
|
||||
const char *gpu_type;
|
||||
FILE *p;
|
||||
const char* gpu_type;
|
||||
FILE* p;
|
||||
int fd;
|
||||
|
||||
/* Dump the binary into a temporary file. */
|
||||
|
@ -57,8 +57,7 @@ bool print_asm_gfx6_gfx7(Program *program, std::vector<uint32_t>& binary,
|
|||
if (fd < 0)
|
||||
return true;
|
||||
|
||||
for (uint32_t w : binary)
|
||||
{
|
||||
for (uint32_t w : binary) {
|
||||
if (write(fd, &w, sizeof(w)) == -1)
|
||||
goto fail;
|
||||
}
|
||||
|
@ -69,30 +68,16 @@ bool print_asm_gfx6_gfx7(Program *program, std::vector<uint32_t>& binary,
|
|||
switch (program->chip_class) {
|
||||
case GFX6:
|
||||
switch (program->family) {
|
||||
case CHIP_TAHITI:
|
||||
gpu_type = "tahiti";
|
||||
break;
|
||||
case CHIP_PITCAIRN:
|
||||
gpu_type = "pitcairn";
|
||||
break;
|
||||
case CHIP_VERDE:
|
||||
gpu_type = "capeverde";
|
||||
break;
|
||||
case CHIP_OLAND:
|
||||
gpu_type = "oland";
|
||||
break;
|
||||
case CHIP_HAINAN:
|
||||
gpu_type = "hainan";
|
||||
break;
|
||||
default:
|
||||
unreachable("Invalid GFX6 family!");
|
||||
case CHIP_TAHITI: gpu_type = "tahiti"; break;
|
||||
case CHIP_PITCAIRN: gpu_type = "pitcairn"; break;
|
||||
case CHIP_VERDE: gpu_type = "capeverde"; break;
|
||||
case CHIP_OLAND: gpu_type = "oland"; break;
|
||||
case CHIP_HAINAN: gpu_type = "hainan"; break;
|
||||
default: unreachable("Invalid GFX6 family!");
|
||||
}
|
||||
break;
|
||||
case GFX7:
|
||||
gpu_type = "gfx700";
|
||||
break;
|
||||
default:
|
||||
unreachable("Invalid chip class!");
|
||||
case GFX7: gpu_type = "gfx700"; break;
|
||||
default: unreachable("Invalid chip class!");
|
||||
}
|
||||
|
||||
sprintf(command, "clrxdisasm --gpuType=%s -r %s", gpu_type, path);
|
||||
|
@ -121,22 +106,21 @@ fail:
|
|||
#endif
|
||||
}
|
||||
|
||||
std::pair<bool, size_t> disasm_instr(chip_class chip, LLVMDisasmContextRef disasm,
|
||||
uint32_t *binary, unsigned exec_size, size_t pos,
|
||||
char *outline, unsigned outline_size)
|
||||
std::pair<bool, size_t>
|
||||
disasm_instr(chip_class chip, LLVMDisasmContextRef disasm, uint32_t* binary, unsigned exec_size,
|
||||
size_t pos, char* outline, unsigned outline_size)
|
||||
{
|
||||
/* mask out src2 on v_writelane_b32 */
|
||||
if (((chip == GFX8 || chip == GFX9) && (binary[pos] & 0xffff8000) == 0xd28a0000) ||
|
||||
(chip >= GFX10 && (binary[pos] & 0xffff8000) == 0xd7610000)) {
|
||||
binary[pos+1] = binary[pos+1] & 0xF803FFFF;
|
||||
binary[pos + 1] = binary[pos + 1] & 0xF803FFFF;
|
||||
}
|
||||
|
||||
size_t l = LLVMDisasmInstruction(disasm, (uint8_t *) &binary[pos],
|
||||
(exec_size - pos) * sizeof(uint32_t), pos * 4,
|
||||
outline, outline_size);
|
||||
size_t l =
|
||||
LLVMDisasmInstruction(disasm, (uint8_t*)&binary[pos], (exec_size - pos) * sizeof(uint32_t),
|
||||
pos * 4, outline, outline_size);
|
||||
|
||||
if (chip >= GFX10 && l == 8 &&
|
||||
((binary[pos] & 0xffff0000) == 0xd7610000) &&
|
||||
if (chip >= GFX10 && l == 8 && ((binary[pos] & 0xffff0000) == 0xd7610000) &&
|
||||
((binary[pos + 1] & 0x1ff) == 0xff)) {
|
||||
/* v_writelane with literal uses 3 dwords but llvm consumes only 2 */
|
||||
l += 4;
|
||||
|
@ -145,14 +129,14 @@ std::pair<bool, size_t> disasm_instr(chip_class chip, LLVMDisasmContextRef disas
|
|||
bool invalid = false;
|
||||
size_t size;
|
||||
if (!l &&
|
||||
((chip >= GFX9 && (binary[pos] & 0xffff8000) == 0xd1348000) || /* v_add_u32_e64 + clamp */
|
||||
((chip >= GFX9 && (binary[pos] & 0xffff8000) == 0xd1348000) || /* v_add_u32_e64 + clamp */
|
||||
(chip >= GFX10 && (binary[pos] & 0xffff8000) == 0xd7038000) || /* v_add_u16_e64 + clamp */
|
||||
(chip <= GFX9 && (binary[pos] & 0xffff8000) == 0xd1268000) || /* v_add_u16_e64 + clamp */
|
||||
(chip <= GFX9 && (binary[pos] & 0xffff8000) == 0xd1268000) || /* v_add_u16_e64 + clamp */
|
||||
(chip >= GFX10 && (binary[pos] & 0xffff8000) == 0xd76d8000) || /* v_add3_u32 + clamp */
|
||||
(chip == GFX9 && (binary[pos] & 0xffff8000) == 0xd1ff8000)) /* v_add3_u32 + clamp */) {
|
||||
strcpy(outline, "\tinteger addition + clamp");
|
||||
bool has_literal = chip >= GFX10 &&
|
||||
(((binary[pos+1] & 0x1ff) == 0xff) || (((binary[pos+1] >> 9) & 0x1ff) == 0xff));
|
||||
bool has_literal = chip >= GFX10 && (((binary[pos + 1] & 0x1ff) == 0xff) ||
|
||||
(((binary[pos + 1] >> 9) & 0x1ff) == 0xff));
|
||||
size = 2 + has_literal;
|
||||
} else if (chip >= GFX10 && l == 4 && ((binary[pos] & 0xfe0001ff) == 0x020000f9)) {
|
||||
strcpy(outline, "\tv_cndmask_b32 + sdwa");
|
||||
|
@ -170,8 +154,8 @@ std::pair<bool, size_t> disasm_instr(chip_class chip, LLVMDisasmContextRef disas
|
|||
}
|
||||
} /* end namespace */
|
||||
|
||||
bool print_asm(Program *program, std::vector<uint32_t>& binary,
|
||||
unsigned exec_size, FILE *output)
|
||||
bool
|
||||
print_asm(Program* program, std::vector<uint32_t>& binary, unsigned exec_size, FILE* output)
|
||||
{
|
||||
if (program->chip_class <= GFX7) {
|
||||
/* Do not abort if clrxdisasm isn't found. */
|
||||
|
@ -187,7 +171,7 @@ bool print_asm(Program *program, std::vector<uint32_t>& binary,
|
|||
}
|
||||
|
||||
std::vector<llvm::SymbolInfoTy> symbols;
|
||||
std::vector<std::array<char,16>> block_names;
|
||||
std::vector<std::array<char, 16>> block_names;
|
||||
block_names.reserve(program->blocks.size());
|
||||
for (Block& block : program->blocks) {
|
||||
if (!referenced_blocks[block.index])
|
||||
|
@ -195,18 +179,18 @@ bool print_asm(Program *program, std::vector<uint32_t>& binary,
|
|||
std::array<char, 16> name;
|
||||
sprintf(name.data(), "BB%u", block.index);
|
||||
block_names.push_back(name);
|
||||
symbols.emplace_back(block.offset * 4, llvm::StringRef(block_names[block_names.size() - 1].data()), 0);
|
||||
symbols.emplace_back(block.offset * 4,
|
||||
llvm::StringRef(block_names[block_names.size() - 1].data()), 0);
|
||||
}
|
||||
|
||||
const char *features = "";
|
||||
const char* features = "";
|
||||
if (program->chip_class >= GFX10 && program->wave_size == 64) {
|
||||
features = "+wavefrontsize64";
|
||||
}
|
||||
|
||||
LLVMDisasmContextRef disasm = LLVMCreateDisasmCPUFeatures("amdgcn-mesa-mesa3d",
|
||||
ac_get_llvm_processor_name(program->family),
|
||||
features,
|
||||
&symbols, 0, NULL, NULL);
|
||||
LLVMDisasmContextRef disasm =
|
||||
LLVMCreateDisasmCPUFeatures("amdgcn-mesa-mesa3d", ac_get_llvm_processor_name(program->family),
|
||||
features, &symbols, 0, NULL, NULL);
|
||||
|
||||
size_t pos = 0;
|
||||
bool invalid = false;
|
||||
|
@ -216,7 +200,8 @@ bool print_asm(Program *program, std::vector<uint32_t>& binary,
|
|||
unsigned prev_pos = 0;
|
||||
unsigned repeat_count = 0;
|
||||
while (pos < exec_size) {
|
||||
bool new_block = next_block < program->blocks.size() && pos == program->blocks[next_block].offset;
|
||||
bool new_block =
|
||||
next_block < program->blocks.size() && pos == program->blocks[next_block].offset;
|
||||
if (pos + prev_size <= exec_size && prev_pos != pos && !new_block &&
|
||||
memcmp(&binary[prev_pos], &binary[pos], prev_size * 4) == 0) {
|
||||
repeat_count++;
|
||||
|
@ -235,8 +220,8 @@ bool print_asm(Program *program, std::vector<uint32_t>& binary,
|
|||
}
|
||||
|
||||
char outline[1024];
|
||||
std::pair<bool, size_t> res = disasm_instr(
|
||||
program->chip_class, disasm, binary.data(), exec_size, pos, outline, sizeof(outline));
|
||||
std::pair<bool, size_t> res = disasm_instr(program->chip_class, disasm, binary.data(),
|
||||
exec_size, pos, outline, sizeof(outline));
|
||||
invalid |= res.first;
|
||||
|
||||
fprintf(output, "%-60s ;", outline);
|
||||
|
@ -271,4 +256,4 @@ bool print_asm(Program *program, std::vector<uint32_t>& binary,
|
|||
return invalid;
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace aco
|
||||
|
|
|
@ -86,36 +86,38 @@ const std::array<const char*, num_reduce_ops> reduce_ops = []()
|
|||
return ret;
|
||||
}();
|
||||
|
||||
static void print_reg_class(const RegClass rc, FILE *output)
|
||||
static void
|
||||
print_reg_class(const RegClass rc, FILE* output)
|
||||
{
|
||||
switch (rc) {
|
||||
case RegClass::s1: fprintf(output, " s1: "); return;
|
||||
case RegClass::s2: fprintf(output, " s2: "); return;
|
||||
case RegClass::s3: fprintf(output, " s3: "); return;
|
||||
case RegClass::s4: fprintf(output, " s4: "); return;
|
||||
case RegClass::s6: fprintf(output, " s6: "); return;
|
||||
case RegClass::s8: fprintf(output, " s8: "); return;
|
||||
case RegClass::s16: fprintf(output, "s16: "); return;
|
||||
case RegClass::v1: fprintf(output, " v1: "); return;
|
||||
case RegClass::v2: fprintf(output, " v2: "); return;
|
||||
case RegClass::v3: fprintf(output, " v3: "); return;
|
||||
case RegClass::v4: fprintf(output, " v4: "); return;
|
||||
case RegClass::v5: fprintf(output, " v5: "); return;
|
||||
case RegClass::v6: fprintf(output, " v6: "); return;
|
||||
case RegClass::v7: fprintf(output, " v7: "); return;
|
||||
case RegClass::v8: fprintf(output, " v8: "); return;
|
||||
case RegClass::v1b: fprintf(output, " v1b: "); return;
|
||||
case RegClass::v2b: fprintf(output, " v2b: "); return;
|
||||
case RegClass::v3b: fprintf(output, " v3b: "); return;
|
||||
case RegClass::v4b: fprintf(output, " v4b: "); return;
|
||||
case RegClass::v6b: fprintf(output, " v6b: "); return;
|
||||
case RegClass::v8b: fprintf(output, " v8b: "); return;
|
||||
case RegClass::v1_linear: fprintf(output, " v1: "); return;
|
||||
case RegClass::v2_linear: fprintf(output, " v2: "); return;
|
||||
case RegClass::s1: fprintf(output, " s1: "); return;
|
||||
case RegClass::s2: fprintf(output, " s2: "); return;
|
||||
case RegClass::s3: fprintf(output, " s3: "); return;
|
||||
case RegClass::s4: fprintf(output, " s4: "); return;
|
||||
case RegClass::s6: fprintf(output, " s6: "); return;
|
||||
case RegClass::s8: fprintf(output, " s8: "); return;
|
||||
case RegClass::s16: fprintf(output, "s16: "); return;
|
||||
case RegClass::v1: fprintf(output, " v1: "); return;
|
||||
case RegClass::v2: fprintf(output, " v2: "); return;
|
||||
case RegClass::v3: fprintf(output, " v3: "); return;
|
||||
case RegClass::v4: fprintf(output, " v4: "); return;
|
||||
case RegClass::v5: fprintf(output, " v5: "); return;
|
||||
case RegClass::v6: fprintf(output, " v6: "); return;
|
||||
case RegClass::v7: fprintf(output, " v7: "); return;
|
||||
case RegClass::v8: fprintf(output, " v8: "); return;
|
||||
case RegClass::v1b: fprintf(output, " v1b: "); return;
|
||||
case RegClass::v2b: fprintf(output, " v2b: "); return;
|
||||
case RegClass::v3b: fprintf(output, " v3b: "); return;
|
||||
case RegClass::v4b: fprintf(output, " v4b: "); return;
|
||||
case RegClass::v6b: fprintf(output, " v6b: "); return;
|
||||
case RegClass::v8b: fprintf(output, " v8b: "); return;
|
||||
case RegClass::v1_linear: fprintf(output, " v1: "); return;
|
||||
case RegClass::v2_linear: fprintf(output, " v2: "); return;
|
||||
}
|
||||
}
|
||||
|
||||
void print_physReg(PhysReg reg, unsigned bytes, FILE *output, unsigned flags)
|
||||
void
|
||||
print_physReg(PhysReg reg, unsigned bytes, FILE* output, unsigned flags)
|
||||
{
|
||||
if (reg == 124) {
|
||||
fprintf(output, "m0");
|
||||
|
@ -134,16 +136,17 @@ void print_physReg(PhysReg reg, unsigned bytes, FILE *output, unsigned flags)
|
|||
} else {
|
||||
fprintf(output, "%c[%d", is_vgpr ? 'v' : 's', r);
|
||||
if (size > 1)
|
||||
fprintf(output, "-%d]", r + size -1);
|
||||
fprintf(output, "-%d]", r + size - 1);
|
||||
else
|
||||
fprintf(output, "]");
|
||||
}
|
||||
if (reg.byte() || bytes % 4)
|
||||
fprintf(output, "[%d:%d]", reg.byte()*8, (reg.byte()+bytes) * 8);
|
||||
fprintf(output, "[%d:%d]", reg.byte() * 8, (reg.byte() + bytes) * 8);
|
||||
}
|
||||
}
|
||||
|
||||
static void print_constant(uint8_t reg, FILE *output)
|
||||
static void
|
||||
print_constant(uint8_t reg, FILE* output)
|
||||
{
|
||||
if (reg >= 128 && reg <= 192) {
|
||||
fprintf(output, "%d", reg - 128);
|
||||
|
@ -154,37 +157,20 @@ static void print_constant(uint8_t reg, FILE *output)
|
|||
}
|
||||
|
||||
switch (reg) {
|
||||
case 240:
|
||||
fprintf(output, "0.5");
|
||||
break;
|
||||
case 241:
|
||||
fprintf(output, "-0.5");
|
||||
break;
|
||||
case 242:
|
||||
fprintf(output, "1.0");
|
||||
break;
|
||||
case 243:
|
||||
fprintf(output, "-1.0");
|
||||
break;
|
||||
case 244:
|
||||
fprintf(output, "2.0");
|
||||
break;
|
||||
case 245:
|
||||
fprintf(output, "-2.0");
|
||||
break;
|
||||
case 246:
|
||||
fprintf(output, "4.0");
|
||||
break;
|
||||
case 247:
|
||||
fprintf(output, "-4.0");
|
||||
break;
|
||||
case 248:
|
||||
fprintf(output, "1/(2*PI)");
|
||||
break;
|
||||
case 240: fprintf(output, "0.5"); break;
|
||||
case 241: fprintf(output, "-0.5"); break;
|
||||
case 242: fprintf(output, "1.0"); break;
|
||||
case 243: fprintf(output, "-1.0"); break;
|
||||
case 244: fprintf(output, "2.0"); break;
|
||||
case 245: fprintf(output, "-2.0"); break;
|
||||
case 246: fprintf(output, "4.0"); break;
|
||||
case 247: fprintf(output, "-4.0"); break;
|
||||
case 248: fprintf(output, "1/(2*PI)"); break;
|
||||
}
|
||||
}
|
||||
|
||||
void aco_print_operand(const Operand *operand, FILE *output, unsigned flags)
|
||||
void
|
||||
aco_print_operand(const Operand* operand, FILE* output, unsigned flags)
|
||||
{
|
||||
if (operand->isLiteral() || (operand->isConstant() && operand->bytes() == 1)) {
|
||||
if (operand->bytes() == 1)
|
||||
|
@ -216,7 +202,8 @@ void aco_print_operand(const Operand *operand, FILE *output, unsigned flags)
|
|||
}
|
||||
}
|
||||
|
||||
static void print_definition(const Definition *definition, FILE *output, unsigned flags)
|
||||
static void
|
||||
print_definition(const Definition* definition, FILE* output, unsigned flags)
|
||||
{
|
||||
if (!(flags & print_no_ssa))
|
||||
print_reg_class(definition->regClass(), output);
|
||||
|
@ -235,7 +222,8 @@ static void print_definition(const Definition *definition, FILE *output, unsigne
|
|||
print_physReg(definition->physReg(), definition->bytes(), output, flags);
|
||||
}
|
||||
|
||||
static void print_storage(storage_class storage, FILE *output)
|
||||
static void
|
||||
print_storage(storage_class storage, FILE* output)
|
||||
{
|
||||
fprintf(output, " storage:");
|
||||
int printed = 0;
|
||||
|
@ -255,7 +243,8 @@ static void print_storage(storage_class storage, FILE *output)
|
|||
printed += fprintf(output, "%svgpr_spill", printed ? "," : "");
|
||||
}
|
||||
|
||||
static void print_semantics(memory_semantics sem, FILE *output)
|
||||
static void
|
||||
print_semantics(memory_semantics sem, FILE* output)
|
||||
{
|
||||
fprintf(output, " semantics:");
|
||||
int printed = 0;
|
||||
|
@ -275,36 +264,29 @@ static void print_semantics(memory_semantics sem, FILE *output)
|
|||
printed += fprintf(output, "%srmw", printed ? "," : "");
|
||||
}
|
||||
|
||||
static void print_scope(sync_scope scope, FILE *output, const char *prefix="scope")
|
||||
static void
|
||||
print_scope(sync_scope scope, FILE* output, const char* prefix = "scope")
|
||||
{
|
||||
fprintf(output, " %s:", prefix);
|
||||
switch (scope) {
|
||||
case scope_invocation:
|
||||
fprintf(output, "invocation");
|
||||
break;
|
||||
case scope_subgroup:
|
||||
fprintf(output, "subgroup");
|
||||
break;
|
||||
case scope_workgroup:
|
||||
fprintf(output, "workgroup");
|
||||
break;
|
||||
case scope_queuefamily:
|
||||
fprintf(output, "queuefamily");
|
||||
break;
|
||||
case scope_device:
|
||||
fprintf(output, "device");
|
||||
break;
|
||||
case scope_invocation: fprintf(output, "invocation"); break;
|
||||
case scope_subgroup: fprintf(output, "subgroup"); break;
|
||||
case scope_workgroup: fprintf(output, "workgroup"); break;
|
||||
case scope_queuefamily: fprintf(output, "queuefamily"); break;
|
||||
case scope_device: fprintf(output, "device"); break;
|
||||
}
|
||||
}
|
||||
|
||||
static void print_sync(memory_sync_info sync, FILE *output)
|
||||
static void
|
||||
print_sync(memory_sync_info sync, FILE* output)
|
||||
{
|
||||
print_storage(sync.storage, output);
|
||||
print_semantics(sync.semantics, output);
|
||||
print_scope(sync.scope, output);
|
||||
}
|
||||
|
||||
static void print_instr_format_specific(const Instruction *instr, FILE *output)
|
||||
static void
|
||||
print_instr_format_specific(const Instruction* instr, FILE* output)
|
||||
{
|
||||
switch (instr->format) {
|
||||
case Format::SOPK: {
|
||||
|
@ -319,9 +301,12 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
|
|||
/* we usually should check the chip class for vmcnt/lgkm, but
|
||||
* insert_waitcnt() should fill it in regardless. */
|
||||
unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10);
|
||||
if (vmcnt != 63) fprintf(output, " vmcnt(%d)", vmcnt);
|
||||
if (((imm >> 4) & 0x7) < 0x7) fprintf(output, " expcnt(%d)", (imm >> 4) & 0x7);
|
||||
if (((imm >> 8) & 0x3F) < 0x3F) fprintf(output, " lgkmcnt(%d)", (imm >> 8) & 0x3F);
|
||||
if (vmcnt != 63)
|
||||
fprintf(output, " vmcnt(%d)", vmcnt);
|
||||
if (((imm >> 4) & 0x7) < 0x7)
|
||||
fprintf(output, " expcnt(%d)", (imm >> 4) & 0x7);
|
||||
if (((imm >> 8) & 0x3F) < 0x3F)
|
||||
fprintf(output, " lgkmcnt(%d)", (imm >> 8) & 0x3F);
|
||||
break;
|
||||
}
|
||||
case aco_opcode::s_endpgm:
|
||||
|
@ -337,35 +322,21 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
|
|||
case aco_opcode::s_sendmsg: {
|
||||
unsigned id = imm & sendmsg_id_mask;
|
||||
switch (id) {
|
||||
case sendmsg_none:
|
||||
fprintf(output, " sendmsg(MSG_NONE)");
|
||||
break;
|
||||
case sendmsg_none: fprintf(output, " sendmsg(MSG_NONE)"); break;
|
||||
case _sendmsg_gs:
|
||||
fprintf(output, " sendmsg(gs%s%s, %u)",
|
||||
imm & 0x10 ? ", cut" : "", imm & 0x20 ? ", emit" : "", imm >> 8);
|
||||
fprintf(output, " sendmsg(gs%s%s, %u)", imm & 0x10 ? ", cut" : "",
|
||||
imm & 0x20 ? ", emit" : "", imm >> 8);
|
||||
break;
|
||||
case _sendmsg_gs_done:
|
||||
fprintf(output, " sendmsg(gs_done%s%s, %u)",
|
||||
imm & 0x10 ? ", cut" : "", imm & 0x20 ? ", emit" : "", imm >> 8);
|
||||
break;
|
||||
case sendmsg_save_wave:
|
||||
fprintf(output, " sendmsg(save_wave)");
|
||||
break;
|
||||
case sendmsg_stall_wave_gen:
|
||||
fprintf(output, " sendmsg(stall_wave_gen)");
|
||||
break;
|
||||
case sendmsg_halt_waves:
|
||||
fprintf(output, " sendmsg(halt_waves)");
|
||||
break;
|
||||
case sendmsg_ordered_ps_done:
|
||||
fprintf(output, " sendmsg(ordered_ps_done)");
|
||||
break;
|
||||
case sendmsg_early_prim_dealloc:
|
||||
fprintf(output, " sendmsg(early_prim_dealloc)");
|
||||
break;
|
||||
case sendmsg_gs_alloc_req:
|
||||
fprintf(output, " sendmsg(gs_alloc_req)");
|
||||
fprintf(output, " sendmsg(gs_done%s%s, %u)", imm & 0x10 ? ", cut" : "",
|
||||
imm & 0x20 ? ", emit" : "", imm >> 8);
|
||||
break;
|
||||
case sendmsg_save_wave: fprintf(output, " sendmsg(save_wave)"); break;
|
||||
case sendmsg_stall_wave_gen: fprintf(output, " sendmsg(stall_wave_gen)"); break;
|
||||
case sendmsg_halt_waves: fprintf(output, " sendmsg(halt_waves)"); break;
|
||||
case sendmsg_ordered_ps_done: fprintf(output, " sendmsg(ordered_ps_done)"); break;
|
||||
case sendmsg_early_prim_dealloc: fprintf(output, " sendmsg(early_prim_dealloc)"); break;
|
||||
case sendmsg_gs_alloc_req: fprintf(output, " sendmsg(gs_alloc_req)"); break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -433,40 +404,21 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
|
|||
}
|
||||
case Format::MIMG: {
|
||||
const MIMG_instruction& mimg = instr->mimg();
|
||||
unsigned identity_dmask = !instr->definitions.empty() ?
|
||||
(1 << instr->definitions[0].size()) - 1 :
|
||||
0xf;
|
||||
unsigned identity_dmask =
|
||||
!instr->definitions.empty() ? (1 << instr->definitions[0].size()) - 1 : 0xf;
|
||||
if ((mimg.dmask & identity_dmask) != identity_dmask)
|
||||
fprintf(output, " dmask:%s%s%s%s",
|
||||
mimg.dmask & 0x1 ? "x" : "",
|
||||
mimg.dmask & 0x2 ? "y" : "",
|
||||
mimg.dmask & 0x4 ? "z" : "",
|
||||
fprintf(output, " dmask:%s%s%s%s", mimg.dmask & 0x1 ? "x" : "",
|
||||
mimg.dmask & 0x2 ? "y" : "", mimg.dmask & 0x4 ? "z" : "",
|
||||
mimg.dmask & 0x8 ? "w" : "");
|
||||
switch (mimg.dim) {
|
||||
case ac_image_1d:
|
||||
fprintf(output, " 1d");
|
||||
break;
|
||||
case ac_image_2d:
|
||||
fprintf(output, " 2d");
|
||||
break;
|
||||
case ac_image_3d:
|
||||
fprintf(output, " 3d");
|
||||
break;
|
||||
case ac_image_cube:
|
||||
fprintf(output, " cube");
|
||||
break;
|
||||
case ac_image_1darray:
|
||||
fprintf(output, " 1darray");
|
||||
break;
|
||||
case ac_image_2darray:
|
||||
fprintf(output, " 2darray");
|
||||
break;
|
||||
case ac_image_2dmsaa:
|
||||
fprintf(output, " 2dmsaa");
|
||||
break;
|
||||
case ac_image_2darraymsaa:
|
||||
fprintf(output, " 2darraymsaa");
|
||||
break;
|
||||
case ac_image_1d: fprintf(output, " 1d"); break;
|
||||
case ac_image_2d: fprintf(output, " 2d"); break;
|
||||
case ac_image_3d: fprintf(output, " 3d"); break;
|
||||
case ac_image_cube: fprintf(output, " cube"); break;
|
||||
case ac_image_1darray: fprintf(output, " 1darray"); break;
|
||||
case ac_image_2darray: fprintf(output, " 2darray"); break;
|
||||
case ac_image_2dmsaa: fprintf(output, " 2dmsaa"); break;
|
||||
case ac_image_2darraymsaa: fprintf(output, " 2darraymsaa"); break;
|
||||
}
|
||||
if (mimg.unrm)
|
||||
fprintf(output, " unrm");
|
||||
|
@ -495,10 +447,8 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
|
|||
const Export_instruction& exp = instr->exp();
|
||||
unsigned identity_mask = exp.compressed ? 0x5 : 0xf;
|
||||
if ((exp.enabled_mask & identity_mask) != identity_mask)
|
||||
fprintf(output, " en:%c%c%c%c",
|
||||
exp.enabled_mask & 0x1 ? 'r' : '*',
|
||||
exp.enabled_mask & 0x2 ? 'g' : '*',
|
||||
exp.enabled_mask & 0x4 ? 'b' : '*',
|
||||
fprintf(output, " en:%c%c%c%c", exp.enabled_mask & 0x1 ? 'r' : '*',
|
||||
exp.enabled_mask & 0x2 ? 'g' : '*', exp.enabled_mask & 0x4 ? 'b' : '*',
|
||||
exp.enabled_mask & 0x8 ? 'a' : '*');
|
||||
if (exp.compressed)
|
||||
fprintf(output, " compr");
|
||||
|
@ -624,15 +574,9 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
|
|||
if (instr->isVOP3()) {
|
||||
const VOP3_instruction& vop3 = instr->vop3();
|
||||
switch (vop3.omod) {
|
||||
case 1:
|
||||
fprintf(output, " *2");
|
||||
break;
|
||||
case 2:
|
||||
fprintf(output, " *4");
|
||||
break;
|
||||
case 3:
|
||||
fprintf(output, " *0.5");
|
||||
break;
|
||||
case 1: fprintf(output, " *2"); break;
|
||||
case 2: fprintf(output, " *4"); break;
|
||||
case 3: fprintf(output, " *0.5"); break;
|
||||
}
|
||||
if (vop3.clamp)
|
||||
fprintf(output, " clamp");
|
||||
|
@ -641,8 +585,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
|
|||
} else if (instr->isDPP()) {
|
||||
const DPP_instruction& dpp = instr->dpp();
|
||||
if (dpp.dpp_ctrl <= 0xff) {
|
||||
fprintf(output, " quad_perm:[%d,%d,%d,%d]",
|
||||
dpp.dpp_ctrl & 0x3, (dpp.dpp_ctrl >> 2) & 0x3,
|
||||
fprintf(output, " quad_perm:[%d,%d,%d,%d]", dpp.dpp_ctrl & 0x3, (dpp.dpp_ctrl >> 2) & 0x3,
|
||||
(dpp.dpp_ctrl >> 4) & 0x3, (dpp.dpp_ctrl >> 6) & 0x3);
|
||||
} else if (dpp.dpp_ctrl >= 0x101 && dpp.dpp_ctrl <= 0x10f) {
|
||||
fprintf(output, " row_shl:%d", dpp.dpp_ctrl & 0xf);
|
||||
|
@ -678,21 +621,14 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
|
|||
} else if (instr->isSDWA()) {
|
||||
const SDWA_instruction& sdwa = instr->sdwa();
|
||||
switch (sdwa.omod) {
|
||||
case 1:
|
||||
fprintf(output, " *2");
|
||||
break;
|
||||
case 2:
|
||||
fprintf(output, " *4");
|
||||
break;
|
||||
case 3:
|
||||
fprintf(output, " *0.5");
|
||||
break;
|
||||
case 1: fprintf(output, " *2"); break;
|
||||
case 2: fprintf(output, " *4"); break;
|
||||
case 3: fprintf(output, " *0.5"); break;
|
||||
}
|
||||
if (sdwa.clamp)
|
||||
fprintf(output, " clamp");
|
||||
switch (sdwa.dst_sel & sdwa_asuint) {
|
||||
case sdwa_udword:
|
||||
break;
|
||||
case sdwa_udword: break;
|
||||
case sdwa_ubyte0:
|
||||
case sdwa_ubyte1:
|
||||
case sdwa_ubyte2:
|
||||
|
@ -711,7 +647,8 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output)
|
|||
}
|
||||
}
|
||||
|
||||
void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags)
|
||||
void
|
||||
aco_print_instr(const Instruction* instr, FILE* output, unsigned flags)
|
||||
{
|
||||
if (!instr->definitions.empty()) {
|
||||
for (unsigned i = 0; i < instr->definitions.size(); ++i) {
|
||||
|
@ -723,10 +660,10 @@ void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags)
|
|||
}
|
||||
fprintf(output, "%s", instr_info.name[(int)instr->opcode]);
|
||||
if (instr->operands.size()) {
|
||||
bool *const abs = (bool *)alloca(instr->operands.size() * sizeof(bool));
|
||||
bool *const neg = (bool *)alloca(instr->operands.size() * sizeof(bool));
|
||||
bool *const opsel = (bool *)alloca(instr->operands.size() * sizeof(bool));
|
||||
uint8_t *const sel = (uint8_t *)alloca(instr->operands.size() * sizeof(uint8_t));
|
||||
bool* const abs = (bool*)alloca(instr->operands.size() * sizeof(bool));
|
||||
bool* const neg = (bool*)alloca(instr->operands.size() * sizeof(bool));
|
||||
bool* const opsel = (bool*)alloca(instr->operands.size() * sizeof(bool));
|
||||
uint8_t* const sel = (uint8_t*)alloca(instr->operands.size() * sizeof(uint8_t));
|
||||
for (unsigned i = 0; i < instr->operands.size(); ++i) {
|
||||
abs[i] = false;
|
||||
neg[i] = false;
|
||||
|
@ -792,8 +729,7 @@ void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags)
|
|||
if (instr->isVOP3P()) {
|
||||
const VOP3P_instruction& vop3 = instr->vop3p();
|
||||
if ((vop3.opsel_lo & (1 << i)) || !(vop3.opsel_hi & (1 << i))) {
|
||||
fprintf(output, ".%c%c",
|
||||
vop3.opsel_lo & (1 << i) ? 'y' : 'x',
|
||||
fprintf(output, ".%c%c", vop3.opsel_lo & (1 << i) ? 'y' : 'x',
|
||||
vop3.opsel_hi & (1 << i) ? 'y' : 'x');
|
||||
}
|
||||
if (vop3.neg_lo[i] && vop3.neg_hi[i])
|
||||
|
@ -808,7 +744,8 @@ void aco_print_instr(const Instruction *instr, FILE *output, unsigned flags)
|
|||
print_instr_format_specific(instr, output);
|
||||
}
|
||||
|
||||
static void print_block_kind(uint16_t kind, FILE *output)
|
||||
static void
|
||||
print_block_kind(uint16_t kind, FILE* output)
|
||||
{
|
||||
if (kind & block_kind_uniform)
|
||||
fprintf(output, "uniform, ");
|
||||
|
@ -844,7 +781,8 @@ static void print_block_kind(uint16_t kind, FILE *output)
|
|||
fprintf(output, "export_end, ");
|
||||
}
|
||||
|
||||
static void print_stage(Stage stage, FILE *output)
|
||||
static void
|
||||
print_stage(Stage stage, FILE* output)
|
||||
{
|
||||
fprintf(output, "ACO shader stage: ");
|
||||
|
||||
|
@ -888,7 +826,8 @@ static void print_stage(Stage stage, FILE *output)
|
|||
fprintf(output, "\n");
|
||||
}
|
||||
|
||||
void aco_print_block(const Block* block, FILE *output, unsigned flags, const live& live_vars)
|
||||
void
|
||||
aco_print_block(const Block* block, FILE* output, unsigned flags, const live& live_vars)
|
||||
{
|
||||
fprintf(output, "BB%d\n", block->index);
|
||||
fprintf(output, "/* logical preds: ");
|
||||
|
@ -927,19 +866,16 @@ void aco_print_block(const Block* block, FILE *output, unsigned flags, const liv
|
|||
}
|
||||
}
|
||||
|
||||
void aco_print_program(const Program *program, FILE *output, const live& live_vars, unsigned flags)
|
||||
void
|
||||
aco_print_program(const Program* program, FILE* output, const live& live_vars, unsigned flags)
|
||||
{
|
||||
switch (program->progress) {
|
||||
case CompilationProgress::after_isel:
|
||||
fprintf(output, "After Instruction Selection:\n");
|
||||
break;
|
||||
case CompilationProgress::after_isel: fprintf(output, "After Instruction Selection:\n"); break;
|
||||
case CompilationProgress::after_spilling:
|
||||
fprintf(output, "After Spilling:\n");
|
||||
flags |= print_kill;
|
||||
break;
|
||||
case CompilationProgress::after_ra:
|
||||
fprintf(output, "After RA:\n");
|
||||
break;
|
||||
case CompilationProgress::after_ra: fprintf(output, "After RA:\n"); break;
|
||||
}
|
||||
|
||||
print_stage(program->stage, output);
|
||||
|
@ -965,9 +901,10 @@ void aco_print_program(const Program *program, FILE *output, const live& live_va
|
|||
fprintf(output, "\n");
|
||||
}
|
||||
|
||||
void aco_print_program(const Program *program, FILE *output, unsigned flags)
|
||||
void
|
||||
aco_print_program(const Program* program, FILE* output, unsigned flags)
|
||||
{
|
||||
aco_print_program(program, output, live(), flags);
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace aco
|
||||
|
|
|
@ -36,7 +36,8 @@
|
|||
|
||||
namespace aco {
|
||||
|
||||
void setup_reduce_temp(Program* program)
|
||||
void
|
||||
setup_reduce_temp(Program* program)
|
||||
{
|
||||
unsigned last_top_level_block_idx = 0;
|
||||
unsigned maxSize = 0;
|
||||
|
@ -69,7 +70,8 @@ void setup_reduce_temp(Program* program)
|
|||
if (reduceTmp_in_loop && block.loop_nest_depth == 0) {
|
||||
assert(inserted_at == (int)last_top_level_block_idx);
|
||||
|
||||
aco_ptr<Instruction> end{create_instruction<Instruction>(aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_in_loop ? 2 : 1, 0)};
|
||||
aco_ptr<Instruction> end{create_instruction<Instruction>(
|
||||
aco_opcode::p_end_linear_vgpr, Format::PSEUDO, vtmp_in_loop ? 2 : 1, 0)};
|
||||
end->operands[0] = Operand(reduceTmp);
|
||||
if (vtmp_in_loop)
|
||||
end->operands[1] = Operand(vtmp);
|
||||
|
@ -89,7 +91,7 @@ void setup_reduce_temp(Program* program)
|
|||
|
||||
std::vector<aco_ptr<Instruction>>::iterator it;
|
||||
for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
|
||||
Instruction *instr = (*it).get();
|
||||
Instruction* instr = (*it).get();
|
||||
if (instr->format != Format::PSEUDO_REDUCTION)
|
||||
continue;
|
||||
|
||||
|
@ -98,7 +100,8 @@ void setup_reduce_temp(Program* program)
|
|||
|
||||
if ((int)last_top_level_block_idx != inserted_at) {
|
||||
reduceTmp = program->allocateTmp(reduceTmp.regClass());
|
||||
aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
|
||||
aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(
|
||||
aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
|
||||
create->definitions[0] = Definition(reduceTmp);
|
||||
/* find the right place to insert this definition */
|
||||
if (last_top_level_block_idx == block.index) {
|
||||
|
@ -110,18 +113,19 @@ void setup_reduce_temp(Program* program)
|
|||
} else {
|
||||
assert(last_top_level_block_idx < block.index);
|
||||
/* insert before the branch at last top level block */
|
||||
std::vector<aco_ptr<Instruction>>& instructions = program->blocks[last_top_level_block_idx].instructions;
|
||||
instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create));
|
||||
std::vector<aco_ptr<Instruction>>& instructions =
|
||||
program->blocks[last_top_level_block_idx].instructions;
|
||||
instructions.insert(std::next(instructions.begin(), instructions.size() - 1),
|
||||
std::move(create));
|
||||
inserted_at = last_top_level_block_idx;
|
||||
}
|
||||
}
|
||||
|
||||
/* same as before, except for the vector temporary instead of the reduce temporary */
|
||||
unsigned cluster_size = instr->reduction().cluster_size;
|
||||
bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 ||
|
||||
op == fmin64 || op == fmax64 || op == umin64 ||
|
||||
op == umax64 || op == imin64 || op == imax64 ||
|
||||
op == imul64;
|
||||
bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 ||
|
||||
op == fmax64 || op == umin64 || op == umax64 || op == imin64 ||
|
||||
op == imax64 || op == imul64;
|
||||
bool gfx10_need_vtmp = op == imul8 || op == imax8 || op == imin8 || op == umin8 ||
|
||||
op == imul16 || op == imax16 || op == imin16 || op == umin16 ||
|
||||
op == iadd64;
|
||||
|
@ -138,15 +142,18 @@ void setup_reduce_temp(Program* program)
|
|||
vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0;
|
||||
if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
|
||||
vtmp = program->allocateTmp(vtmp.regClass());
|
||||
aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
|
||||
aco_ptr<Pseudo_instruction> create{create_instruction<Pseudo_instruction>(
|
||||
aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)};
|
||||
create->definitions[0] = Definition(vtmp);
|
||||
if (last_top_level_block_idx == block.index) {
|
||||
it = block.instructions.insert(it, std::move(create));
|
||||
it++;
|
||||
} else {
|
||||
assert(last_top_level_block_idx < block.index);
|
||||
std::vector<aco_ptr<Instruction>>& instructions = program->blocks[last_top_level_block_idx].instructions;
|
||||
instructions.insert(std::next(instructions.begin(), instructions.size() - 1), std::move(create));
|
||||
std::vector<aco_ptr<Instruction>>& instructions =
|
||||
program->blocks[last_top_level_block_idx].instructions;
|
||||
instructions.insert(std::next(instructions.begin(), instructions.size() - 1),
|
||||
std::move(create));
|
||||
vtmp_inserted_at = last_top_level_block_idx;
|
||||
}
|
||||
}
|
||||
|
@ -158,5 +165,4 @@ void setup_reduce_temp(Program* program)
|
|||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
}; // namespace aco
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -34,8 +34,8 @@ struct idx_ctx {
|
|||
std::vector<uint32_t> renames;
|
||||
};
|
||||
|
||||
inline
|
||||
void reindex_defs(idx_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
inline void
|
||||
reindex_defs(idx_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
{
|
||||
for (Definition& def : instr->definitions) {
|
||||
if (!def.isTemp())
|
||||
|
@ -48,8 +48,8 @@ void reindex_defs(idx_ctx& ctx, aco_ptr<Instruction>& instr)
|
|||
}
|
||||
}
|
||||
|
||||
inline
|
||||
void reindex_ops(idx_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
inline void
|
||||
reindex_ops(idx_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
{
|
||||
for (Operand& op : instr->operands) {
|
||||
if (!op.isTemp())
|
||||
|
@ -60,7 +60,8 @@ void reindex_ops(idx_ctx& ctx, aco_ptr<Instruction>& instr)
|
|||
}
|
||||
}
|
||||
|
||||
void reindex_program(idx_ctx& ctx, Program* program)
|
||||
void
|
||||
reindex_program(idx_ctx& ctx, Program* program)
|
||||
{
|
||||
ctx.renames.resize(program->peekAllocationId());
|
||||
|
||||
|
@ -88,12 +89,13 @@ void reindex_program(idx_ctx& ctx, Program* program)
|
|||
/* update program members */
|
||||
program->private_segment_buffer = Temp(ctx.renames[program->private_segment_buffer.id()],
|
||||
program->private_segment_buffer.regClass());
|
||||
program->scratch_offset = Temp(ctx.renames[program->scratch_offset.id()],
|
||||
program->scratch_offset.regClass());
|
||||
program->scratch_offset =
|
||||
Temp(ctx.renames[program->scratch_offset.id()], program->scratch_offset.regClass());
|
||||
program->temp_rc = ctx.temp_rc;
|
||||
}
|
||||
|
||||
void update_live_out(idx_ctx& ctx, std::vector<IDSet>& live_out)
|
||||
void
|
||||
update_live_out(idx_ctx& ctx, std::vector<IDSet>& live_out)
|
||||
{
|
||||
for (IDSet& set : live_out) {
|
||||
IDSet new_set;
|
||||
|
@ -105,7 +107,8 @@ void update_live_out(idx_ctx& ctx, std::vector<IDSet>& live_out)
|
|||
|
||||
} /* end namespace */
|
||||
|
||||
void reindex_ssa(Program* program)
|
||||
void
|
||||
reindex_ssa(Program* program)
|
||||
{
|
||||
idx_ctx ctx;
|
||||
reindex_program(ctx, program);
|
||||
|
@ -113,7 +116,8 @@ void reindex_ssa(Program* program)
|
|||
program->allocationID = program->temp_rc.size();
|
||||
}
|
||||
|
||||
void reindex_ssa(Program* program, std::vector<IDSet>& live_out)
|
||||
void
|
||||
reindex_ssa(Program* program, std::vector<IDSet>& live_out)
|
||||
{
|
||||
idx_ctx ctx;
|
||||
reindex_program(ctx, program);
|
||||
|
@ -122,4 +126,4 @@ void reindex_ssa(Program* program, std::vector<IDSet>& live_out)
|
|||
program->allocationID = program->temp_rc.size();
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace aco
|
||||
|
|
|
@ -34,11 +34,11 @@
|
|||
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
|
||||
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
|
||||
#define POS_EXP_WINDOW_SIZE 512
|
||||
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
|
||||
#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16)
|
||||
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
|
||||
#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16)
|
||||
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
|
||||
#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 8)
|
||||
#define POS_EXP_MAX_MOVES 512
|
||||
#define POS_EXP_MAX_MOVES 512
|
||||
|
||||
namespace aco {
|
||||
|
||||
|
@ -54,7 +54,7 @@ enum MoveResult {
|
|||
* or below a group of instruction that hardware can execute as a clause.
|
||||
*/
|
||||
struct DownwardsCursor {
|
||||
int source_idx; /* Current instruction to consider for moving */
|
||||
int source_idx; /* Current instruction to consider for moving */
|
||||
|
||||
int insert_idx_clause; /* First clause instruction */
|
||||
int insert_idx; /* First instruction *after* the clause */
|
||||
|
@ -66,11 +66,9 @@ struct DownwardsCursor {
|
|||
RegisterDemand total_demand;
|
||||
|
||||
DownwardsCursor(int current_idx, RegisterDemand initial_clause_demand)
|
||||
: source_idx(current_idx - 1),
|
||||
insert_idx_clause(current_idx),
|
||||
insert_idx(current_idx + 1),
|
||||
clause_demand(initial_clause_demand) {
|
||||
}
|
||||
: source_idx(current_idx - 1), insert_idx_clause(current_idx), insert_idx(current_idx + 1),
|
||||
clause_demand(initial_clause_demand)
|
||||
{}
|
||||
|
||||
void verify_invariants(const RegisterDemand* register_demand);
|
||||
};
|
||||
|
@ -91,18 +89,16 @@ struct UpwardsCursor {
|
|||
insert_idx = -1; /* to be initialized later */
|
||||
}
|
||||
|
||||
bool has_insert_idx() const {
|
||||
return insert_idx != -1;
|
||||
}
|
||||
bool has_insert_idx() const { return insert_idx != -1; }
|
||||
void verify_invariants(const RegisterDemand* register_demand);
|
||||
};
|
||||
|
||||
struct MoveState {
|
||||
RegisterDemand max_registers;
|
||||
|
||||
Block *block;
|
||||
Instruction *current;
|
||||
RegisterDemand *register_demand; /* demand per instruction */
|
||||
Block* block;
|
||||
Instruction* current;
|
||||
RegisterDemand* register_demand; /* demand per instruction */
|
||||
bool improved_rar;
|
||||
|
||||
std::vector<bool> depends_on;
|
||||
|
@ -143,19 +139,22 @@ struct sched_ctx {
|
|||
*/
|
||||
|
||||
template <typename T>
|
||||
void move_element(T begin_it, size_t idx, size_t before) {
|
||||
if (idx < before) {
|
||||
auto begin = std::next(begin_it, idx);
|
||||
auto end = std::next(begin_it, before);
|
||||
std::rotate(begin, begin + 1, end);
|
||||
} else if (idx > before) {
|
||||
auto begin = std::next(begin_it, before);
|
||||
auto end = std::next(begin_it, idx + 1);
|
||||
std::rotate(begin, end - 1, end);
|
||||
}
|
||||
void
|
||||
move_element(T begin_it, size_t idx, size_t before)
|
||||
{
|
||||
if (idx < before) {
|
||||
auto begin = std::next(begin_it, idx);
|
||||
auto end = std::next(begin_it, before);
|
||||
std::rotate(begin, begin + 1, end);
|
||||
} else if (idx > before) {
|
||||
auto begin = std::next(begin_it, before);
|
||||
auto end = std::next(begin_it, idx + 1);
|
||||
std::rotate(begin, end - 1, end);
|
||||
}
|
||||
}
|
||||
|
||||
void DownwardsCursor::verify_invariants(const RegisterDemand* register_demand)
|
||||
void
|
||||
DownwardsCursor::verify_invariants(const RegisterDemand* register_demand)
|
||||
{
|
||||
assert(source_idx < insert_idx_clause);
|
||||
assert(insert_idx_clause < insert_idx);
|
||||
|
@ -175,7 +174,8 @@ void DownwardsCursor::verify_invariants(const RegisterDemand* register_demand)
|
|||
#endif
|
||||
}
|
||||
|
||||
DownwardsCursor MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
|
||||
DownwardsCursor
|
||||
MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
|
||||
{
|
||||
improved_rar = improved_rar_;
|
||||
|
||||
|
@ -202,7 +202,8 @@ DownwardsCursor MoveState::downwards_init(int current_idx, bool improved_rar_, b
|
|||
/* If add_to_clause is true, the current clause is extended by moving the
|
||||
* instruction at source_idx in front of the clause. Otherwise, the instruction
|
||||
* is moved past the end of the clause without extending it */
|
||||
MoveResult MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause)
|
||||
MoveResult
|
||||
MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause)
|
||||
{
|
||||
aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
|
||||
|
||||
|
@ -211,7 +212,8 @@ MoveResult MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause
|
|||
return move_fail_ssa;
|
||||
|
||||
/* check if one of candidate's operands is killed by depending instruction */
|
||||
std::vector<bool>& RAR_deps = improved_rar ? (add_to_clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
|
||||
std::vector<bool>& RAR_deps =
|
||||
improved_rar ? (add_to_clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
|
||||
for (const Operand& op : instr->operands) {
|
||||
if (op.isTemp() && RAR_deps[op.tempId()]) {
|
||||
// FIXME: account for difference in register pressure
|
||||
|
@ -274,7 +276,8 @@ MoveResult MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause
|
|||
return move_success;
|
||||
}
|
||||
|
||||
void MoveState::downwards_skip(DownwardsCursor& cursor)
|
||||
void
|
||||
MoveState::downwards_skip(DownwardsCursor& cursor)
|
||||
{
|
||||
aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
|
||||
|
||||
|
@ -292,7 +295,9 @@ void MoveState::downwards_skip(DownwardsCursor& cursor)
|
|||
cursor.verify_invariants(register_demand);
|
||||
}
|
||||
|
||||
void UpwardsCursor::verify_invariants(const RegisterDemand* register_demand) {
|
||||
void
|
||||
UpwardsCursor::verify_invariants(const RegisterDemand* register_demand)
|
||||
{
|
||||
#ifndef NDEBUG
|
||||
if (!has_insert_idx()) {
|
||||
return;
|
||||
|
@ -308,7 +313,8 @@ void UpwardsCursor::verify_invariants(const RegisterDemand* register_demand) {
|
|||
#endif
|
||||
}
|
||||
|
||||
UpwardsCursor MoveState::upwards_init(int source_idx, bool improved_rar_)
|
||||
UpwardsCursor
|
||||
MoveState::upwards_init(int source_idx, bool improved_rar_)
|
||||
{
|
||||
improved_rar = improved_rar_;
|
||||
|
||||
|
@ -323,7 +329,8 @@ UpwardsCursor MoveState::upwards_init(int source_idx, bool improved_rar_)
|
|||
return UpwardsCursor(source_idx);
|
||||
}
|
||||
|
||||
bool MoveState::upwards_check_deps(UpwardsCursor& cursor)
|
||||
bool
|
||||
MoveState::upwards_check_deps(UpwardsCursor& cursor)
|
||||
{
|
||||
aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
|
||||
for (const Operand& op : instr->operands) {
|
||||
|
@ -333,13 +340,15 @@ bool MoveState::upwards_check_deps(UpwardsCursor& cursor)
|
|||
return true;
|
||||
}
|
||||
|
||||
void MoveState::upwards_update_insert_idx(UpwardsCursor& cursor)
|
||||
void
|
||||
MoveState::upwards_update_insert_idx(UpwardsCursor& cursor)
|
||||
{
|
||||
cursor.insert_idx = cursor.source_idx;
|
||||
cursor.total_demand = register_demand[cursor.insert_idx];
|
||||
}
|
||||
|
||||
MoveResult MoveState::upwards_move(UpwardsCursor& cursor)
|
||||
MoveResult
|
||||
MoveState::upwards_move(UpwardsCursor& cursor)
|
||||
{
|
||||
assert(cursor.has_insert_idx());
|
||||
|
||||
|
@ -355,13 +364,15 @@ MoveResult MoveState::upwards_move(UpwardsCursor& cursor)
|
|||
return move_fail_rar;
|
||||
}
|
||||
|
||||
/* check if register pressure is low enough: the diff is negative if register pressure is decreased */
|
||||
/* check if register pressure is low enough: the diff is negative if register pressure is
|
||||
* decreased */
|
||||
const RegisterDemand candidate_diff = get_live_changes(instr);
|
||||
const RegisterDemand temp = get_temp_registers(instr);
|
||||
if (RegisterDemand(cursor.total_demand + candidate_diff).exceeds(max_registers))
|
||||
return move_fail_pressure;
|
||||
const RegisterDemand temp2 = get_temp_registers(block->instructions[cursor.insert_idx - 1]);
|
||||
const RegisterDemand new_demand = register_demand[cursor.insert_idx - 1] - temp2 + candidate_diff + temp;
|
||||
const RegisterDemand new_demand =
|
||||
register_demand[cursor.insert_idx - 1] - temp2 + candidate_diff + temp;
|
||||
if (new_demand.exceeds(max_registers))
|
||||
return move_fail_pressure;
|
||||
|
||||
|
@ -385,7 +396,8 @@ MoveResult MoveState::upwards_move(UpwardsCursor& cursor)
|
|||
return move_success;
|
||||
}
|
||||
|
||||
void MoveState::upwards_skip(UpwardsCursor& cursor)
|
||||
void
|
||||
MoveState::upwards_skip(UpwardsCursor& cursor)
|
||||
{
|
||||
if (cursor.has_insert_idx()) {
|
||||
aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
|
||||
|
@ -405,30 +417,33 @@ void MoveState::upwards_skip(UpwardsCursor& cursor)
|
|||
cursor.verify_invariants(register_demand);
|
||||
}
|
||||
|
||||
bool is_gs_or_done_sendmsg(const Instruction *instr)
|
||||
bool
|
||||
is_gs_or_done_sendmsg(const Instruction* instr)
|
||||
{
|
||||
if (instr->opcode == aco_opcode::s_sendmsg) {
|
||||
uint16_t imm = instr->sopp().imm;
|
||||
return (imm & sendmsg_id_mask) == _sendmsg_gs ||
|
||||
(imm & sendmsg_id_mask) == _sendmsg_gs_done;
|
||||
return (imm & sendmsg_id_mask) == _sendmsg_gs || (imm & sendmsg_id_mask) == _sendmsg_gs_done;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool is_done_sendmsg(const Instruction *instr)
|
||||
bool
|
||||
is_done_sendmsg(const Instruction* instr)
|
||||
{
|
||||
if (instr->opcode == aco_opcode::s_sendmsg)
|
||||
return (instr->sopp().imm & sendmsg_id_mask) == _sendmsg_gs_done;
|
||||
return false;
|
||||
}
|
||||
|
||||
memory_sync_info get_sync_info_with_hack(const Instruction* instr)
|
||||
memory_sync_info
|
||||
get_sync_info_with_hack(const Instruction* instr)
|
||||
{
|
||||
memory_sync_info sync = get_sync_info(instr);
|
||||
if (instr->isSMEM() && !instr->operands.empty() && instr->operands[0].bytes() == 16) {
|
||||
// FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works
|
||||
sync.storage = (storage_class)(sync.storage | storage_buffer);
|
||||
sync.semantics = (memory_semantics)((sync.semantics | semantic_private) & ~semantic_can_reorder);
|
||||
sync.semantics =
|
||||
(memory_semantics)((sync.semantics | semantic_private) & ~semantic_can_reorder);
|
||||
}
|
||||
return sync;
|
||||
}
|
||||
|
@ -451,11 +466,13 @@ struct hazard_query {
|
|||
bool contains_sendmsg;
|
||||
bool uses_exec;
|
||||
memory_event_set mem_events;
|
||||
unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
|
||||
unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
|
||||
unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
|
||||
};
|
||||
|
||||
void init_hazard_query(hazard_query *query) {
|
||||
void
|
||||
init_hazard_query(hazard_query* query)
|
||||
{
|
||||
query->contains_spill = false;
|
||||
query->contains_sendmsg = false;
|
||||
query->uses_exec = false;
|
||||
|
@ -464,7 +481,8 @@ void init_hazard_query(hazard_query *query) {
|
|||
query->aliasing_storage_smem = 0;
|
||||
}
|
||||
|
||||
void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync)
|
||||
void
|
||||
add_memory_event(memory_event_set* set, Instruction* instr, memory_sync_info* sync)
|
||||
{
|
||||
set->has_control_barrier |= is_done_sendmsg(instr);
|
||||
if (instr->opcode == aco_opcode::p_barrier) {
|
||||
|
@ -494,7 +512,8 @@ void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_inf
|
|||
}
|
||||
}
|
||||
|
||||
void add_to_hazard_query(hazard_query *query, Instruction *instr)
|
||||
void
|
||||
add_to_hazard_query(hazard_query* query, Instruction* instr)
|
||||
{
|
||||
if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
|
||||
query->contains_spill = true;
|
||||
|
@ -507,7 +526,8 @@ void add_to_hazard_query(hazard_query *query, Instruction *instr)
|
|||
|
||||
if (!(sync.semantics & semantic_can_reorder)) {
|
||||
unsigned storage = sync.storage;
|
||||
/* images and buffer/global memory can alias */ //TODO: more precisely, buffer images and buffer/global memory can alias
|
||||
/* images and buffer/global memory can alias */ // TODO: more precisely, buffer images and
|
||||
// buffer/global memory can alias
|
||||
if (storage & (storage_buffer | storage_image))
|
||||
storage |= storage_buffer | storage_image;
|
||||
if (instr->isSMEM())
|
||||
|
@ -531,7 +551,8 @@ enum HazardResult {
|
|||
hazard_fail_unreorderable,
|
||||
};
|
||||
|
||||
HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards)
|
||||
HazardResult
|
||||
perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards)
|
||||
{
|
||||
/* don't schedule discards downwards */
|
||||
if (!upwards && instr->opcode == aco_opcode::p_exit_early_if)
|
||||
|
@ -549,10 +570,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
|
|||
return hazard_fail_export;
|
||||
|
||||
/* don't move non-reorderable instructions */
|
||||
if (instr->opcode == aco_opcode::s_memtime ||
|
||||
instr->opcode == aco_opcode::s_memrealtime ||
|
||||
instr->opcode == aco_opcode::s_setprio ||
|
||||
instr->opcode == aco_opcode::s_getreg_b32)
|
||||
if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime ||
|
||||
instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32)
|
||||
return hazard_fail_unreorderable;
|
||||
|
||||
memory_event_set instr_set;
|
||||
|
@ -560,8 +579,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
|
|||
memory_sync_info sync = get_sync_info_with_hack(instr);
|
||||
add_memory_event(&instr_set, instr, &sync);
|
||||
|
||||
memory_event_set *first = &instr_set;
|
||||
memory_event_set *second = &query->mem_events;
|
||||
memory_event_set* first = &instr_set;
|
||||
memory_event_set* second = &query->mem_events;
|
||||
if (upwards)
|
||||
std::swap(first, second);
|
||||
|
||||
|
@ -571,7 +590,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
|
|||
if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire)
|
||||
return hazard_fail_barrier;
|
||||
if (((first->access_acquire || first->bar_acquire) && second->bar_classes) ||
|
||||
((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic)))
|
||||
((first->access_acquire | first->bar_acquire) &
|
||||
(second->access_relaxed | second->access_atomic)))
|
||||
return hazard_fail_barrier;
|
||||
|
||||
/* everything before barrier(release) happens before the atomics/control_barriers after *
|
||||
|
@ -580,7 +600,8 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
|
|||
if (first->bar_release && (second->has_control_barrier || second->access_atomic))
|
||||
return hazard_fail_barrier;
|
||||
if ((first->bar_classes && (second->bar_release || second->access_release)) ||
|
||||
((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release)))
|
||||
((first->access_relaxed | first->access_atomic) &
|
||||
(second->bar_release | second->access_release)))
|
||||
return hazard_fail_barrier;
|
||||
|
||||
/* don't move memory barriers around other memory barriers */
|
||||
|
@ -589,14 +610,15 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
|
|||
|
||||
/* Don't move memory accesses to before control barriers. I don't think
|
||||
* this is necessary for the Vulkan memory model, but it might be for GLSL450. */
|
||||
unsigned control_classes = storage_buffer | storage_atomic_counter | storage_image | storage_shared;
|
||||
if (first->has_control_barrier && ((second->access_atomic | second->access_relaxed) & control_classes))
|
||||
unsigned control_classes =
|
||||
storage_buffer | storage_atomic_counter | storage_image | storage_shared;
|
||||
if (first->has_control_barrier &&
|
||||
((second->access_atomic | second->access_relaxed) & control_classes))
|
||||
return hazard_fail_barrier;
|
||||
|
||||
/* don't move memory loads/stores past potentially aliasing loads/stores */
|
||||
unsigned aliasing_storage = instr->isSMEM() ?
|
||||
query->aliasing_storage_smem :
|
||||
query->aliasing_storage;
|
||||
unsigned aliasing_storage =
|
||||
instr->isSMEM() ? query->aliasing_storage_smem : query->aliasing_storage;
|
||||
if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) {
|
||||
unsigned intersect = sync.storage & aliasing_storage;
|
||||
if (intersect & storage_shared)
|
||||
|
@ -614,9 +636,9 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool
|
|||
return hazard_success;
|
||||
}
|
||||
|
||||
void schedule_SMEM(sched_ctx& ctx, Block* block,
|
||||
std::vector<RegisterDemand>& register_demand,
|
||||
Instruction* current, int idx)
|
||||
void
|
||||
schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
|
||||
Instruction* current, int idx)
|
||||
{
|
||||
assert(idx != 0);
|
||||
int window_size = SMEM_WINDOW_SIZE;
|
||||
|
@ -634,30 +656,37 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
|
|||
|
||||
DownwardsCursor cursor = ctx.mv.downwards_init(idx, false, false);
|
||||
|
||||
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
|
||||
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
|
||||
candidate_idx--) {
|
||||
assert(candidate_idx >= 0);
|
||||
assert(candidate_idx == cursor.source_idx);
|
||||
aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
|
||||
|
||||
/* break if we'd make the previous SMEM instruction stall */
|
||||
bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
|
||||
bool can_stall_prev_smem =
|
||||
idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
|
||||
if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
|
||||
break;
|
||||
|
||||
/* break when encountering another MEM instruction, logical_start or barriers */
|
||||
if (candidate->opcode == aco_opcode::p_logical_start)
|
||||
break;
|
||||
/* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves to help create more vmem clauses */
|
||||
if (candidate->isVMEM() && (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) || current->operands[0].size() == 4))
|
||||
/* only move VMEM instructions below descriptor loads. be more aggressive at higher num_waves
|
||||
* to help create more vmem clauses */
|
||||
if (candidate->isVMEM() && (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
|
||||
current->operands[0].size() == 4))
|
||||
break;
|
||||
/* don't move descriptor loads below buffer loads */
|
||||
if (candidate->format == Format::SMEM && current->operands[0].size() == 4 && candidate->operands[0].size() == 2)
|
||||
if (candidate->format == Format::SMEM && current->operands[0].size() == 4 &&
|
||||
candidate->operands[0].size() == 2)
|
||||
break;
|
||||
|
||||
bool can_move_down = true;
|
||||
|
||||
HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
|
||||
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export)
|
||||
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
|
||||
haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
|
||||
haz == hazard_fail_export)
|
||||
can_move_down = false;
|
||||
else if (haz != hazard_success)
|
||||
break;
|
||||
|
@ -689,9 +718,10 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
|
|||
|
||||
bool found_dependency = false;
|
||||
/* second, check if we have instructions after current to move up */
|
||||
for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
|
||||
for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size;
|
||||
candidate_idx++) {
|
||||
assert(candidate_idx == up_cursor.source_idx);
|
||||
assert(candidate_idx < (int) block->instructions.size());
|
||||
assert(candidate_idx < (int)block->instructions.size());
|
||||
aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
|
||||
|
||||
if (candidate->opcode == aco_opcode::p_logical_end)
|
||||
|
@ -748,9 +778,9 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
|
|||
ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
|
||||
}
|
||||
|
||||
void schedule_VMEM(sched_ctx& ctx, Block* block,
|
||||
std::vector<RegisterDemand>& register_demand,
|
||||
Instruction* current, int idx)
|
||||
void
|
||||
schedule_VMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
|
||||
Instruction* current, int idx)
|
||||
{
|
||||
assert(idx != 0);
|
||||
int window_size = VMEM_WINDOW_SIZE;
|
||||
|
@ -767,7 +797,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
|
|||
|
||||
DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true);
|
||||
|
||||
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
|
||||
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
|
||||
candidate_idx--) {
|
||||
assert(candidate_idx == cursor.source_idx);
|
||||
assert(candidate_idx >= 0);
|
||||
aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
|
||||
|
@ -778,7 +809,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
|
|||
break;
|
||||
|
||||
/* break if we'd make the previous SMEM instruction stall */
|
||||
bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
|
||||
bool can_stall_prev_smem =
|
||||
idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
|
||||
if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
|
||||
break;
|
||||
|
||||
|
@ -787,14 +819,15 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
|
|||
int grab_dist = cursor.insert_idx_clause - candidate_idx;
|
||||
/* We can't easily tell how much this will decrease the def-to-use
|
||||
* distances, so just use how far it will be moved as a heuristic. */
|
||||
part_of_clause = grab_dist < clause_max_grab_dist &&
|
||||
should_form_clause(current, candidate.get());
|
||||
part_of_clause =
|
||||
grab_dist < clause_max_grab_dist && should_form_clause(current, candidate.get());
|
||||
}
|
||||
|
||||
/* if current depends on candidate, add additional dependencies and continue */
|
||||
bool can_move_down = !is_vmem || part_of_clause;
|
||||
|
||||
HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
|
||||
HazardResult haz =
|
||||
perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
|
||||
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
|
||||
haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
|
||||
haz == hazard_fail_export)
|
||||
|
@ -809,7 +842,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
|
|||
continue;
|
||||
}
|
||||
|
||||
Instruction *candidate_ptr = candidate.get();
|
||||
Instruction* candidate_ptr = candidate.get();
|
||||
MoveResult res = ctx.mv.downwards_move(cursor, part_of_clause);
|
||||
if (res == move_fail_ssa || res == move_fail_rar) {
|
||||
add_to_hazard_query(&indep_hq, candidate.get());
|
||||
|
@ -832,9 +865,10 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
|
|||
|
||||
bool found_dependency = false;
|
||||
/* second, check if we have instructions after current to move up */
|
||||
for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
|
||||
for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size;
|
||||
candidate_idx++) {
|
||||
assert(candidate_idx == up_cursor.source_idx);
|
||||
assert(candidate_idx < (int) block->instructions.size());
|
||||
assert(candidate_idx < (int)block->instructions.size());
|
||||
aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
|
||||
bool is_vmem = candidate->isVMEM() || candidate->isFlatLike();
|
||||
|
||||
|
@ -889,9 +923,9 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
|
|||
}
|
||||
}
|
||||
|
||||
void schedule_position_export(sched_ctx& ctx, Block* block,
|
||||
std::vector<RegisterDemand>& register_demand,
|
||||
Instruction* current, int idx)
|
||||
void
|
||||
schedule_position_export(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
|
||||
Instruction* current, int idx)
|
||||
{
|
||||
assert(idx != 0);
|
||||
int window_size = POS_EXP_WINDOW_SIZE;
|
||||
|
@ -904,7 +938,8 @@ void schedule_position_export(sched_ctx& ctx, Block* block,
|
|||
init_hazard_query(&hq);
|
||||
add_to_hazard_query(&hq, current);
|
||||
|
||||
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
|
||||
for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
|
||||
candidate_idx--) {
|
||||
assert(candidate_idx >= 0);
|
||||
aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
|
||||
|
||||
|
@ -935,7 +970,8 @@ void schedule_position_export(sched_ctx& ctx, Block* block,
|
|||
}
|
||||
}
|
||||
|
||||
void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
|
||||
void
|
||||
schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
|
||||
{
|
||||
ctx.last_SMEM_dep_idx = 0;
|
||||
ctx.last_SMEM_stall = INT16_MIN;
|
||||
|
@ -950,7 +986,8 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v
|
|||
unsigned target = current->exp().dest;
|
||||
if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) {
|
||||
ctx.mv.current = current;
|
||||
schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
|
||||
schedule_position_export(ctx, block, live_vars.register_demand[block->index], current,
|
||||
idx);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -975,8 +1012,8 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
void schedule_program(Program *program, live& live_vars)
|
||||
void
|
||||
schedule_program(Program* program, live& live_vars)
|
||||
{
|
||||
/* don't use program->max_reg_demand because that is affected by max_waves_per_simd */
|
||||
RegisterDemand demand;
|
||||
|
@ -991,7 +1028,7 @@ void schedule_program(Program *program, live& live_vars)
|
|||
/* Allowing the scheduler to reduce the number of waves to as low as 5
|
||||
* improves performance of Thrones of Britannia significantly and doesn't
|
||||
* seem to hurt anything else. */
|
||||
//TODO: account for possible uneven num_waves on GFX10+
|
||||
// TODO: account for possible uneven num_waves on GFX10+
|
||||
unsigned wave_fac = program->dev.physical_vgprs / 256;
|
||||
if (program->num_waves <= 5 * wave_fac)
|
||||
ctx.num_waves = program->num_waves;
|
||||
|
@ -1008,8 +1045,8 @@ void schedule_program(Program *program, live& live_vars)
|
|||
ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1);
|
||||
|
||||
assert(ctx.num_waves > 0);
|
||||
ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
|
||||
int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};
|
||||
ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
|
||||
int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};
|
||||
|
||||
for (Block& block : program->blocks)
|
||||
schedule_block(ctx, program, &block, live_vars);
|
||||
|
@ -1021,8 +1058,8 @@ void schedule_program(Program *program, live& live_vars)
|
|||
}
|
||||
update_vgpr_sgpr_demand(program, new_demand);
|
||||
|
||||
/* if enabled, this code asserts that register_demand is updated correctly */
|
||||
#if 0
|
||||
/* if enabled, this code asserts that register_demand is updated correctly */
|
||||
#if 0
|
||||
int prev_num_waves = program->num_waves;
|
||||
const RegisterDemand prev_max_demand = program->max_reg_demand;
|
||||
|
||||
|
@ -1042,7 +1079,7 @@ void schedule_program(Program *program, live& live_vars)
|
|||
|
||||
assert(program->max_reg_demand == prev_max_demand);
|
||||
assert(program->num_waves == prev_num_waves);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace aco
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -37,7 +37,8 @@ struct phi_info_item {
|
|||
};
|
||||
|
||||
struct ssa_elimination_ctx {
|
||||
/* The outer vectors should be indexed by block index. The inner vectors store phi information for each block. */
|
||||
/* The outer vectors should be indexed by block index. The inner vectors store phi information
|
||||
* for each block. */
|
||||
std::vector<std::vector<phi_info_item>> logical_phi_info;
|
||||
std::vector<std::vector<phi_info_item>> linear_phi_info;
|
||||
std::vector<bool> empty_blocks;
|
||||
|
@ -45,14 +46,14 @@ struct ssa_elimination_ctx {
|
|||
Program* program;
|
||||
|
||||
ssa_elimination_ctx(Program* program_)
|
||||
: logical_phi_info(program_->blocks.size())
|
||||
, linear_phi_info(program_->blocks.size())
|
||||
, empty_blocks(program_->blocks.size(), true)
|
||||
, blocks_incoming_exec_used(program_->blocks.size(), true)
|
||||
, program(program_) {}
|
||||
: logical_phi_info(program_->blocks.size()), linear_phi_info(program_->blocks.size()),
|
||||
empty_blocks(program_->blocks.size(), true),
|
||||
blocks_incoming_exec_used(program_->blocks.size(), true), program(program_)
|
||||
{}
|
||||
};
|
||||
|
||||
void collect_phi_info(ssa_elimination_ctx& ctx)
|
||||
void
|
||||
collect_phi_info(ssa_elimination_ctx& ctx)
|
||||
{
|
||||
for (Block& block : ctx.program->blocks) {
|
||||
for (aco_ptr<Instruction>& phi : block.instructions) {
|
||||
|
@ -67,9 +68,11 @@ void collect_phi_info(ssa_elimination_ctx& ctx)
|
|||
|
||||
assert(phi->definitions[0].size() == phi->operands[i].size());
|
||||
|
||||
std::vector<unsigned>& preds = phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
|
||||
std::vector<unsigned>& preds =
|
||||
phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
|
||||
uint32_t pred_idx = preds[i];
|
||||
auto& info_vec = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info[pred_idx] : ctx.linear_phi_info[pred_idx];
|
||||
auto& info_vec = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info[pred_idx]
|
||||
: ctx.linear_phi_info[pred_idx];
|
||||
info_vec.push_back({phi->definitions[0], phi->operands[i]});
|
||||
ctx.empty_blocks[pred_idx] = false;
|
||||
}
|
||||
|
@ -77,11 +80,12 @@ void collect_phi_info(ssa_elimination_ctx& ctx)
|
|||
}
|
||||
}
|
||||
|
||||
void insert_parallelcopies(ssa_elimination_ctx& ctx)
|
||||
void
|
||||
insert_parallelcopies(ssa_elimination_ctx& ctx)
|
||||
{
|
||||
/* insert the parallelcopies from logical phis before p_logical_end */
|
||||
for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {
|
||||
auto &logical_phi_info = ctx.logical_phi_info[block_idx];
|
||||
auto& logical_phi_info = ctx.logical_phi_info[block_idx];
|
||||
if (logical_phi_info.empty())
|
||||
continue;
|
||||
|
||||
|
@ -93,10 +97,11 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx)
|
|||
}
|
||||
|
||||
std::vector<aco_ptr<Instruction>>::iterator it = std::next(block.instructions.begin(), idx);
|
||||
aco_ptr<Pseudo_instruction> pc{create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, logical_phi_info.size(), logical_phi_info.size())};
|
||||
aco_ptr<Pseudo_instruction> pc{
|
||||
create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO,
|
||||
logical_phi_info.size(), logical_phi_info.size())};
|
||||
unsigned i = 0;
|
||||
for (auto& phi_info : logical_phi_info)
|
||||
{
|
||||
for (auto& phi_info : logical_phi_info) {
|
||||
pc->definitions[i] = phi_info.def;
|
||||
pc->operands[i] = phi_info.op;
|
||||
i++;
|
||||
|
@ -108,7 +113,7 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx)
|
|||
|
||||
/* insert parallelcopies for the linear phis at the end of blocks just before the branch */
|
||||
for (unsigned block_idx = 0; block_idx < ctx.program->blocks.size(); ++block_idx) {
|
||||
auto &linear_phi_info = ctx.linear_phi_info[block_idx];
|
||||
auto& linear_phi_info = ctx.linear_phi_info[block_idx];
|
||||
if (linear_phi_info.empty())
|
||||
continue;
|
||||
|
||||
|
@ -116,10 +121,11 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx)
|
|||
std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();
|
||||
--it;
|
||||
assert((*it)->isBranch());
|
||||
aco_ptr<Pseudo_instruction> pc{create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, linear_phi_info.size(), linear_phi_info.size())};
|
||||
aco_ptr<Pseudo_instruction> pc{
|
||||
create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO,
|
||||
linear_phi_info.size(), linear_phi_info.size())};
|
||||
unsigned i = 0;
|
||||
for (auto& phi_info : linear_phi_info)
|
||||
{
|
||||
for (auto& phi_info : linear_phi_info) {
|
||||
pc->definitions[i] = phi_info.def;
|
||||
pc->operands[i] = phi_info.op;
|
||||
i++;
|
||||
|
@ -130,38 +136,38 @@ void insert_parallelcopies(ssa_elimination_ctx& ctx)
|
|||
}
|
||||
}
|
||||
|
||||
bool is_empty_block(Block* block, bool ignore_exec_writes)
|
||||
bool
|
||||
is_empty_block(Block* block, bool ignore_exec_writes)
|
||||
{
|
||||
/* check if this block is empty and the exec mask is not needed */
|
||||
for (aco_ptr<Instruction>& instr : block->instructions) {
|
||||
switch (instr->opcode) {
|
||||
case aco_opcode::p_linear_phi:
|
||||
case aco_opcode::p_phi:
|
||||
case aco_opcode::p_logical_start:
|
||||
case aco_opcode::p_logical_end:
|
||||
case aco_opcode::p_branch:
|
||||
case aco_opcode::p_linear_phi:
|
||||
case aco_opcode::p_phi:
|
||||
case aco_opcode::p_logical_start:
|
||||
case aco_opcode::p_logical_end:
|
||||
case aco_opcode::p_branch: break;
|
||||
case aco_opcode::p_parallelcopy:
|
||||
for (unsigned i = 0; i < instr->definitions.size(); i++) {
|
||||
if (ignore_exec_writes && instr->definitions[i].physReg() == exec)
|
||||
continue;
|
||||
if (instr->definitions[i].physReg() != instr->operands[i].physReg())
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case aco_opcode::s_andn2_b64:
|
||||
case aco_opcode::s_andn2_b32:
|
||||
if (ignore_exec_writes && instr->definitions[0].physReg() == exec)
|
||||
break;
|
||||
case aco_opcode::p_parallelcopy:
|
||||
for (unsigned i = 0; i < instr->definitions.size(); i++) {
|
||||
if (ignore_exec_writes && instr->definitions[i].physReg() == exec)
|
||||
continue;
|
||||
if (instr->definitions[i].physReg() != instr->operands[i].physReg())
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case aco_opcode::s_andn2_b64:
|
||||
case aco_opcode::s_andn2_b32:
|
||||
if (ignore_exec_writes && instr->definitions[0].physReg() == exec)
|
||||
break;
|
||||
return false;
|
||||
default:
|
||||
return false;
|
||||
return false;
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block)
|
||||
void
|
||||
try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block)
|
||||
{
|
||||
/* check if the successor is another merge block which restores exec */
|
||||
// TODO: divergent loops also restore exec
|
||||
|
@ -179,7 +185,8 @@ void try_remove_merge_block(ssa_elimination_ctx& ctx, Block* block)
|
|||
block->instructions.emplace_back(std::move(branch));
|
||||
}
|
||||
|
||||
void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
|
||||
void
|
||||
try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
|
||||
{
|
||||
assert(block->linear_succs.size() == 2);
|
||||
/* only remove this block if the successor got removed as well */
|
||||
|
@ -193,7 +200,7 @@ void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
|
|||
unsigned succ_idx = block->linear_succs[0];
|
||||
assert(block->linear_preds.size() == 2);
|
||||
for (unsigned i = 0; i < 2; i++) {
|
||||
Block *pred = &ctx.program->blocks[block->linear_preds[i]];
|
||||
Block* pred = &ctx.program->blocks[block->linear_preds[i]];
|
||||
pred->linear_succs[0] = succ_idx;
|
||||
ctx.program->blocks[succ_idx].linear_preds[i] = pred->index;
|
||||
|
||||
|
@ -208,7 +215,8 @@ void try_remove_invert_block(ssa_elimination_ctx& ctx, Block* block)
|
|||
block->linear_succs.clear();
|
||||
}
|
||||
|
||||
void try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)
|
||||
void
|
||||
try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)
|
||||
{
|
||||
if (!is_empty_block(block, false))
|
||||
return;
|
||||
|
@ -277,7 +285,8 @@ void try_remove_simple_block(ssa_elimination_ctx& ctx, Block* block)
|
|||
block->linear_succs.clear();
|
||||
}
|
||||
|
||||
bool instr_writes_exec(Instruction* instr)
|
||||
bool
|
||||
instr_writes_exec(Instruction* instr)
|
||||
{
|
||||
for (Definition& def : instr->definitions)
|
||||
if (def.physReg() == exec || def.physReg() == exec_hi)
|
||||
|
@ -286,7 +295,8 @@ bool instr_writes_exec(Instruction* instr)
|
|||
return false;
|
||||
}
|
||||
|
||||
void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block)
|
||||
void
|
||||
eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block)
|
||||
{
|
||||
/* Check if any successor needs the outgoing exec mask from the current block. */
|
||||
|
||||
|
@ -309,8 +319,9 @@ void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& blo
|
|||
exec_write_used = false;
|
||||
else
|
||||
/* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */
|
||||
exec_write_used = std::any_of(block.linear_succs.begin(), block.linear_succs.end(),
|
||||
[&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; });
|
||||
exec_write_used =
|
||||
std::any_of(block.linear_succs.begin(), block.linear_succs.end(),
|
||||
[&ctx](int succ_idx) { return ctx.blocks_incoming_exec_used[succ_idx]; });
|
||||
}
|
||||
|
||||
/* Go through all instructions and eliminate useless exec writes. */
|
||||
|
@ -318,7 +329,8 @@ void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& blo
|
|||
for (int i = block.instructions.size() - 1; i >= 0; --i) {
|
||||
aco_ptr<Instruction>& instr = block.instructions[i];
|
||||
|
||||
/* We already take information from phis into account before the loop, so let's just break on phis. */
|
||||
/* We already take information from phis into account before the loop, so let's just break on
|
||||
* phis. */
|
||||
if (instr->opcode == aco_opcode::p_linear_phi || instr->opcode == aco_opcode::p_phi)
|
||||
break;
|
||||
|
||||
|
@ -341,16 +353,15 @@ void eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& blo
|
|||
}
|
||||
|
||||
/* Remember if the current block needs an incoming exec mask from its predecessors. */
|
||||
|
||||
ctx.blocks_incoming_exec_used[block.index] = exec_write_used;
|
||||
|
||||
/* Cleanup: remove deleted instructions from the vector. */
|
||||
|
||||
auto new_end = std::remove(block.instructions.begin(), block.instructions.end(), nullptr);
|
||||
block.instructions.resize(new_end - block.instructions.begin());
|
||||
}
|
||||
|
||||
void jump_threading(ssa_elimination_ctx& ctx)
|
||||
void
|
||||
jump_threading(ssa_elimination_ctx& ctx)
|
||||
{
|
||||
for (int i = ctx.program->blocks.size() - 1; i >= 0; i--) {
|
||||
Block* block = &ctx.program->blocks[i];
|
||||
|
@ -367,8 +378,7 @@ void jump_threading(ssa_elimination_ctx& ctx)
|
|||
if (block->linear_succs.size() > 1)
|
||||
continue;
|
||||
|
||||
if (block->kind & block_kind_merge ||
|
||||
block->kind & block_kind_loop_exit)
|
||||
if (block->kind & block_kind_merge || block->kind & block_kind_loop_exit)
|
||||
try_remove_merge_block(ctx, block);
|
||||
|
||||
if (block->linear_preds.size() == 1)
|
||||
|
@ -378,8 +388,8 @@ void jump_threading(ssa_elimination_ctx& ctx)
|
|||
|
||||
} /* end namespace */
|
||||
|
||||
|
||||
void ssa_elimination(Program* program)
|
||||
void
|
||||
ssa_elimination(Program* program)
|
||||
{
|
||||
ssa_elimination_ctx ctx(program);
|
||||
|
||||
|
@ -391,6 +401,5 @@ void ssa_elimination(Program* program)
|
|||
|
||||
/* insert parallelcopies from SSA elimination */
|
||||
insert_parallelcopies(ctx);
|
||||
|
||||
}
|
||||
}
|
||||
} // namespace aco
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
*/
|
||||
|
||||
#include "aco_ir.h"
|
||||
|
||||
#include "util/crc32.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
@ -33,7 +34,8 @@
|
|||
namespace aco {
|
||||
|
||||
/* sgpr_presched/vgpr_presched */
|
||||
void collect_presched_stats(Program *program)
|
||||
void
|
||||
collect_presched_stats(Program* program)
|
||||
{
|
||||
RegisterDemand presched_demand;
|
||||
for (Block& block : program->blocks)
|
||||
|
@ -56,9 +58,9 @@ public:
|
|||
resource_count,
|
||||
};
|
||||
|
||||
BlockCycleEstimator(Program *program_) : program(program_) {}
|
||||
BlockCycleEstimator(Program* program_) : program(program_) {}
|
||||
|
||||
Program *program;
|
||||
Program* program;
|
||||
|
||||
int32_t cur_cycle = 0;
|
||||
int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0};
|
||||
|
@ -72,6 +74,7 @@ public:
|
|||
unsigned predict_cost(aco_ptr<Instruction>& instr);
|
||||
void add(aco_ptr<Instruction>& instr);
|
||||
void join(const BlockCycleEstimator& other);
|
||||
|
||||
private:
|
||||
unsigned get_waitcnt_cost(wait_imm imm);
|
||||
unsigned get_dependency_cost(aco_ptr<Instruction>& instr);
|
||||
|
@ -81,8 +84,9 @@ private:
|
|||
};
|
||||
|
||||
struct wait_counter_info {
|
||||
wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_) :
|
||||
vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {}
|
||||
wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_)
|
||||
: vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
|
||||
{}
|
||||
|
||||
unsigned vm;
|
||||
unsigned exp;
|
||||
|
@ -100,107 +104,83 @@ struct perf_info {
|
|||
unsigned cost1;
|
||||
};
|
||||
|
||||
static perf_info get_perf_info(Program *program, aco_ptr<Instruction>& instr)
|
||||
static perf_info
|
||||
get_perf_info(Program* program, aco_ptr<Instruction>& instr)
|
||||
{
|
||||
instr_class cls = instr_info.classes[(int)instr->opcode];
|
||||
|
||||
#define WAIT(res) BlockCycleEstimator::res, 0
|
||||
#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt
|
||||
#define WAIT(res) BlockCycleEstimator::res, 0
|
||||
#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt
|
||||
|
||||
if (program->chip_class >= GFX10) {
|
||||
/* fp64 might be incorrect */
|
||||
switch (cls) {
|
||||
case instr_class::valu32:
|
||||
case instr_class::valu_convert32:
|
||||
case instr_class::valu_fma:
|
||||
return {5, WAIT_USE(valu, 1)};
|
||||
case instr_class::valu64:
|
||||
return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};
|
||||
case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)};
|
||||
case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};
|
||||
case instr_class::valu_quarter_rate32:
|
||||
return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)};
|
||||
case instr_class::valu_transcendental32:
|
||||
return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)};
|
||||
case instr_class::valu_double:
|
||||
return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
|
||||
case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
|
||||
case instr_class::valu_double_add:
|
||||
return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
|
||||
case instr_class::valu_double_convert:
|
||||
return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
|
||||
case instr_class::valu_double_transcendental:
|
||||
return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
|
||||
case instr_class::salu:
|
||||
return {2, WAIT_USE(scalar, 1)};
|
||||
case instr_class::smem:
|
||||
return {0, WAIT_USE(scalar, 1)};
|
||||
case instr_class::salu: return {2, WAIT_USE(scalar, 1)};
|
||||
case instr_class::smem: return {0, WAIT_USE(scalar, 1)};
|
||||
case instr_class::branch:
|
||||
case instr_class::sendmsg:
|
||||
return {0, WAIT_USE(branch_sendmsg, 1)};
|
||||
case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 1)};
|
||||
case instr_class::ds:
|
||||
return instr->ds().gds ?
|
||||
perf_info{0, WAIT_USE(export_gds, 1)} :
|
||||
perf_info{0, WAIT_USE(lds, 1)};
|
||||
case instr_class::exp:
|
||||
return {0, WAIT_USE(export_gds, 1)};
|
||||
case instr_class::vmem:
|
||||
return {0, WAIT_USE(vmem, 1)};
|
||||
return instr->ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)}
|
||||
: perf_info{0, WAIT_USE(lds, 1)};
|
||||
case instr_class::exp: return {0, WAIT_USE(export_gds, 1)};
|
||||
case instr_class::vmem: return {0, WAIT_USE(vmem, 1)};
|
||||
case instr_class::barrier:
|
||||
case instr_class::waitcnt:
|
||||
case instr_class::other:
|
||||
default:
|
||||
return {0};
|
||||
default: return {0};
|
||||
}
|
||||
} else {
|
||||
switch (cls) {
|
||||
case instr_class::valu32:
|
||||
return {4, WAIT_USE(valu, 4)};
|
||||
case instr_class::valu_convert32:
|
||||
return {16, WAIT_USE(valu, 16)};
|
||||
case instr_class::valu64:
|
||||
return {8, WAIT_USE(valu, 8)};
|
||||
case instr_class::valu_quarter_rate32:
|
||||
return {16, WAIT_USE(valu, 16)};
|
||||
case instr_class::valu32: return {4, WAIT_USE(valu, 4)};
|
||||
case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)};
|
||||
case instr_class::valu64: return {8, WAIT_USE(valu, 8)};
|
||||
case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)};
|
||||
case instr_class::valu_fma:
|
||||
return program->dev.has_fast_fma32 ?
|
||||
perf_info{4, WAIT_USE(valu, 4)} :
|
||||
perf_info{16, WAIT_USE(valu, 16)};
|
||||
case instr_class::valu_transcendental32:
|
||||
return {16, WAIT_USE(valu, 16)};
|
||||
case instr_class::valu_double:
|
||||
return {64, WAIT_USE(valu, 64)};
|
||||
case instr_class::valu_double_add:
|
||||
return {32, WAIT_USE(valu, 32)};
|
||||
case instr_class::valu_double_convert:
|
||||
return {16, WAIT_USE(valu, 16)};
|
||||
case instr_class::valu_double_transcendental:
|
||||
return {64, WAIT_USE(valu, 64)};
|
||||
case instr_class::salu:
|
||||
return {4, WAIT_USE(scalar, 4)};
|
||||
case instr_class::smem:
|
||||
return {4, WAIT_USE(scalar, 4)};
|
||||
return program->dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)}
|
||||
: perf_info{16, WAIT_USE(valu, 16)};
|
||||
case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)};
|
||||
case instr_class::valu_double: return {64, WAIT_USE(valu, 64)};
|
||||
case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)};
|
||||
case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)};
|
||||
case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)};
|
||||
case instr_class::salu: return {4, WAIT_USE(scalar, 4)};
|
||||
case instr_class::smem: return {4, WAIT_USE(scalar, 4)};
|
||||
case instr_class::branch:
|
||||
return {8, WAIT_USE(branch_sendmsg, 8)};
|
||||
return {4, WAIT_USE(branch_sendmsg, 4)};
|
||||
case instr_class::ds:
|
||||
return instr->ds().gds ?
|
||||
perf_info{4, WAIT_USE(export_gds, 4)} :
|
||||
perf_info{4, WAIT_USE(lds, 4)};
|
||||
case instr_class::exp:
|
||||
return {16, WAIT_USE(export_gds, 16)};
|
||||
case instr_class::vmem:
|
||||
return {4, WAIT_USE(vmem, 4)};
|
||||
return instr->ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)}
|
||||
: perf_info{4, WAIT_USE(lds, 4)};
|
||||
case instr_class::exp: return {16, WAIT_USE(export_gds, 16)};
|
||||
case instr_class::vmem: return {4, WAIT_USE(vmem, 4)};
|
||||
case instr_class::barrier:
|
||||
case instr_class::waitcnt:
|
||||
case instr_class::other:
|
||||
default:
|
||||
return {4};
|
||||
default: return {4};
|
||||
}
|
||||
}
|
||||
|
||||
#undef WAIT_USE
|
||||
#undef WAIT
|
||||
#undef WAIT_USE
|
||||
#undef WAIT
|
||||
}
|
||||
|
||||
void BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
|
||||
void
|
||||
BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
|
||||
{
|
||||
perf_info perf = get_perf_info(program, instr);
|
||||
|
||||
|
@ -215,7 +195,8 @@ void BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
|
|||
}
|
||||
}
|
||||
|
||||
int32_t BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
|
||||
int32_t
|
||||
BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
|
||||
{
|
||||
perf_info perf = get_perf_info(program, instr);
|
||||
|
||||
|
@ -228,7 +209,8 @@ int32_t BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& in
|
|||
return cost;
|
||||
}
|
||||
|
||||
static wait_counter_info get_wait_counter_info(aco_ptr<Instruction>& instr)
|
||||
static wait_counter_info
|
||||
get_wait_counter_info(aco_ptr<Instruction>& instr)
|
||||
{
|
||||
/* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance
|
||||
* depends a lot on the situation. */
|
||||
|
@ -252,8 +234,8 @@ static wait_counter_info get_wait_counter_info(aco_ptr<Instruction>& instr)
|
|||
|
||||
bool likely_desc_load = instr->operands[0].size() == 2;
|
||||
bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
|
||||
bool const_offset = instr->operands[1].isConstant() &&
|
||||
(!soe || instr->operands.back().isConstant());
|
||||
bool const_offset =
|
||||
instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant());
|
||||
|
||||
if (likely_desc_load || const_offset)
|
||||
return wait_counter_info(0, 0, 30, 0); /* likely to hit L0 cache */
|
||||
|
@ -273,7 +255,8 @@ static wait_counter_info get_wait_counter_info(aco_ptr<Instruction>& instr)
|
|||
return wait_counter_info(0, 0, 0, 0);
|
||||
}
|
||||
|
||||
static wait_imm get_wait_imm(Program *program, aco_ptr<Instruction>& instr)
|
||||
static wait_imm
|
||||
get_wait_imm(Program* program, aco_ptr<Instruction>& instr)
|
||||
{
|
||||
if (instr->opcode == aco_opcode::s_endpgm) {
|
||||
return wait_imm(0, 0, 0, 0);
|
||||
|
@ -297,7 +280,8 @@ static wait_imm get_wait_imm(Program *program, aco_ptr<Instruction>& instr)
|
|||
}
|
||||
}
|
||||
|
||||
unsigned BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
|
||||
unsigned
|
||||
BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
|
||||
{
|
||||
int deps_available = cur_cycle;
|
||||
|
||||
|
@ -337,13 +321,15 @@ unsigned BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
|
|||
return deps_available - cur_cycle;
|
||||
}
|
||||
|
||||
unsigned BlockCycleEstimator::predict_cost(aco_ptr<Instruction>& instr)
|
||||
unsigned
|
||||
BlockCycleEstimator::predict_cost(aco_ptr<Instruction>& instr)
|
||||
{
|
||||
int32_t dep = get_dependency_cost(instr);
|
||||
return dep + std::max(cycles_until_res_available(instr) - dep, 0);
|
||||
}
|
||||
|
||||
static bool is_vector(aco_opcode op)
|
||||
static bool
|
||||
is_vector(aco_opcode op)
|
||||
{
|
||||
switch (instr_info.classes[(int)op]) {
|
||||
case instr_class::valu32:
|
||||
|
@ -358,14 +344,13 @@ static bool is_vector(aco_opcode op)
|
|||
case instr_class::exp:
|
||||
case instr_class::valu64:
|
||||
case instr_class::valu_quarter_rate32:
|
||||
case instr_class::valu_transcendental32:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
case instr_class::valu_transcendental32: return true;
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
|
||||
void BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
|
||||
void
|
||||
BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
|
||||
{
|
||||
perf_info perf = get_perf_info(program, instr);
|
||||
|
||||
|
@ -411,13 +396,14 @@ void BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
|
|||
int32_t result_available = start + MAX2(perf.latency, latency);
|
||||
|
||||
for (Definition& def : instr->definitions) {
|
||||
int32_t *available = &reg_available[def.physReg().reg()];
|
||||
int32_t* available = &reg_available[def.physReg().reg()];
|
||||
for (unsigned i = 0; i < def.size(); i++)
|
||||
available[i] = MAX2(available[i], result_available);
|
||||
}
|
||||
}
|
||||
|
||||
static void join_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pred, int cycle_diff)
|
||||
static void
|
||||
join_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pred, int cycle_diff)
|
||||
{
|
||||
for (unsigned i = 0; i < MIN2(queue.size(), pred.size()); i++)
|
||||
queue.rbegin()[i] = MAX2(queue.rbegin()[i], pred.rbegin()[i] + cycle_diff);
|
||||
|
@ -425,7 +411,8 @@ static void join_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pr
|
|||
queue.push_front(pred[i] + cycle_diff);
|
||||
}
|
||||
|
||||
void BlockCycleEstimator::join(const BlockCycleEstimator& pred)
|
||||
void
|
||||
BlockCycleEstimator::join(const BlockCycleEstimator& pred)
|
||||
{
|
||||
assert(cur_cycle == 0);
|
||||
|
||||
|
@ -435,8 +422,7 @@ void BlockCycleEstimator::join(const BlockCycleEstimator& pred)
|
|||
}
|
||||
|
||||
for (unsigned i = 0; i < 512; i++)
|
||||
reg_available[i] = MAX2(reg_available[i],
|
||||
pred.reg_available[i] - pred.cur_cycle + cur_cycle);
|
||||
reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle);
|
||||
|
||||
join_queue(lgkm, pred.lgkm, -pred.cur_cycle);
|
||||
join_queue(exp, pred.exp, -pred.cur_cycle);
|
||||
|
@ -445,11 +431,12 @@ void BlockCycleEstimator::join(const BlockCycleEstimator& pred)
|
|||
}
|
||||
|
||||
/* instructions/branches/vmem_clauses/smem_clauses/cycles */
|
||||
void collect_preasm_stats(Program *program)
|
||||
void
|
||||
collect_preasm_stats(Program* program)
|
||||
{
|
||||
for (Block& block : program->blocks) {
|
||||
std::set<Instruction *> vmem_clause;
|
||||
std::set<Instruction *> smem_clause;
|
||||
std::set<Instruction*> vmem_clause;
|
||||
std::set<Instruction*> smem_clause;
|
||||
|
||||
program->statistics[statistic_instructions] += block.instructions.size();
|
||||
|
||||
|
@ -462,7 +449,8 @@ void collect_preasm_stats(Program *program)
|
|||
|
||||
if (instr->isVMEM() && !instr->operands.empty()) {
|
||||
if (std::none_of(vmem_clause.begin(), vmem_clause.end(),
|
||||
[&](Instruction *other) {return should_form_clause(instr.get(), other);}))
|
||||
[&](Instruction* other)
|
||||
{ return should_form_clause(instr.get(), other); }))
|
||||
program->statistics[statistic_vmem_clauses]++;
|
||||
vmem_clause.insert(instr.get());
|
||||
} else {
|
||||
|
@ -471,12 +459,13 @@ void collect_preasm_stats(Program *program)
|
|||
|
||||
if (instr->isSMEM() && !instr->operands.empty()) {
|
||||
if (std::none_of(smem_clause.begin(), smem_clause.end(),
|
||||
[&](Instruction *other) {return should_form_clause(instr.get(), other);}))
|
||||
[&](Instruction* other)
|
||||
{ return should_form_clause(instr.get(), other); }))
|
||||
program->statistics[statistic_smem_clauses]++;
|
||||
smem_clause.insert(instr.get());
|
||||
} else {
|
||||
smem_clause.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -514,8 +503,10 @@ void collect_preasm_stats(Program *program)
|
|||
iter *= pow(0.5, block.uniform_if_depth);
|
||||
iter *= pow(0.75, block.divergent_if_logical_depth);
|
||||
|
||||
bool divergent_if_linear_else = block.logical_preds.empty() && block.linear_preds.size() == 1 && block.linear_succs.size() == 1 &&
|
||||
program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);
|
||||
bool divergent_if_linear_else =
|
||||
block.logical_preds.empty() && block.linear_preds.size() == 1 &&
|
||||
block.linear_succs.size() == 1 &&
|
||||
program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);
|
||||
if (divergent_if_linear_else)
|
||||
iter *= 0.25;
|
||||
|
||||
|
@ -540,7 +531,8 @@ void collect_preasm_stats(Program *program)
|
|||
|
||||
double max_utilization = 1.0;
|
||||
if (program->workgroup_size != UINT_MAX)
|
||||
max_utilization = program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
|
||||
max_utilization =
|
||||
program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
|
||||
wave64_per_cycle *= max_utilization;
|
||||
|
||||
program->statistics[statistic_latency] = round(latency);
|
||||
|
@ -551,7 +543,8 @@ void collect_preasm_stats(Program *program)
|
|||
|
||||
fprintf(stderr, "num_waves: %u\n", program->num_waves);
|
||||
fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]);
|
||||
fprintf(stderr, "branch_sendmsg_usage: %f\n", usage[(int)BlockCycleEstimator::branch_sendmsg]);
|
||||
fprintf(stderr, "branch_sendmsg_usage: %f\n",
|
||||
usage[(int)BlockCycleEstimator::branch_sendmsg]);
|
||||
fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]);
|
||||
fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]);
|
||||
fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]);
|
||||
|
@ -565,9 +558,10 @@ void collect_preasm_stats(Program *program)
|
|||
}
|
||||
}
|
||||
|
||||
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code)
|
||||
void
|
||||
collect_postasm_stats(Program* program, const std::vector<uint32_t>& code)
|
||||
{
|
||||
program->statistics[aco::statistic_hash] = util_hash_crc32(code.data(), code.size() * 4);
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace aco
|
||||
|
|
|
@ -35,207 +35,198 @@
|
|||
namespace aco {
|
||||
|
||||
/*! \brief Definition of a span object
|
||||
*
|
||||
* \details A "span" is an "array view" type for holding a view of contiguous
|
||||
* data. The "span" object does not own the data itself.
|
||||
*/
|
||||
template <typename T>
|
||||
class span {
|
||||
*
|
||||
* \details A "span" is an "array view" type for holding a view of contiguous
|
||||
* data. The "span" object does not own the data itself.
|
||||
*/
|
||||
template <typename T> class span {
|
||||
public:
|
||||
using value_type = T;
|
||||
using pointer = value_type*;
|
||||
using const_pointer = const value_type*;
|
||||
using reference = value_type&;
|
||||
using const_reference = const value_type&;
|
||||
using iterator = pointer;
|
||||
using const_iterator = const_pointer;
|
||||
using reverse_iterator = std::reverse_iterator<iterator>;
|
||||
using value_type = T;
|
||||
using pointer = value_type*;
|
||||
using const_pointer = const value_type*;
|
||||
using reference = value_type&;
|
||||
using const_reference = const value_type&;
|
||||
using iterator = pointer;
|
||||
using const_iterator = const_pointer;
|
||||
using reverse_iterator = std::reverse_iterator<iterator>;
|
||||
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
|
||||
using size_type = uint16_t;
|
||||
using difference_type = ptrdiff_t;
|
||||
using size_type = uint16_t;
|
||||
using difference_type = ptrdiff_t;
|
||||
|
||||
/*! \brief Compiler generated default constructor
|
||||
*/
|
||||
*/
|
||||
constexpr span() = default;
|
||||
|
||||
/*! \brief Constructor taking a pointer and the length of the span
|
||||
* \param[in] data Pointer to the underlying data array
|
||||
* \param[in] length The size of the span
|
||||
*/
|
||||
constexpr span(uint16_t offset_, const size_type length_)
|
||||
: offset{ offset_ } , length{ length_ } {}
|
||||
* \param[in] data Pointer to the underlying data array
|
||||
* \param[in] length The size of the span
|
||||
*/
|
||||
constexpr span(uint16_t offset_, const size_type length_) : offset{offset_}, length{length_} {}
|
||||
|
||||
/*! \brief Returns an iterator to the begin of the span
|
||||
* \return data
|
||||
*/
|
||||
constexpr iterator begin() noexcept {
|
||||
return (pointer)((uintptr_t)this + offset);
|
||||
}
|
||||
* \return data
|
||||
*/
|
||||
constexpr iterator begin() noexcept { return (pointer)((uintptr_t)this + offset); }
|
||||
|
||||
/*! \brief Returns a const_iterator to the begin of the span
|
||||
* \return data
|
||||
*/
|
||||
constexpr const_iterator begin() const noexcept {
|
||||
* \return data
|
||||
*/
|
||||
constexpr const_iterator begin() const noexcept
|
||||
{
|
||||
return (const_pointer)((uintptr_t)this + offset);
|
||||
}
|
||||
|
||||
/*! \brief Returns an iterator to the end of the span
|
||||
* \return data + length
|
||||
*/
|
||||
constexpr iterator end() noexcept {
|
||||
return std::next(begin(), length);
|
||||
}
|
||||
* \return data + length
|
||||
*/
|
||||
constexpr iterator end() noexcept { return std::next(begin(), length); }
|
||||
|
||||
/*! \brief Returns a const_iterator to the end of the span
|
||||
* \return data + length
|
||||
*/
|
||||
constexpr const_iterator end() const noexcept {
|
||||
return std::next(begin(), length);
|
||||
}
|
||||
* \return data + length
|
||||
*/
|
||||
constexpr const_iterator end() const noexcept { return std::next(begin(), length); }
|
||||
|
||||
/*! \brief Returns a const_iterator to the begin of the span
|
||||
* \return data
|
||||
*/
|
||||
constexpr const_iterator cbegin() const noexcept {
|
||||
return begin();
|
||||
}
|
||||
* \return data
|
||||
*/
|
||||
constexpr const_iterator cbegin() const noexcept { return begin(); }
|
||||
|
||||
/*! \brief Returns a const_iterator to the end of the span
|
||||
* \return data + length
|
||||
*/
|
||||
constexpr const_iterator cend() const noexcept {
|
||||
return std::next(begin(), length);
|
||||
}
|
||||
* \return data + length
|
||||
*/
|
||||
constexpr const_iterator cend() const noexcept { return std::next(begin(), length); }
|
||||
|
||||
/*! \brief Returns a reverse_iterator to the end of the span
|
||||
* \return reverse_iterator(end())
|
||||
*/
|
||||
constexpr reverse_iterator rbegin() noexcept {
|
||||
return reverse_iterator(end());
|
||||
}
|
||||
* \return reverse_iterator(end())
|
||||
*/
|
||||
constexpr reverse_iterator rbegin() noexcept { return reverse_iterator(end()); }
|
||||
|
||||
/*! \brief Returns a const_reverse_iterator to the end of the span
|
||||
* \return reverse_iterator(end())
|
||||
*/
|
||||
constexpr const_reverse_iterator rbegin() const noexcept {
|
||||
* \return reverse_iterator(end())
|
||||
*/
|
||||
constexpr const_reverse_iterator rbegin() const noexcept
|
||||
{
|
||||
return const_reverse_iterator(end());
|
||||
}
|
||||
|
||||
/*! \brief Returns a reverse_iterator to the begin of the span
|
||||
* \return reverse_iterator(begin())
|
||||
*/
|
||||
constexpr reverse_iterator rend() noexcept {
|
||||
return reverse_iterator(begin());
|
||||
}
|
||||
* \return reverse_iterator(begin())
|
||||
*/
|
||||
constexpr reverse_iterator rend() noexcept { return reverse_iterator(begin()); }
|
||||
|
||||
/*! \brief Returns a const_reverse_iterator to the begin of the span
|
||||
* \return reverse_iterator(begin())
|
||||
*/
|
||||
constexpr const_reverse_iterator rend() const noexcept {
|
||||
* \return reverse_iterator(begin())
|
||||
*/
|
||||
constexpr const_reverse_iterator rend() const noexcept
|
||||
{
|
||||
return const_reverse_iterator(begin());
|
||||
}
|
||||
|
||||
/*! \brief Returns a const_reverse_iterator to the end of the span
|
||||
* \return rbegin()
|
||||
*/
|
||||
constexpr const_reverse_iterator crbegin() const noexcept {
|
||||
* \return rbegin()
|
||||
*/
|
||||
constexpr const_reverse_iterator crbegin() const noexcept
|
||||
{
|
||||
return const_reverse_iterator(cend());
|
||||
}
|
||||
|
||||
/*! \brief Returns a const_reverse_iterator to the begin of the span
|
||||
* \return rend()
|
||||
*/
|
||||
constexpr const_reverse_iterator crend() const noexcept {
|
||||
* \return rend()
|
||||
*/
|
||||
constexpr const_reverse_iterator crend() const noexcept
|
||||
{
|
||||
return const_reverse_iterator(cbegin());
|
||||
}
|
||||
|
||||
/*! \brief Unchecked access operator
|
||||
* \param[in] index Index of the element we want to access
|
||||
* \return *(std::next(data, index))
|
||||
*/
|
||||
constexpr reference operator[](const size_type index) noexcept {
|
||||
* \param[in] index Index of the element we want to access
|
||||
* \return *(std::next(data, index))
|
||||
*/
|
||||
constexpr reference operator[](const size_type index) noexcept
|
||||
{
|
||||
assert(length > index);
|
||||
return *(std::next(begin(), index));
|
||||
}
|
||||
|
||||
/*! \brief Unchecked const access operator
|
||||
* \param[in] index Index of the element we want to access
|
||||
* \return *(std::next(data, index))
|
||||
*/
|
||||
constexpr const_reference operator[](const size_type index) const noexcept {
|
||||
* \param[in] index Index of the element we want to access
|
||||
* \return *(std::next(data, index))
|
||||
*/
|
||||
constexpr const_reference operator[](const size_type index) const noexcept
|
||||
{
|
||||
assert(length > index);
|
||||
return *(std::next(begin(), index));
|
||||
}
|
||||
|
||||
/*! \brief Returns a reference to the last element of the span
|
||||
* \return *(std::next(data, length - 1))
|
||||
*/
|
||||
constexpr reference back() noexcept {
|
||||
* \return *(std::next(data, length - 1))
|
||||
*/
|
||||
constexpr reference back() noexcept
|
||||
{
|
||||
assert(length > 0);
|
||||
return *(std::next(begin(), length - 1));
|
||||
}
|
||||
|
||||
/*! \brief Returns a const_reference to the last element of the span
|
||||
* \return *(std::next(data, length - 1))
|
||||
*/
|
||||
constexpr const_reference back() const noexcept {
|
||||
* \return *(std::next(data, length - 1))
|
||||
*/
|
||||
constexpr const_reference back() const noexcept
|
||||
{
|
||||
assert(length > 0);
|
||||
return *(std::next(begin(), length - 1));
|
||||
}
|
||||
|
||||
/*! \brief Returns a reference to the first element of the span
|
||||
* \return *begin()
|
||||
*/
|
||||
constexpr reference front() noexcept {
|
||||
* \return *begin()
|
||||
*/
|
||||
constexpr reference front() noexcept
|
||||
{
|
||||
assert(length > 0);
|
||||
return *begin();
|
||||
}
|
||||
|
||||
/*! \brief Returns a const_reference to the first element of the span
|
||||
* \return *cbegin()
|
||||
*/
|
||||
constexpr const_reference front() const noexcept {
|
||||
* \return *cbegin()
|
||||
*/
|
||||
constexpr const_reference front() const noexcept
|
||||
{
|
||||
assert(length > 0);
|
||||
return *cbegin();
|
||||
}
|
||||
|
||||
/*! \brief Returns true if the span is empty
|
||||
* \return length == 0
|
||||
*/
|
||||
constexpr bool empty() const noexcept {
|
||||
return length == 0;
|
||||
}
|
||||
* \return length == 0
|
||||
*/
|
||||
constexpr bool empty() const noexcept { return length == 0; }
|
||||
|
||||
/*! \brief Returns the size of the span
|
||||
* \return length == 0
|
||||
*/
|
||||
constexpr size_type size() const noexcept {
|
||||
return length;
|
||||
}
|
||||
* \return length == 0
|
||||
*/
|
||||
constexpr size_type size() const noexcept { return length; }
|
||||
|
||||
/*! \brief Decreases the size of the span by 1
|
||||
*/
|
||||
constexpr void pop_back() noexcept {
|
||||
*/
|
||||
constexpr void pop_back() noexcept
|
||||
{
|
||||
assert(length > 0);
|
||||
--length;
|
||||
}
|
||||
|
||||
/*! \brief Adds an element to the end of the span
|
||||
*/
|
||||
constexpr void push_back(const_reference val) noexcept {
|
||||
*std::next(begin(), length++) = val;
|
||||
}
|
||||
*/
|
||||
constexpr void push_back(const_reference val) noexcept { *std::next(begin(), length++) = val; }
|
||||
|
||||
/*! \brief Clears the span
|
||||
*/
|
||||
constexpr void clear() noexcept {
|
||||
*/
|
||||
constexpr void clear() noexcept
|
||||
{
|
||||
offset = 0;
|
||||
length = 0;
|
||||
}
|
||||
|
||||
private:
|
||||
uint16_t offset{ 0 }; //!> Byte offset from span to data
|
||||
size_type length{ 0 }; //!> Size of the span
|
||||
uint16_t offset{0}; //!> Byte offset from span to data
|
||||
size_type length{0}; //!> Size of the span
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -250,30 +241,32 @@ private:
|
|||
*/
|
||||
struct IDSet {
|
||||
struct Iterator {
|
||||
const IDSet *set;
|
||||
const IDSet* set;
|
||||
union {
|
||||
struct {
|
||||
uint32_t bit:6;
|
||||
uint32_t word:26;
|
||||
uint32_t bit : 6;
|
||||
uint32_t word : 26;
|
||||
};
|
||||
uint32_t id;
|
||||
};
|
||||
|
||||
Iterator& operator ++();
|
||||
Iterator& operator++();
|
||||
|
||||
bool operator != (const Iterator& other) const;
|
||||
bool operator!=(const Iterator& other) const;
|
||||
|
||||
uint32_t operator * () const;
|
||||
uint32_t operator*() const;
|
||||
};
|
||||
|
||||
size_t count(uint32_t id) const {
|
||||
size_t count(uint32_t id) const
|
||||
{
|
||||
if (id >= words.size() * 64)
|
||||
return 0;
|
||||
|
||||
return words[id / 64u] & (1ull << (id % 64u)) ? 1 : 0;
|
||||
}
|
||||
|
||||
Iterator find(uint32_t id) const {
|
||||
Iterator find(uint32_t id) const
|
||||
{
|
||||
if (!count(id))
|
||||
return end();
|
||||
|
||||
|
@ -284,7 +277,8 @@ struct IDSet {
|
|||
return it;
|
||||
}
|
||||
|
||||
std::pair<Iterator, bool> insert(uint32_t id) {
|
||||
std::pair<Iterator, bool> insert(uint32_t id)
|
||||
{
|
||||
if (words.size() * 64u <= id)
|
||||
words.resize(id / 64u + 1);
|
||||
|
||||
|
@ -302,7 +296,8 @@ struct IDSet {
|
|||
return std::make_pair(it, true);
|
||||
}
|
||||
|
||||
size_t erase(uint32_t id) {
|
||||
size_t erase(uint32_t id)
|
||||
{
|
||||
if (!count(id))
|
||||
return 0;
|
||||
|
||||
|
@ -311,7 +306,8 @@ struct IDSet {
|
|||
return 1;
|
||||
}
|
||||
|
||||
Iterator cbegin() const {
|
||||
Iterator cbegin() const
|
||||
{
|
||||
Iterator it;
|
||||
it.set = this;
|
||||
for (size_t i = 0; i < words.size(); i++) {
|
||||
|
@ -324,7 +320,8 @@ struct IDSet {
|
|||
return end();
|
||||
}
|
||||
|
||||
Iterator cend() const {
|
||||
Iterator cend() const
|
||||
{
|
||||
Iterator it;
|
||||
it.set = this;
|
||||
it.word = words.size();
|
||||
|
@ -332,27 +329,21 @@ struct IDSet {
|
|||
return it;
|
||||
}
|
||||
|
||||
Iterator begin() const {
|
||||
return cbegin();
|
||||
}
|
||||
Iterator begin() const { return cbegin(); }
|
||||
|
||||
Iterator end() const {
|
||||
return cend();
|
||||
}
|
||||
Iterator end() const { return cend(); }
|
||||
|
||||
bool empty() const {
|
||||
return bits_set == 0;
|
||||
}
|
||||
bool empty() const { return bits_set == 0; }
|
||||
|
||||
size_t size() const {
|
||||
return bits_set;
|
||||
}
|
||||
size_t size() const { return bits_set; }
|
||||
|
||||
std::vector<uint64_t> words;
|
||||
uint32_t bits_set = 0;
|
||||
};
|
||||
|
||||
inline IDSet::Iterator& IDSet::Iterator::operator ++() {
|
||||
inline IDSet::Iterator&
|
||||
IDSet::Iterator::operator++()
|
||||
{
|
||||
uint64_t m = set->words[word];
|
||||
m &= ~((2ull << bit) - 1ull);
|
||||
if (!m) {
|
||||
|
@ -374,12 +365,16 @@ inline IDSet::Iterator& IDSet::Iterator::operator ++() {
|
|||
return *this;
|
||||
}
|
||||
|
||||
inline bool IDSet::Iterator::operator != (const IDSet::Iterator& other) const {
|
||||
inline bool
|
||||
IDSet::Iterator::operator!=(const IDSet::Iterator& other) const
|
||||
{
|
||||
assert(set == other.set);
|
||||
return id != other.id;
|
||||
}
|
||||
|
||||
inline uint32_t IDSet::Iterator::operator * () const {
|
||||
inline uint32_t
|
||||
IDSet::Iterator::operator*() const
|
||||
{
|
||||
return (word << 6) | bit;
|
||||
}
|
||||
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
*/
|
||||
|
||||
#include "aco_ir.h"
|
||||
|
||||
#include "util/memstream.h"
|
||||
|
||||
#include <array>
|
||||
|
@ -32,11 +33,11 @@
|
|||
|
||||
namespace aco {
|
||||
|
||||
static void aco_log(Program *program, enum radv_compiler_debug_level level,
|
||||
const char *prefix, const char *file, unsigned line,
|
||||
const char *fmt, va_list args)
|
||||
static void
|
||||
aco_log(Program* program, enum radv_compiler_debug_level level, const char* prefix,
|
||||
const char* file, unsigned line, const char* fmt, va_list args)
|
||||
{
|
||||
char *msg;
|
||||
char* msg;
|
||||
|
||||
if (program->debug.shorten_messages) {
|
||||
msg = ralloc_vasprintf(NULL, fmt, args);
|
||||
|
@ -55,38 +56,39 @@ static void aco_log(Program *program, enum radv_compiler_debug_level level,
|
|||
ralloc_free(msg);
|
||||
}
|
||||
|
||||
void _aco_perfwarn(Program *program, const char *file, unsigned line,
|
||||
const char *fmt, ...)
|
||||
void
|
||||
_aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...)
|
||||
{
|
||||
va_list args;
|
||||
|
||||
va_start(args, fmt);
|
||||
aco_log(program, RADV_COMPILER_DEBUG_LEVEL_PERFWARN,
|
||||
"ACO PERFWARN:\n", file, line, fmt, args);
|
||||
aco_log(program, RADV_COMPILER_DEBUG_LEVEL_PERFWARN, "ACO PERFWARN:\n", file, line, fmt, args);
|
||||
va_end(args);
|
||||
}
|
||||
|
||||
void _aco_err(Program *program, const char *file, unsigned line,
|
||||
const char *fmt, ...)
|
||||
void
|
||||
_aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...)
|
||||
{
|
||||
va_list args;
|
||||
|
||||
va_start(args, fmt);
|
||||
aco_log(program, RADV_COMPILER_DEBUG_LEVEL_ERROR,
|
||||
"ACO ERROR:\n", file, line, fmt, args);
|
||||
aco_log(program, RADV_COMPILER_DEBUG_LEVEL_ERROR, "ACO ERROR:\n", file, line, fmt, args);
|
||||
va_end(args);
|
||||
}
|
||||
|
||||
bool validate_ir(Program* program)
|
||||
bool
|
||||
validate_ir(Program* program)
|
||||
{
|
||||
bool is_valid = true;
|
||||
auto check = [&program, &is_valid](bool success, const char * msg, aco::Instruction * instr) -> void {
|
||||
auto check = [&program, &is_valid](bool success, const char* msg,
|
||||
aco::Instruction* instr) -> void
|
||||
{
|
||||
if (!success) {
|
||||
char *out;
|
||||
char* out;
|
||||
size_t outsize;
|
||||
struct u_memstream mem;
|
||||
u_memstream_open(&mem, &out, &outsize);
|
||||
FILE *const memf = u_memstream_get(&mem);
|
||||
FILE* const memf = u_memstream_get(&mem);
|
||||
|
||||
fprintf(memf, "%s: ", msg);
|
||||
aco_print_instr(instr, memf);
|
||||
|
@ -99,7 +101,9 @@ bool validate_ir(Program* program)
|
|||
}
|
||||
};
|
||||
|
||||
auto check_block = [&program, &is_valid](bool success, const char * msg, aco::Block * block) -> void {
|
||||
auto check_block = [&program, &is_valid](bool success, const char* msg,
|
||||
aco::Block* block) -> void
|
||||
{
|
||||
if (!success) {
|
||||
aco_err(program, "%s: BB%u", msg, block->index);
|
||||
is_valid = false;
|
||||
|
@ -132,32 +136,32 @@ bool validate_ir(Program* program)
|
|||
base_format = Format::VINTRP;
|
||||
}
|
||||
}
|
||||
check(base_format == instr_info.format[(int)instr->opcode], "Wrong base format for instruction", instr.get());
|
||||
check(base_format == instr_info.format[(int)instr->opcode],
|
||||
"Wrong base format for instruction", instr.get());
|
||||
|
||||
/* check VOP3 modifiers */
|
||||
if (instr->isVOP3() && instr->format != Format::VOP3) {
|
||||
check(base_format == Format::VOP2 ||
|
||||
base_format == Format::VOP1 ||
|
||||
base_format == Format::VOPC ||
|
||||
base_format == Format::VINTRP,
|
||||
check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
|
||||
base_format == Format::VOPC || base_format == Format::VINTRP,
|
||||
"Format cannot have VOP3/VOP3B applied", instr.get());
|
||||
}
|
||||
|
||||
/* check SDWA */
|
||||
if (instr->isSDWA()) {
|
||||
check(base_format == Format::VOP2 ||
|
||||
base_format == Format::VOP1 ||
|
||||
base_format == Format::VOPC,
|
||||
check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
|
||||
base_format == Format::VOPC,
|
||||
"Format cannot have SDWA applied", instr.get());
|
||||
|
||||
check(program->chip_class >= GFX8, "SDWA is GFX8+ only", instr.get());
|
||||
|
||||
SDWA_instruction& sdwa = instr->sdwa();
|
||||
check(sdwa.omod == 0 || program->chip_class >= GFX9, "SDWA omod only supported on GFX9+", instr.get());
|
||||
check(sdwa.omod == 0 || program->chip_class >= GFX9,
|
||||
"SDWA omod only supported on GFX9+", instr.get());
|
||||
if (base_format == Format::VOPC) {
|
||||
check(sdwa.clamp == false || program->chip_class == GFX8, "SDWA VOPC clamp only supported on GFX8", instr.get());
|
||||
check(sdwa.clamp == false || program->chip_class == GFX8,
|
||||
"SDWA VOPC clamp only supported on GFX8", instr.get());
|
||||
check((instr->definitions[0].isFixed() && instr->definitions[0].physReg() == vcc) ||
|
||||
program->chip_class >= GFX9,
|
||||
program->chip_class >= GFX9,
|
||||
"SDWA+VOPC definition must be fixed to vcc on GFX8", instr.get());
|
||||
}
|
||||
|
||||
|
@ -171,8 +175,7 @@ bool validate_ir(Program* program)
|
|||
}
|
||||
|
||||
const bool sdwa_opcodes =
|
||||
instr->opcode != aco_opcode::v_fmac_f32 &&
|
||||
instr->opcode != aco_opcode::v_fmac_f16 &&
|
||||
instr->opcode != aco_opcode::v_fmac_f32 && instr->opcode != aco_opcode::v_fmac_f16 &&
|
||||
instr->opcode != aco_opcode::v_fmamk_f32 &&
|
||||
instr->opcode != aco_opcode::v_fmaak_f32 &&
|
||||
instr->opcode != aco_opcode::v_fmamk_f16 &&
|
||||
|
@ -186,67 +189,75 @@ bool validate_ir(Program* program)
|
|||
|
||||
const bool feature_mac =
|
||||
program->chip_class == GFX8 &&
|
||||
(instr->opcode == aco_opcode::v_mac_f32 &&
|
||||
instr->opcode == aco_opcode::v_mac_f16);
|
||||
(instr->opcode == aco_opcode::v_mac_f32 && instr->opcode == aco_opcode::v_mac_f16);
|
||||
|
||||
check(sdwa_opcodes || feature_mac, "SDWA can't be used with this opcode", instr.get());
|
||||
|
||||
if (instr->definitions[0].regClass().is_subdword())
|
||||
check((sdwa.dst_sel & sdwa_asuint) == (sdwa_isra | instr->definitions[0].bytes()), "Unexpected SDWA sel for sub-dword definition", instr.get());
|
||||
check((sdwa.dst_sel & sdwa_asuint) == (sdwa_isra | instr->definitions[0].bytes()),
|
||||
"Unexpected SDWA sel for sub-dword definition", instr.get());
|
||||
}
|
||||
|
||||
/* check opsel */
|
||||
if (instr->isVOP3()) {
|
||||
VOP3_instruction& vop3 = instr->vop3();
|
||||
check(vop3.opsel == 0 || program->chip_class >= GFX9, "Opsel is only supported on GFX9+", instr.get());
|
||||
check(vop3.opsel == 0 || program->chip_class >= GFX9,
|
||||
"Opsel is only supported on GFX9+", instr.get());
|
||||
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
if (i >= instr->operands.size() ||
|
||||
(instr->operands[i].hasRegClass() && instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed()))
|
||||
(instr->operands[i].hasRegClass() &&
|
||||
instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed()))
|
||||
check((vop3.opsel & (1 << i)) == 0, "Unexpected opsel for operand", instr.get());
|
||||
}
|
||||
if (instr->definitions[0].regClass().is_subdword() && !instr->definitions[0].isFixed())
|
||||
check((vop3.opsel & (1 << 3)) == 0, "Unexpected opsel for sub-dword definition", instr.get());
|
||||
check((vop3.opsel & (1 << 3)) == 0, "Unexpected opsel for sub-dword definition",
|
||||
instr.get());
|
||||
}
|
||||
|
||||
/* check for undefs */
|
||||
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
||||
if (instr->operands[i].isUndefined()) {
|
||||
bool flat = instr->isFlatLike();
|
||||
bool can_be_undef = is_phi(instr) || instr->isEXP() ||
|
||||
instr->isReduction() ||
|
||||
bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() ||
|
||||
instr->opcode == aco_opcode::p_create_vector ||
|
||||
(flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) ||
|
||||
((instr->isMUBUF() || instr->isMTBUF()) && i == 1);
|
||||
check(can_be_undef, "Undefs can only be used in certain operands", instr.get());
|
||||
} else {
|
||||
check(instr->operands[i].isFixed() || instr->operands[i].isTemp() || instr->operands[i].isConstant(), "Uninitialized Operand", instr.get());
|
||||
check(instr->operands[i].isFixed() || instr->operands[i].isTemp() ||
|
||||
instr->operands[i].isConstant(),
|
||||
"Uninitialized Operand", instr.get());
|
||||
}
|
||||
}
|
||||
|
||||
/* check subdword definitions */
|
||||
for (unsigned i = 0; i < instr->definitions.size(); i++) {
|
||||
if (instr->definitions[i].regClass().is_subdword())
|
||||
check(instr->isPseudo() || instr->definitions[i].bytes() <= 4, "Only Pseudo instructions can write subdword registers larger than 4 bytes", instr.get());
|
||||
check(instr->isPseudo() || instr->definitions[i].bytes() <= 4,
|
||||
"Only Pseudo instructions can write subdword registers larger than 4 bytes",
|
||||
instr.get());
|
||||
}
|
||||
|
||||
if (instr->isSALU() || instr->isVALU()) {
|
||||
/* check literals */
|
||||
Operand literal(s1);
|
||||
for (unsigned i = 0; i < instr->operands.size(); i++)
|
||||
{
|
||||
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
||||
Operand op = instr->operands[i];
|
||||
if (!op.isLiteral())
|
||||
continue;
|
||||
|
||||
check(!instr->isDPP() && !instr->isSDWA() &&
|
||||
(!instr->isVOP3() || program->chip_class >= GFX10) &&
|
||||
(!instr->isVOP3P() || program->chip_class >= GFX10),
|
||||
(!instr->isVOP3() || program->chip_class >= GFX10) &&
|
||||
(!instr->isVOP3P() || program->chip_class >= GFX10),
|
||||
"Literal applied on wrong instruction format", instr.get());
|
||||
|
||||
check(literal.isUndefined() || (literal.size() == op.size() && literal.constantValue() == op.constantValue()), "Only 1 Literal allowed", instr.get());
|
||||
check(literal.isUndefined() || (literal.size() == op.size() &&
|
||||
literal.constantValue() == op.constantValue()),
|
||||
"Only 1 Literal allowed", instr.get());
|
||||
literal = op;
|
||||
check(instr->isSALU() || instr->isVOP3() || instr->isVOP3P() || i == 0 || i == 2, "Wrong source position for Literal argument", instr.get());
|
||||
check(instr->isSALU() || instr->isVOP3() || instr->isVOP3P() || i == 0 || i == 2,
|
||||
"Wrong source position for Literal argument", instr.get());
|
||||
}
|
||||
|
||||
/* check num sgprs for VALU */
|
||||
|
@ -264,8 +275,7 @@ bool validate_ir(Program* program)
|
|||
else if (instr->isDPP())
|
||||
scalar_mask = 0x0;
|
||||
|
||||
if (instr->isVOPC() ||
|
||||
instr->opcode == aco_opcode::v_readfirstlane_b32 ||
|
||||
if (instr->isVOPC() || instr->opcode == aco_opcode::v_readfirstlane_b32 ||
|
||||
instr->opcode == aco_opcode::v_readlane_b32 ||
|
||||
instr->opcode == aco_opcode::v_readlane_b32_e64) {
|
||||
check(instr->definitions[0].getTemp().type() == RegType::sgpr,
|
||||
|
@ -277,45 +287,42 @@ bool validate_ir(Program* program)
|
|||
|
||||
unsigned num_sgprs = 0;
|
||||
unsigned sgpr[] = {0, 0};
|
||||
for (unsigned i = 0; i < instr->operands.size(); i++)
|
||||
{
|
||||
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
||||
Operand op = instr->operands[i];
|
||||
if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
|
||||
instr->opcode == aco_opcode::v_readlane_b32 ||
|
||||
instr->opcode == aco_opcode::v_readlane_b32_e64) {
|
||||
check(i != 1 ||
|
||||
(op.isTemp() && op.regClass().type() == RegType::sgpr) ||
|
||||
op.isConstant(),
|
||||
check(i != 1 || (op.isTemp() && op.regClass().type() == RegType::sgpr) ||
|
||||
op.isConstant(),
|
||||
"Must be a SGPR or a constant", instr.get());
|
||||
check(i == 1 ||
|
||||
(op.isTemp() && op.regClass().type() == RegType::vgpr && op.bytes() <= 4),
|
||||
check(i == 1 || (op.isTemp() && op.regClass().type() == RegType::vgpr &&
|
||||
op.bytes() <= 4),
|
||||
"Wrong Operand type for VALU instruction", instr.get());
|
||||
continue;
|
||||
}
|
||||
if (instr->opcode == aco_opcode::v_permlane16_b32 ||
|
||||
instr->opcode == aco_opcode::v_permlanex16_b32) {
|
||||
check(i != 0 ||
|
||||
(op.isTemp() && op.regClass().type() == RegType::vgpr),
|
||||
check(i != 0 || (op.isTemp() && op.regClass().type() == RegType::vgpr),
|
||||
"Operand 0 of v_permlane must be VGPR", instr.get());
|
||||
check(i == 0 ||
|
||||
(op.isTemp() && op.regClass().type() == RegType::sgpr) ||
|
||||
op.isConstant(),
|
||||
"Lane select operands of v_permlane must be SGPR or constant", instr.get());
|
||||
check(i == 0 || (op.isTemp() && op.regClass().type() == RegType::sgpr) ||
|
||||
op.isConstant(),
|
||||
"Lane select operands of v_permlane must be SGPR or constant",
|
||||
instr.get());
|
||||
}
|
||||
|
||||
if (instr->opcode == aco_opcode::v_writelane_b32 ||
|
||||
instr->opcode == aco_opcode::v_writelane_b32_e64) {
|
||||
check(i != 2 ||
|
||||
(op.isTemp() && op.regClass().type() == RegType::vgpr && op.bytes() <= 4),
|
||||
check(i != 2 || (op.isTemp() && op.regClass().type() == RegType::vgpr &&
|
||||
op.bytes() <= 4),
|
||||
"Wrong Operand type for VALU instruction", instr.get());
|
||||
check(i == 2 ||
|
||||
(op.isTemp() && op.regClass().type() == RegType::sgpr) ||
|
||||
op.isConstant(),
|
||||
check(i == 2 || (op.isTemp() && op.regClass().type() == RegType::sgpr) ||
|
||||
op.isConstant(),
|
||||
"Must be a SGPR or a constant", instr.get());
|
||||
continue;
|
||||
}
|
||||
if (op.isTemp() && instr->operands[i].regClass().type() == RegType::sgpr) {
|
||||
check(scalar_mask & (1 << i), "Wrong source position for SGPR argument", instr.get());
|
||||
check(scalar_mask & (1 << i), "Wrong source position for SGPR argument",
|
||||
instr.get());
|
||||
|
||||
if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
|
||||
if (num_sgprs < 2)
|
||||
|
@ -324,19 +331,22 @@ bool validate_ir(Program* program)
|
|||
}
|
||||
|
||||
if (op.isConstant() && !op.isLiteral())
|
||||
check(scalar_mask & (1 << i), "Wrong source position for constant argument", instr.get());
|
||||
check(scalar_mask & (1 << i), "Wrong source position for constant argument",
|
||||
instr.get());
|
||||
}
|
||||
check(num_sgprs + (literal.isUndefined() ? 0 : 1) <= const_bus_limit, "Too many SGPRs/literals", instr.get());
|
||||
check(num_sgprs + (literal.isUndefined() ? 0 : 1) <= const_bus_limit,
|
||||
"Too many SGPRs/literals", instr.get());
|
||||
}
|
||||
|
||||
if (instr->isSOP1() || instr->isSOP2()) {
|
||||
check(instr->definitions[0].getTemp().type() == RegType::sgpr, "Wrong Definition type for SALU instruction", instr.get());
|
||||
check(instr->definitions[0].getTemp().type() == RegType::sgpr,
|
||||
"Wrong Definition type for SALU instruction", instr.get());
|
||||
for (const Operand& op : instr->operands) {
|
||||
check(op.isConstant() || op.regClass().type() <= RegType::sgpr,
|
||||
"Wrong Operand type for SALU instruction", instr.get());
|
||||
check(op.isConstant() || op.regClass().type() <= RegType::sgpr,
|
||||
"Wrong Operand type for SALU instruction", instr.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch (instr->format) {
|
||||
case Format::PSEUDO: {
|
||||
|
@ -346,7 +356,8 @@ bool validate_ir(Program* program)
|
|||
check(op.bytes() < 4 || size % 4 == 0, "Operand is not aligned", instr.get());
|
||||
size += op.bytes();
|
||||
}
|
||||
check(size == instr->definitions[0].bytes(), "Definition size does not match operand sizes", instr.get());
|
||||
check(size == instr->definitions[0].bytes(),
|
||||
"Definition size does not match operand sizes", instr.get());
|
||||
if (instr->definitions[0].getTemp().type() == RegType::sgpr) {
|
||||
for (const Operand& op : instr->operands) {
|
||||
check(op.isConstant() || op.regClass().type() == RegType::sgpr,
|
||||
|
@ -354,55 +365,75 @@ bool validate_ir(Program* program)
|
|||
}
|
||||
}
|
||||
} else if (instr->opcode == aco_opcode::p_extract_vector) {
|
||||
check((instr->operands[0].isTemp()) && instr->operands[1].isConstant(), "Wrong Operand types", instr.get());
|
||||
check((instr->operands[1].constantValue() + 1) * instr->definitions[0].bytes() <= instr->operands[0].bytes(), "Index out of range", instr.get());
|
||||
check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->operands[0].regClass().type() == RegType::sgpr,
|
||||
check((instr->operands[0].isTemp()) && instr->operands[1].isConstant(),
|
||||
"Wrong Operand types", instr.get());
|
||||
check((instr->operands[1].constantValue() + 1) * instr->definitions[0].bytes() <=
|
||||
instr->operands[0].bytes(),
|
||||
"Index out of range", instr.get());
|
||||
check(instr->definitions[0].getTemp().type() == RegType::vgpr ||
|
||||
instr->operands[0].regClass().type() == RegType::sgpr,
|
||||
"Cannot extract SGPR value from VGPR vector", instr.get());
|
||||
check(program->chip_class >= GFX9 || !instr->definitions[0].regClass().is_subdword() ||
|
||||
instr->operands[0].regClass().type() == RegType::vgpr, "Cannot extract subdword from SGPR before GFX9+", instr.get());
|
||||
check(program->chip_class >= GFX9 ||
|
||||
!instr->definitions[0].regClass().is_subdword() ||
|
||||
instr->operands[0].regClass().type() == RegType::vgpr,
|
||||
"Cannot extract subdword from SGPR before GFX9+", instr.get());
|
||||
} else if (instr->opcode == aco_opcode::p_split_vector) {
|
||||
check(instr->operands[0].isTemp(), "Operand must be a temporary", instr.get());
|
||||
unsigned size = 0;
|
||||
for (const Definition& def : instr->definitions) {
|
||||
size += def.bytes();
|
||||
}
|
||||
check(size == instr->operands[0].bytes(), "Operand size does not match definition sizes", instr.get());
|
||||
check(size == instr->operands[0].bytes(),
|
||||
"Operand size does not match definition sizes", instr.get());
|
||||
if (instr->operands[0].getTemp().type() == RegType::vgpr) {
|
||||
for (const Definition& def : instr->definitions)
|
||||
check(def.regClass().type() == RegType::vgpr, "Wrong Definition type for VGPR split_vector", instr.get());
|
||||
check(def.regClass().type() == RegType::vgpr,
|
||||
"Wrong Definition type for VGPR split_vector", instr.get());
|
||||
} else {
|
||||
for (const Definition& def : instr->definitions)
|
||||
check(program->chip_class >= GFX9 || !def.regClass().is_subdword(), "Cannot split SGPR into subdword VGPRs before GFX9+", instr.get());
|
||||
check(program->chip_class >= GFX9 || !def.regClass().is_subdword(),
|
||||
"Cannot split SGPR into subdword VGPRs before GFX9+", instr.get());
|
||||
}
|
||||
} else if (instr->opcode == aco_opcode::p_parallelcopy) {
|
||||
check(instr->definitions.size() == instr->operands.size(), "Number of Operands does not match number of Definitions", instr.get());
|
||||
check(instr->definitions.size() == instr->operands.size(),
|
||||
"Number of Operands does not match number of Definitions", instr.get());
|
||||
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
||||
check(instr->definitions[i].bytes() == instr->operands[i].bytes(), "Operand and Definition size must match", instr.get());
|
||||
check(instr->definitions[i].bytes() == instr->operands[i].bytes(),
|
||||
"Operand and Definition size must match", instr.get());
|
||||
if (instr->operands[i].isTemp())
|
||||
check((instr->definitions[i].getTemp().type() == instr->operands[i].regClass().type()) ||
|
||||
(instr->definitions[i].getTemp().type() == RegType::vgpr && instr->operands[i].regClass().type() == RegType::sgpr),
|
||||
check((instr->definitions[i].getTemp().type() ==
|
||||
instr->operands[i].regClass().type()) ||
|
||||
(instr->definitions[i].getTemp().type() == RegType::vgpr &&
|
||||
instr->operands[i].regClass().type() == RegType::sgpr),
|
||||
"Operand and Definition types do not match", instr.get());
|
||||
}
|
||||
} else if (instr->opcode == aco_opcode::p_phi) {
|
||||
check(instr->operands.size() == block.logical_preds.size(), "Number of Operands does not match number of predecessors", instr.get());
|
||||
check(instr->definitions[0].getTemp().type() == RegType::vgpr, "Logical Phi Definition must be vgpr", instr.get());
|
||||
check(instr->operands.size() == block.logical_preds.size(),
|
||||
"Number of Operands does not match number of predecessors", instr.get());
|
||||
check(instr->definitions[0].getTemp().type() == RegType::vgpr,
|
||||
"Logical Phi Definition must be vgpr", instr.get());
|
||||
for (const Operand& op : instr->operands)
|
||||
check(instr->definitions[0].size() == op.size(), "Operand sizes must match Definition size", instr.get());
|
||||
check(instr->definitions[0].size() == op.size(),
|
||||
"Operand sizes must match Definition size", instr.get());
|
||||
} else if (instr->opcode == aco_opcode::p_linear_phi) {
|
||||
for (const Operand& op : instr->operands) {
|
||||
check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type", instr.get());
|
||||
check(instr->definitions[0].size() == op.size(), "Operand sizes must match Definition size", instr.get());
|
||||
check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type",
|
||||
instr.get());
|
||||
check(instr->definitions[0].size() == op.size(),
|
||||
"Operand sizes must match Definition size", instr.get());
|
||||
}
|
||||
check(instr->operands.size() == block.linear_preds.size(), "Number of Operands does not match number of predecessors", instr.get());
|
||||
} else if (instr->opcode == aco_opcode::p_extract || instr->opcode == aco_opcode::p_insert) {
|
||||
check(instr->operands[0].isTemp(),
|
||||
"Data operand must be temporary", instr.get());
|
||||
check(instr->operands.size() == block.linear_preds.size(),
|
||||
"Number of Operands does not match number of predecessors", instr.get());
|
||||
} else if (instr->opcode == aco_opcode::p_extract ||
|
||||
instr->opcode == aco_opcode::p_insert) {
|
||||
check(instr->operands[0].isTemp(), "Data operand must be temporary", instr.get());
|
||||
check(instr->operands[1].isConstant(), "Index must be constant", instr.get());
|
||||
if (instr->opcode == aco_opcode::p_extract)
|
||||
check(instr->operands[3].isConstant(), "Sign-extend flag must be constant", instr.get());
|
||||
check(instr->operands[3].isConstant(), "Sign-extend flag must be constant",
|
||||
instr.get());
|
||||
|
||||
check(instr->definitions[0].getTemp().type() != RegType::sgpr ||
|
||||
instr->operands[0].getTemp().type() == RegType::sgpr,
|
||||
instr->operands[0].getTemp().type() == RegType::sgpr,
|
||||
"Can't extract/insert VGPR to SGPR", instr.get());
|
||||
|
||||
if (instr->operands[0].getTemp().type() == RegType::vgpr)
|
||||
|
@ -410,69 +441,106 @@ bool validate_ir(Program* program)
|
|||
"Sizes of operand and definition must match", instr.get());
|
||||
|
||||
if (instr->definitions[0].getTemp().type() == RegType::sgpr)
|
||||
check(instr->definitions.size() >= 2 && instr->definitions[1].isFixed() && instr->definitions[1].physReg() == scc, "SGPR extract/insert needs a SCC definition", instr.get());
|
||||
check(instr->definitions.size() >= 2 && instr->definitions[1].isFixed() &&
|
||||
instr->definitions[1].physReg() == scc,
|
||||
"SGPR extract/insert needs a SCC definition", instr.get());
|
||||
|
||||
check(instr->operands[2].constantEquals(8) || instr->operands[2].constantEquals(16), "Size must be 8 or 16", instr.get());
|
||||
check(instr->operands[2].constantValue() < instr->operands[0].getTemp().bytes() * 8u, "Size must be smaller than source", instr.get());
|
||||
check(instr->operands[2].constantEquals(8) || instr->operands[2].constantEquals(16),
|
||||
"Size must be 8 or 16", instr.get());
|
||||
check(instr->operands[2].constantValue() < instr->operands[0].getTemp().bytes() * 8u,
|
||||
"Size must be smaller than source", instr.get());
|
||||
|
||||
unsigned comp = instr->operands[0].bytes() * 8u / MAX2(instr->operands[2].constantValue(), 1);
|
||||
check(instr->operands[1].constantValue() < comp, "Index must be in-bounds", instr.get());
|
||||
unsigned comp =
|
||||
instr->operands[0].bytes() * 8u / MAX2(instr->operands[2].constantValue(), 1);
|
||||
check(instr->operands[1].constantValue() < comp, "Index must be in-bounds",
|
||||
instr.get());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case Format::PSEUDO_REDUCTION: {
|
||||
for (const Operand &op : instr->operands)
|
||||
check(op.regClass().type() == RegType::vgpr, "All operands of PSEUDO_REDUCTION instructions must be in VGPRs.", instr.get());
|
||||
for (const Operand& op : instr->operands)
|
||||
check(op.regClass().type() == RegType::vgpr,
|
||||
"All operands of PSEUDO_REDUCTION instructions must be in VGPRs.",
|
||||
instr.get());
|
||||
|
||||
if (instr->opcode == aco_opcode::p_reduce && instr->reduction().cluster_size == program->wave_size)
|
||||
check(instr->definitions[0].regClass().type() == RegType::sgpr || program->wave_size == 32, "The result of unclustered reductions must go into an SGPR.", instr.get());
|
||||
if (instr->opcode == aco_opcode::p_reduce &&
|
||||
instr->reduction().cluster_size == program->wave_size)
|
||||
check(instr->definitions[0].regClass().type() == RegType::sgpr ||
|
||||
program->wave_size == 32,
|
||||
"The result of unclustered reductions must go into an SGPR.", instr.get());
|
||||
else
|
||||
check(instr->definitions[0].regClass().type() == RegType::vgpr, "The result of scans and clustered reductions must go into a VGPR.", instr.get());
|
||||
check(instr->definitions[0].regClass().type() == RegType::vgpr,
|
||||
"The result of scans and clustered reductions must go into a VGPR.",
|
||||
instr.get());
|
||||
|
||||
break;
|
||||
}
|
||||
case Format::SMEM: {
|
||||
if (instr->operands.size() >= 1)
|
||||
check((instr->operands[0].isFixed() && !instr->operands[0].isConstant()) ||
|
||||
(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr), "SMEM operands must be sgpr", instr.get());
|
||||
(instr->operands[0].isTemp() &&
|
||||
instr->operands[0].regClass().type() == RegType::sgpr),
|
||||
"SMEM operands must be sgpr", instr.get());
|
||||
if (instr->operands.size() >= 2)
|
||||
check(instr->operands[1].isConstant() || (instr->operands[1].isTemp() && instr->operands[1].regClass().type() == RegType::sgpr),
|
||||
check(instr->operands[1].isConstant() ||
|
||||
(instr->operands[1].isTemp() &&
|
||||
instr->operands[1].regClass().type() == RegType::sgpr),
|
||||
"SMEM offset must be constant or sgpr", instr.get());
|
||||
if (!instr->definitions.empty())
|
||||
check(instr->definitions[0].getTemp().type() == RegType::sgpr, "SMEM result must be sgpr", instr.get());
|
||||
check(instr->definitions[0].getTemp().type() == RegType::sgpr,
|
||||
"SMEM result must be sgpr", instr.get());
|
||||
break;
|
||||
}
|
||||
case Format::MTBUF:
|
||||
case Format::MUBUF: {
|
||||
check(instr->operands.size() > 1, "VMEM instructions must have at least one operand", instr.get());
|
||||
check(instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::vgpr,
|
||||
check(instr->operands.size() > 1, "VMEM instructions must have at least one operand",
|
||||
instr.get());
|
||||
check(instr->operands[1].hasRegClass() &&
|
||||
instr->operands[1].regClass().type() == RegType::vgpr,
|
||||
"VADDR must be in vgpr for VMEM instructions", instr.get());
|
||||
check(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr, "VMEM resource constant must be sgpr", instr.get());
|
||||
check(instr->operands.size() < 4 || (instr->operands[3].isTemp() && instr->operands[3].regClass().type() == RegType::vgpr), "VMEM write data must be vgpr", instr.get());
|
||||
check(
|
||||
instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::sgpr,
|
||||
"VMEM resource constant must be sgpr", instr.get());
|
||||
check(instr->operands.size() < 4 ||
|
||||
(instr->operands[3].isTemp() &&
|
||||
instr->operands[3].regClass().type() == RegType::vgpr),
|
||||
"VMEM write data must be vgpr", instr.get());
|
||||
break;
|
||||
}
|
||||
case Format::MIMG: {
|
||||
check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands", instr.get());
|
||||
check(instr->operands[0].hasRegClass() && (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
|
||||
check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands",
|
||||
instr.get());
|
||||
check(instr->operands[0].hasRegClass() &&
|
||||
(instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
|
||||
"MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get());
|
||||
if (instr->operands[1].hasRegClass())
|
||||
check(instr->operands[1].regClass() == s4, "MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get());
|
||||
check(instr->operands[1].regClass() == s4,
|
||||
"MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get());
|
||||
if (!instr->operands[2].isUndefined()) {
|
||||
bool is_cmpswap = instr->opcode == aco_opcode::image_atomic_cmpswap ||
|
||||
instr->opcode == aco_opcode::image_atomic_fcmpswap;
|
||||
check(instr->definitions.empty() || (instr->definitions[0].regClass() == instr->operands[2].regClass() || is_cmpswap),
|
||||
"MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and TFE/LWE loads", instr.get());
|
||||
check(instr->definitions.empty() ||
|
||||
(instr->definitions[0].regClass() == instr->operands[2].regClass() ||
|
||||
is_cmpswap),
|
||||
"MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and "
|
||||
"TFE/LWE loads",
|
||||
instr.get());
|
||||
}
|
||||
check(instr->operands.size() == 4 || program->chip_class >= GFX10, "NSA is only supported on GFX10+", instr.get());
|
||||
check(instr->operands.size() == 4 || program->chip_class >= GFX10,
|
||||
"NSA is only supported on GFX10+", instr.get());
|
||||
for (unsigned i = 3; i < instr->operands.size(); i++) {
|
||||
if (instr->operands.size() == 4) {
|
||||
check(instr->operands[i].hasRegClass() && instr->operands[i].regClass().type() == RegType::vgpr,
|
||||
check(instr->operands[i].hasRegClass() &&
|
||||
instr->operands[i].regClass().type() == RegType::vgpr,
|
||||
"MIMG operands[3] (VADDR) must be VGPR", instr.get());
|
||||
} else {
|
||||
check(instr->operands[i].regClass() == v1, "MIMG VADDR must be v1 if NSA is used", instr.get());
|
||||
check(instr->operands[i].regClass() == v1, "MIMG VADDR must be v1 if NSA is used",
|
||||
instr.get());
|
||||
}
|
||||
}
|
||||
check(instr->definitions.empty() || (instr->definitions[0].isTemp() && instr->definitions[0].regClass().type() == RegType::vgpr),
|
||||
check(instr->definitions.empty() ||
|
||||
(instr->definitions[0].isTemp() &&
|
||||
instr->definitions[0].regClass().type() == RegType::vgpr),
|
||||
"MIMG definitions[0] (VDATA) must be VGPR", instr.get());
|
||||
break;
|
||||
}
|
||||
|
@ -482,31 +550,38 @@ bool validate_ir(Program* program)
|
|||
"Only VGPRs are valid DS instruction operands", instr.get());
|
||||
}
|
||||
if (!instr->definitions.empty())
|
||||
check(instr->definitions[0].getTemp().type() == RegType::vgpr, "DS instruction must return VGPR", instr.get());
|
||||
check(instr->definitions[0].getTemp().type() == RegType::vgpr,
|
||||
"DS instruction must return VGPR", instr.get());
|
||||
break;
|
||||
}
|
||||
case Format::EXP: {
|
||||
for (unsigned i = 0; i < 4; i++)
|
||||
check(instr->operands[i].hasRegClass() && instr->operands[i].regClass().type() == RegType::vgpr,
|
||||
check(instr->operands[i].hasRegClass() &&
|
||||
instr->operands[i].regClass().type() == RegType::vgpr,
|
||||
"Only VGPRs are valid Export arguments", instr.get());
|
||||
break;
|
||||
}
|
||||
case Format::FLAT:
|
||||
check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR", instr.get());
|
||||
check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR",
|
||||
instr.get());
|
||||
FALLTHROUGH;
|
||||
case Format::GLOBAL:
|
||||
case Format::SCRATCH: {
|
||||
check(instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH address must be vgpr", instr.get());
|
||||
check(instr->operands[1].hasRegClass() && instr->operands[1].regClass().type() == RegType::sgpr,
|
||||
check(
|
||||
instr->operands[0].isTemp() && instr->operands[0].regClass().type() == RegType::vgpr,
|
||||
"FLAT/GLOBAL/SCRATCH address must be vgpr", instr.get());
|
||||
check(instr->operands[1].hasRegClass() &&
|
||||
instr->operands[1].regClass().type() == RegType::sgpr,
|
||||
"FLAT/GLOBAL/SCRATCH sgpr address must be undefined or sgpr", instr.get());
|
||||
if (!instr->definitions.empty())
|
||||
check(instr->definitions[0].getTemp().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get());
|
||||
check(instr->definitions[0].getTemp().type() == RegType::vgpr,
|
||||
"FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get());
|
||||
else
|
||||
check(instr->operands[2].regClass().type() == RegType::vgpr, "FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get());
|
||||
check(instr->operands[2].regClass().type() == RegType::vgpr,
|
||||
"FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get());
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -518,20 +593,26 @@ bool validate_ir(Program* program)
|
|||
|
||||
/* predecessors/successors should be sorted */
|
||||
for (unsigned j = 0; j + 1 < block.linear_preds.size(); j++)
|
||||
check_block(block.linear_preds[j] < block.linear_preds[j + 1], "linear predecessors must be sorted", &block);
|
||||
check_block(block.linear_preds[j] < block.linear_preds[j + 1],
|
||||
"linear predecessors must be sorted", &block);
|
||||
for (unsigned j = 0; j + 1 < block.logical_preds.size(); j++)
|
||||
check_block(block.logical_preds[j] < block.logical_preds[j + 1], "logical predecessors must be sorted", &block);
|
||||
check_block(block.logical_preds[j] < block.logical_preds[j + 1],
|
||||
"logical predecessors must be sorted", &block);
|
||||
for (unsigned j = 0; j + 1 < block.linear_succs.size(); j++)
|
||||
check_block(block.linear_succs[j] < block.linear_succs[j + 1], "linear successors must be sorted", &block);
|
||||
check_block(block.linear_succs[j] < block.linear_succs[j + 1],
|
||||
"linear successors must be sorted", &block);
|
||||
for (unsigned j = 0; j + 1 < block.logical_succs.size(); j++)
|
||||
check_block(block.logical_succs[j] < block.logical_succs[j + 1], "logical successors must be sorted", &block);
|
||||
check_block(block.logical_succs[j] < block.logical_succs[j + 1],
|
||||
"logical successors must be sorted", &block);
|
||||
|
||||
/* critical edges are not allowed */
|
||||
if (block.linear_preds.size() > 1) {
|
||||
for (unsigned pred : block.linear_preds)
|
||||
check_block(program->blocks[pred].linear_succs.size() == 1, "linear critical edges are not allowed", &program->blocks[pred]);
|
||||
check_block(program->blocks[pred].linear_succs.size() == 1,
|
||||
"linear critical edges are not allowed", &program->blocks[pred]);
|
||||
for (unsigned pred : block.logical_preds)
|
||||
check_block(program->blocks[pred].logical_succs.size() == 1, "logical critical edges are not allowed", &program->blocks[pred]);
|
||||
check_block(program->blocks[pred].logical_succs.size() == 1,
|
||||
"logical critical edges are not allowed", &program->blocks[pred]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -544,8 +625,8 @@ namespace {
|
|||
struct Location {
|
||||
Location() : block(NULL), instr(NULL) {}
|
||||
|
||||
Block *block;
|
||||
Instruction *instr; //NULL if it's the block's live-in
|
||||
Block* block;
|
||||
Instruction* instr; // NULL if it's the block's live-in
|
||||
};
|
||||
|
||||
struct Assignment {
|
||||
|
@ -554,18 +635,20 @@ struct Assignment {
|
|||
PhysReg reg;
|
||||
};
|
||||
|
||||
bool ra_fail(Program *program, Location loc, Location loc2, const char *fmt, ...) {
|
||||
bool
|
||||
ra_fail(Program* program, Location loc, Location loc2, const char* fmt, ...)
|
||||
{
|
||||
va_list args;
|
||||
va_start(args, fmt);
|
||||
char msg[1024];
|
||||
vsprintf(msg, fmt, args);
|
||||
va_end(args);
|
||||
|
||||
char *out;
|
||||
char* out;
|
||||
size_t outsize;
|
||||
struct u_memstream mem;
|
||||
u_memstream_open(&mem, &out, &outsize);
|
||||
FILE *const memf = u_memstream_get(&mem);
|
||||
FILE* const memf = u_memstream_get(&mem);
|
||||
|
||||
fprintf(memf, "RA error found at instruction in BB%d:\n", loc.block->index);
|
||||
if (loc.instr) {
|
||||
|
@ -587,7 +670,8 @@ bool ra_fail(Program *program, Location loc, Location loc2, const char *fmt, ...
|
|||
return true;
|
||||
}
|
||||
|
||||
bool validate_subdword_operand(chip_class chip, const aco_ptr<Instruction>& instr, unsigned index)
|
||||
bool
|
||||
validate_subdword_operand(chip_class chip, const aco_ptr<Instruction>& instr, unsigned index)
|
||||
{
|
||||
Operand op = instr->operands[index];
|
||||
unsigned byte = op.physReg().byte();
|
||||
|
@ -635,14 +719,14 @@ bool validate_subdword_operand(chip_class chip, const aco_ptr<Instruction>& inst
|
|||
if (byte == 2 && index == 2)
|
||||
return true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
default: break;
|
||||
}
|
||||
|
||||
return byte == 0;
|
||||
}
|
||||
|
||||
bool validate_subdword_definition(chip_class chip, const aco_ptr<Instruction>& instr)
|
||||
bool
|
||||
validate_subdword_definition(chip_class chip, const aco_ptr<Instruction>& instr)
|
||||
{
|
||||
Definition def = instr->definitions[0];
|
||||
unsigned byte = def.physReg().byte();
|
||||
|
@ -664,16 +748,15 @@ bool validate_subdword_definition(chip_class chip, const aco_ptr<Instruction>& i
|
|||
case aco_opcode::global_load_ubyte_d16_hi:
|
||||
case aco_opcode::global_load_short_d16_hi:
|
||||
case aco_opcode::ds_read_u8_d16_hi:
|
||||
case aco_opcode::ds_read_u16_d16_hi:
|
||||
return byte == 2;
|
||||
default:
|
||||
break;
|
||||
case aco_opcode::ds_read_u16_d16_hi: return byte == 2;
|
||||
default: break;
|
||||
}
|
||||
|
||||
return byte == 0;
|
||||
}
|
||||
|
||||
unsigned get_subdword_bytes_written(Program *program, const aco_ptr<Instruction>& instr, unsigned index)
|
||||
unsigned
|
||||
get_subdword_bytes_written(Program* program, const aco_ptr<Instruction>& instr, unsigned index)
|
||||
{
|
||||
chip_class chip = program->chip_class;
|
||||
Definition def = instr->definitions[index];
|
||||
|
@ -703,8 +786,7 @@ unsigned get_subdword_bytes_written(Program *program, const aco_ptr<Instruction>
|
|||
case aco_opcode::global_load_ubyte_d16_hi:
|
||||
case aco_opcode::global_load_short_d16_hi:
|
||||
case aco_opcode::ds_read_u8_d16_hi:
|
||||
case aco_opcode::ds_read_u16_d16_hi:
|
||||
return program->dev.sram_ecc_enabled ? 4 : 2;
|
||||
case aco_opcode::ds_read_u16_d16_hi: return program->dev.sram_ecc_enabled ? 4 : 2;
|
||||
case aco_opcode::v_mad_f16:
|
||||
case aco_opcode::v_mad_u16:
|
||||
case aco_opcode::v_mad_i16:
|
||||
|
@ -714,16 +796,18 @@ unsigned get_subdword_bytes_written(Program *program, const aco_ptr<Instruction>
|
|||
if (chip >= GFX9)
|
||||
return 2;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
default: break;
|
||||
}
|
||||
|
||||
return MAX2(chip >= GFX10 ? def.bytes() : 4, instr_info.definition_size[(int)instr->opcode] / 8u);
|
||||
return MAX2(chip >= GFX10 ? def.bytes() : 4,
|
||||
instr_info.definition_size[(int)instr->opcode] / 8u);
|
||||
}
|
||||
|
||||
} /* end namespace */
|
||||
|
||||
bool validate_ra(Program *program) {
|
||||
bool
|
||||
validate_ra(Program* program)
|
||||
{
|
||||
if (!(debug_flags & DEBUG_VALIDATE_RA))
|
||||
return false;
|
||||
|
||||
|
@ -754,13 +838,21 @@ bool validate_ra(Program *program) {
|
|||
if (!op.isFixed())
|
||||
err |= ra_fail(program, loc, Location(), "Operand %d is not assigned a register", i);
|
||||
if (assignments.count(op.tempId()) && assignments[op.tempId()].reg != op.physReg())
|
||||
err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an inconsistent register assignment with instruction", i);
|
||||
if ((op.getTemp().type() == RegType::vgpr && op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
|
||||
(op.getTemp().type() == RegType::sgpr && op.physReg() + op.size() > program->config->num_sgprs && op.physReg() < sgpr_limit))
|
||||
err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc, "Operand %d has an out-of-bounds register assignment", i);
|
||||
err |=
|
||||
ra_fail(program, loc, assignments.at(op.tempId()).firstloc,
|
||||
"Operand %d has an inconsistent register assignment with instruction", i);
|
||||
if ((op.getTemp().type() == RegType::vgpr &&
|
||||
op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
|
||||
(op.getTemp().type() == RegType::sgpr &&
|
||||
op.physReg() + op.size() > program->config->num_sgprs &&
|
||||
op.physReg() < sgpr_limit))
|
||||
err |= ra_fail(program, loc, assignments.at(op.tempId()).firstloc,
|
||||
"Operand %d has an out-of-bounds register assignment", i);
|
||||
if (op.physReg() == vcc && !program->needs_vcc)
|
||||
err |= ra_fail(program, loc, Location(), "Operand %d fixed to vcc but needs_vcc=false", i);
|
||||
if (op.regClass().is_subdword() && !validate_subdword_operand(program->chip_class, instr, i))
|
||||
err |= ra_fail(program, loc, Location(),
|
||||
"Operand %d fixed to vcc but needs_vcc=false", i);
|
||||
if (op.regClass().is_subdword() &&
|
||||
!validate_subdword_operand(program->chip_class, instr, i))
|
||||
err |= ra_fail(program, loc, Location(), "Operand %d not aligned correctly", i);
|
||||
if (!assignments[op.tempId()].firstloc.block)
|
||||
assignments[op.tempId()].firstloc = loc;
|
||||
|
@ -773,15 +865,23 @@ bool validate_ra(Program *program) {
|
|||
if (!def.isTemp())
|
||||
continue;
|
||||
if (!def.isFixed())
|
||||
err |= ra_fail(program, loc, Location(), "Definition %d is not assigned a register", i);
|
||||
err |=
|
||||
ra_fail(program, loc, Location(), "Definition %d is not assigned a register", i);
|
||||
if (assignments[def.tempId()].defloc.block)
|
||||
err |= ra_fail(program, loc, assignments.at(def.tempId()).defloc, "Temporary %%%d also defined by instruction", def.tempId());
|
||||
if ((def.getTemp().type() == RegType::vgpr && def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
|
||||
(def.getTemp().type() == RegType::sgpr && def.physReg() + def.size() > program->config->num_sgprs && def.physReg() < sgpr_limit))
|
||||
err |= ra_fail(program, loc, assignments.at(def.tempId()).firstloc, "Definition %d has an out-of-bounds register assignment", i);
|
||||
err |= ra_fail(program, loc, assignments.at(def.tempId()).defloc,
|
||||
"Temporary %%%d also defined by instruction", def.tempId());
|
||||
if ((def.getTemp().type() == RegType::vgpr &&
|
||||
def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
|
||||
(def.getTemp().type() == RegType::sgpr &&
|
||||
def.physReg() + def.size() > program->config->num_sgprs &&
|
||||
def.physReg() < sgpr_limit))
|
||||
err |= ra_fail(program, loc, assignments.at(def.tempId()).firstloc,
|
||||
"Definition %d has an out-of-bounds register assignment", i);
|
||||
if (def.physReg() == vcc && !program->needs_vcc)
|
||||
err |= ra_fail(program, loc, Location(), "Definition %d fixed to vcc but needs_vcc=false", i);
|
||||
if (def.regClass().is_subdword() && !validate_subdword_definition(program->chip_class, instr))
|
||||
err |= ra_fail(program, loc, Location(),
|
||||
"Definition %d fixed to vcc but needs_vcc=false", i);
|
||||
if (def.regClass().is_subdword() &&
|
||||
!validate_subdword_definition(program->chip_class, instr))
|
||||
err |= ra_fail(program, loc, Location(), "Definition %d not aligned correctly", i);
|
||||
if (!assignments[def.tempId()].firstloc.block)
|
||||
assignments[def.tempId()].firstloc = loc;
|
||||
|
@ -810,7 +910,9 @@ bool validate_ra(Program *program) {
|
|||
PhysReg reg = assignments.at(tmp.id()).reg;
|
||||
for (unsigned i = 0; i < tmp.bytes(); i++) {
|
||||
if (regs[reg.reg_b + i]) {
|
||||
err |= ra_fail(program, loc, Location(), "Assignment of element %d of %%%d already taken by %%%d in live-out", i, tmp.id(), regs[reg.reg_b + i]);
|
||||
err |= ra_fail(program, loc, Location(),
|
||||
"Assignment of element %d of %%%d already taken by %%%d in live-out",
|
||||
i, tmp.id(), regs[reg.reg_b + i]);
|
||||
}
|
||||
regs[reg.reg_b + i] = tmp.id();
|
||||
}
|
||||
|
@ -826,7 +928,10 @@ bool validate_ra(Program *program) {
|
|||
PhysReg reg = assignments.at(tmp.id()).reg;
|
||||
for (unsigned i = 0; i < tmp.bytes(); i++) {
|
||||
if (regs[reg.reg_b + i])
|
||||
err |= ra_fail(program, loc, Location(), "Assignment of element %d of %%%d already taken by %%%d in live-out", i, tmp.id(), regs[reg.reg_b + i]);
|
||||
err |= ra_fail(
|
||||
program, loc, Location(),
|
||||
"Assignment of element %d of %%%d already taken by %%%d in live-out", i,
|
||||
tmp.id(), regs[reg.reg_b + i]);
|
||||
}
|
||||
live.emplace(tmp);
|
||||
}
|
||||
|
@ -886,16 +991,23 @@ bool validate_ra(Program *program) {
|
|||
PhysReg reg = assignments.at(tmp.id()).reg;
|
||||
for (unsigned j = 0; j < tmp.bytes(); j++) {
|
||||
if (regs[reg.reg_b + j])
|
||||
err |= ra_fail(program, loc, assignments.at(regs[reg.reg_b + j]).defloc, "Assignment of element %d of %%%d already taken by %%%d from instruction", i, tmp.id(), regs[reg.reg_b + j]);
|
||||
err |= ra_fail(
|
||||
program, loc, assignments.at(regs[reg.reg_b + j]).defloc,
|
||||
"Assignment of element %d of %%%d already taken by %%%d from instruction", i,
|
||||
tmp.id(), regs[reg.reg_b + j]);
|
||||
regs[reg.reg_b + j] = tmp.id();
|
||||
}
|
||||
if (def.regClass().is_subdword() && def.bytes() < 4) {
|
||||
unsigned written = get_subdword_bytes_written(program, instr, i);
|
||||
/* If written=4, the instruction still might write the upper half. In that case, it's the lower half that isn't preserved */
|
||||
/* If written=4, the instruction still might write the upper half. In that case, it's
|
||||
* the lower half that isn't preserved */
|
||||
for (unsigned j = reg.byte() & ~(written - 1); j < written; j++) {
|
||||
unsigned written_reg = reg.reg() * 4u + j;
|
||||
if (regs[written_reg] && regs[written_reg] != def.tempId())
|
||||
err |= ra_fail(program, loc, assignments.at(regs[written_reg]).defloc, "Assignment of element %d of %%%d overwrites the full register taken by %%%d from instruction", i, tmp.id(), regs[written_reg]);
|
||||
err |= ra_fail(program, loc, assignments.at(regs[written_reg]).defloc,
|
||||
"Assignment of element %d of %%%d overwrites the full register "
|
||||
"taken by %%%d from instruction",
|
||||
i, tmp.id(), regs[written_reg]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -924,4 +1036,4 @@ bool validate_ra(Program *program) {
|
|||
|
||||
return err;
|
||||
}
|
||||
}
|
||||
} // namespace aco
|
||||
|
|
Loading…
Reference in New Issue