aco: split read/writelane opcode into VOP2/VOP3 version for SI/CI

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
This commit is contained in:
Daniel Schürmann 2019-11-07 18:02:33 +01:00
parent 23319add93
commit 6a586a6006
9 changed files with 72 additions and 35 deletions

View File

@ -418,6 +418,19 @@ public:
return insert(std::move(sub));
}
/**
 * Emit a lane read using the opcode that matches the target generation.
 *
 * Chip classes older than GFX8 get the VOP2 opcode v_readlane_b32; GFX8 and
 * newer get the VOP3 opcode v_readlane_b32_e64. dst, vsrc and lane are
 * forwarded unchanged, in that order, to the underlying vop2()/vop3() call.
 *
 * Returns whatever the selected vop2()/vop3() call returns.
 */
Result readlane(Definition dst, Op vsrc, Op lane)
{
   const bool pre_gfx8 = program->chip_class < GFX8;
   if (pre_gfx8)
      return vop2(aco_opcode::v_readlane_b32, dst, vsrc, lane);

   return vop3(aco_opcode::v_readlane_b32_e64, dst, vsrc, lane);
}
/**
 * Emit a lane write using the opcode that matches the target generation.
 *
 * Chip classes older than GFX8 get the VOP2 opcode v_writelane_b32; GFX8 and
 * newer get the VOP3 opcode v_writelane_b32_e64. dst, val, lane and vsrc are
 * forwarded unchanged, in that order, to the underlying vop2()/vop3() call.
 *
 * Returns whatever the selected vop2()/vop3() call returns.
 */
Result writelane(Definition dst, Op val, Op lane, Op vsrc)
{
   const bool pre_gfx8 = program->chip_class < GFX8;
   if (pre_gfx8)
      return vop2(aco_opcode::v_writelane_b32, dst, val, lane, vsrc);

   return vop3(aco_opcode::v_writelane_b32_e64, dst, val, lane, vsrc);
}
<%
import itertools
formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(5))) + [(8, 1), (1, 8)]),

View File

@ -110,7 +110,9 @@ bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
return true;
if (instr->isVOP3() && instr->definitions.size() == 2)
return true;
if (instr->opcode == aco_opcode::v_readfirstlane_b32 || instr->opcode == aco_opcode::v_readlane_b32)
if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
instr->opcode == aco_opcode::v_readlane_b32 ||
instr->opcode == aco_opcode::v_readlane_b32_e64)
return true;
return false;
}
@ -285,7 +287,9 @@ int handle_instruction_gfx8_9(NOP_ctx_gfx8_9& ctx, aco_ptr<Instruction>& instr,
switch (instr->opcode) {
case aco_opcode::v_readlane_b32:
case aco_opcode::v_writelane_b32: {
case aco_opcode::v_readlane_b32_e64:
case aco_opcode::v_writelane_b32:
case aco_opcode::v_writelane_b32_e64: {
if (ctx.VALU_wrsgpr + 4 < new_idx)
break;
PhysReg reg = instr->operands[1].physReg();

View File

@ -118,7 +118,9 @@ bool pred_by_exec_mask(aco_ptr<Instruction>& instr) {
}
if (instr->opcode == aco_opcode::v_readlane_b32 ||
instr->opcode == aco_opcode::v_writelane_b32)
instr->opcode == aco_opcode::v_readlane_b32_e64 ||
instr->opcode == aco_opcode::v_writelane_b32 ||
instr->opcode == aco_opcode::v_writelane_b32_e64)
return false;
return true;

View File

@ -163,7 +163,7 @@ Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_ne
static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
{
if (index.regClass() == s1)
return bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), data, index);
return bld.readlane(bld.def(s1), data, index);
Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
@ -6098,14 +6098,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
if (dst.regClass() == v1) {
/* src2 is ignored for writelane. RA assigns the same reg for dst */
emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val, lane, src), dst);
emit_wqm(ctx, bld.writelane(bld.def(v1), val, lane, src), dst);
} else if (dst.regClass() == v2) {
Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
Temp lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_lo, lane, src_hi));
Temp hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_hi, lane, src_hi));
Temp lo = emit_wqm(ctx, bld.writelane(bld.def(v1), val_lo, lane, src_hi));
Temp hi = emit_wqm(ctx, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
emit_split_vector(ctx, dst, 2);
} else {

View File

@ -481,7 +481,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
if (cluster_size == 64) {
for (unsigned i = 0; i < src.size(); i++)
bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
}
} else if (cluster_size == 32) {
@ -519,8 +519,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
if (ctx->program->wave_size == 64) {
/* fill in the gap in row 2 */
for (unsigned i = 0; i < src.size(); i++) {
bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u));
bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1));
}
}
std::swap(tmp, vtmp);
@ -531,8 +531,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
if (ctx->program->chip_class < GFX10)
assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{tmp+i}, v1),
identity[i], Operand(0u));
bld.writelane(Definition(PhysReg{tmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{tmp+i}, v1));
}
}
/* fall through */
@ -562,7 +561,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0u));
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffffffffu));
for (unsigned i = 0; i < src.size(); i++)
bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
}
} else {
@ -581,8 +580,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
if (op == aco_opcode::p_reduce && dst.regClass().type() == RegType::sgpr) {
for (unsigned k = 0; k < src.size(); k++) {
bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{dst.physReg() + k}, s1),
Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1));
bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1),
Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1));
}
} else if (!(dst.physReg() == tmp) && !dst_written) {
for (unsigned k = 0; k < src.size(); k++) {
@ -911,21 +910,20 @@ void lower_to_hw_instr(Program* program)
case aco_opcode::p_spill:
{
assert(instr->operands[0].regClass() == v1.as_linear());
for (unsigned i = 0; i < instr->operands[2].size(); i++) {
bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1, instr->operands[0].physReg()),
Operand(PhysReg{instr->operands[2].physReg() + i}, s1),
Operand(instr->operands[1].constantValue() + i));
}
for (unsigned i = 0; i < instr->operands[2].size(); i++)
bld.writelane(bld.def(v1, instr->operands[0].physReg()),
Operand(PhysReg{instr->operands[2].physReg() + i}, s1),
Operand(instr->operands[1].constantValue() + i),
instr->operands[0]);
break;
}
case aco_opcode::p_reload:
{
assert(instr->operands[0].regClass() == v1.as_linear());
for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
bld.vop3(aco_opcode::v_readlane_b32,
bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
instr->operands[0], Operand(instr->operands[1].constantValue() + i));
}
for (unsigned i = 0; i < instr->definitions[0].size(); i++)
bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
instr->operands[0],
Operand(instr->operands[1].constantValue() + i));
break;
}
case aco_opcode::p_as_uniform:

View File

@ -592,6 +592,8 @@ for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SMEM:
VOP2 = {
# GFX6, GFX7, GFX8, GFX9, GFX10, name, input/output modifiers
(0x00, 0x00, 0x00, 0x00, 0x01, "v_cndmask_b32", False),
(0x01, 0x01, -1, -1, -1, "v_readlane_b32", False),
(0x02, 0x02, -1, -1, -1, "v_writelane_b32", False),
(0x03, 0x03, 0x01, 0x01, 0x03, "v_add_f32", True),
(0x04, 0x04, 0x02, 0x02, 0x04, "v_sub_f32", True),
(0x05, 0x05, 0x03, 0x03, 0x05, "v_subrev_f32", True),
@ -984,8 +986,8 @@ VOP3 = {
( -1, -1, 0x276, 0x276, -1, "v_interp_p2_legacy_f16", True, True),
( -1, -1, -1, 0x277, 0x35a, "v_interp_p2_f16", True, True),
(0x12b, 0x12b, 0x288, 0x288, 0x362, "v_ldexp_f32", False, True),
(0x101, 0x101, 0x289, 0x289, 0x360, "v_readlane_b32", False, False),
(0x102, 0x102, 0x28a, 0x28a, 0x361, "v_writelane_b32", False, False),
( -1, -1, 0x289, 0x289, 0x360, "v_readlane_b32_e64", False, False),
( -1, -1, 0x28a, 0x28a, 0x361, "v_writelane_b32_e64", False, False),
(0x122, 0x122, 0x28b, 0x28b, 0x364, "v_bcnt_u32_b32", False, False),
(0x123, 0x123, 0x28c, 0x28c, 0x365, "v_mbcnt_lo_u32_b32", False, False),
(0x124, 0x124, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32", False, False),

View File

@ -421,14 +421,19 @@ bool can_use_VOP3(aco_ptr<Instruction>& instr)
return instr->opcode != aco_opcode::v_madmk_f32 &&
instr->opcode != aco_opcode::v_madak_f32 &&
instr->opcode != aco_opcode::v_madmk_f16 &&
instr->opcode != aco_opcode::v_madak_f16;
instr->opcode != aco_opcode::v_madak_f16 &&
instr->opcode != aco_opcode::v_readlane_b32 &&
instr->opcode != aco_opcode::v_writelane_b32 &&
instr->opcode != aco_opcode::v_readfirstlane_b32;
}
bool can_apply_sgprs(aco_ptr<Instruction>& instr)
{
return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
instr->opcode != aco_opcode::v_readlane_b32 &&
instr->opcode != aco_opcode::v_writelane_b32;
instr->opcode != aco_opcode::v_readlane_b32_e64 &&
instr->opcode != aco_opcode::v_writelane_b32 &&
instr->opcode != aco_opcode::v_writelane_b32_e64;
}
void to_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr)
@ -458,6 +463,7 @@ bool can_accept_constant(aco_ptr<Instruction>& instr, unsigned operand)
case aco_opcode::v_interp_p2_f32:
case aco_opcode::v_mac_f32:
case aco_opcode::v_writelane_b32:
case aco_opcode::v_writelane_b32_e64:
case aco_opcode::v_cndmask_b32:
return operand != 2;
case aco_opcode::s_addk_i32:
@ -466,6 +472,7 @@ bool can_accept_constant(aco_ptr<Instruction>& instr, unsigned operand)
case aco_opcode::p_extract_vector:
case aco_opcode::p_split_vector:
case aco_opcode::v_readlane_b32:
case aco_opcode::v_readlane_b32_e64:
case aco_opcode::v_readfirstlane_b32:
return operand != 0;
default:
@ -494,7 +501,8 @@ bool valu_can_accept_literal(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned
bool valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
{
if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32)
if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64 ||
instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64)
return operand != 1;
return true;
}
@ -633,7 +641,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
}
if (info.is_constant() && can_accept_constant(instr, i)) {
perfwarn(instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get());
if (i == 0) {
if (i == 0 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) {
instr->operands[i] = Operand(info.val);
continue;
} else if (!instr->isVOP3() && can_swap_operands(instr)) {

View File

@ -1509,7 +1509,8 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
/* handle definitions which must have the same register as an operand */
if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
instr->opcode == aco_opcode::v_mac_f32 ||
instr->opcode == aco_opcode::v_writelane_b32) {
instr->opcode == aco_opcode::v_writelane_b32 ||
instr->opcode == aco_opcode::v_writelane_b32_e64) {
instr->definitions[0].setFixed(instr->operands[2].physReg());
} else if (instr->opcode == aco_opcode::s_addk_i32 ||
instr->opcode == aco_opcode::s_mulk_i32) {

View File

@ -132,12 +132,21 @@ void validate(Program* program, FILE * output)
check(instr->definitions[0].getTemp().type() == RegType::vgpr ||
(int) instr->format & (int) Format::VOPC ||
instr->opcode == aco_opcode::v_readfirstlane_b32 ||
instr->opcode == aco_opcode::v_readlane_b32,
instr->opcode == aco_opcode::v_readlane_b32 ||
instr->opcode == aco_opcode::v_readlane_b32_e64,
"Wrong Definition type for VALU instruction", instr.get());
unsigned num_sgpr = 0;
unsigned sgpr_idx = instr->operands.size();
for (unsigned i = 0; i < instr->operands.size(); i++)
{
for (unsigned i = 0; i < instr->operands.size(); i++) {
if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
instr->opcode == aco_opcode::v_readlane_b32 ||
instr->opcode == aco_opcode::v_readlane_b32_e64 ||
instr->opcode == aco_opcode::v_writelane_b32 ||
instr->opcode == aco_opcode::v_writelane_b32_e64) {
check(!instr->operands[i].isLiteral(), "No literal allowed on VALU instruction", instr.get());
check(i == 1 || (instr->operands[i].isTemp() && instr->operands[i].regClass() == v1), "Wrong Operand type for VALU instruction", instr.get());
continue;
}
if (instr->operands[i].isTemp() && instr->operands[i].regClass().type() == RegType::sgpr) {
check(i != 1 || (int) instr->format & (int) Format::VOP3A, "Wrong source position for SGPR argument", instr.get());