aco: split read/writelane opcode into VOP2/VOP3 version for SI/CI
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
This commit is contained in:
parent
23319add93
commit
6a586a6006
|
@ -418,6 +418,19 @@ public:
|
|||
return insert(std::move(sub));
|
||||
}
|
||||
|
||||
/* Builder helper that emits a readlane instruction, choosing the opcode from
 * the target chip class.
 *
 * dst:  definition that receives the result
 * vsrc: first operand forwarded to the instruction
 * lane: second operand forwarded to the instruction
 *
 * Returns whatever the selected vop3()/vop2() builder call returns.
 */
Result readlane(Definition dst, Op vsrc, Op lane)
|
||||
{
|
||||
/* GFX8 and newer: build v_readlane_b32_e64 through vop3(). */
if (program->chip_class >= GFX8)
|
||||
return vop3(aco_opcode::v_readlane_b32_e64, dst, vsrc, lane);
|
||||
/* Older chip classes: build v_readlane_b32 through vop2(). */
else
|
||||
return vop2(aco_opcode::v_readlane_b32, dst, vsrc, lane);
|
||||
}
|
||||
/* Builder helper that emits a writelane instruction, choosing the opcode from
 * the target chip class.
 *
 * dst:  definition that receives the result
 * val:  first operand forwarded to the instruction
 * lane: second operand forwarded to the instruction
 * vsrc: third operand forwarded to the instruction
 *       NOTE(review): presumably the previous contents of the destination
 *       register, which must end up in the same register as dst - confirm
 *       against the register allocator.
 *
 * Returns whatever the selected vop3()/vop2() builder call returns.
 */
Result writelane(Definition dst, Op val, Op lane, Op vsrc) {
|
||||
/* GFX8 and newer: build v_writelane_b32_e64 through vop3(). */
if (program->chip_class >= GFX8)
|
||||
return vop3(aco_opcode::v_writelane_b32_e64, dst, val, lane, vsrc);
|
||||
/* Older chip classes: build v_writelane_b32 through vop2(); all three
 * operands are still passed. */
else
|
||||
return vop2(aco_opcode::v_writelane_b32, dst, val, lane, vsrc);
|
||||
}
|
||||
<%
|
||||
import itertools
|
||||
formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(5))) + [(8, 1), (1, 8)]),
|
||||
|
|
|
@ -110,7 +110,9 @@ bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
|
|||
return true;
|
||||
if (instr->isVOP3() && instr->definitions.size() == 2)
|
||||
return true;
|
||||
if (instr->opcode == aco_opcode::v_readfirstlane_b32 || instr->opcode == aco_opcode::v_readlane_b32)
|
||||
if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
|
||||
instr->opcode == aco_opcode::v_readlane_b32 ||
|
||||
instr->opcode == aco_opcode::v_readlane_b32_e64)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
@ -285,7 +287,9 @@ int handle_instruction_gfx8_9(NOP_ctx_gfx8_9& ctx, aco_ptr<Instruction>& instr,
|
|||
|
||||
switch (instr->opcode) {
|
||||
case aco_opcode::v_readlane_b32:
|
||||
case aco_opcode::v_writelane_b32: {
|
||||
case aco_opcode::v_readlane_b32_e64:
|
||||
case aco_opcode::v_writelane_b32:
|
||||
case aco_opcode::v_writelane_b32_e64: {
|
||||
if (ctx.VALU_wrsgpr + 4 < new_idx)
|
||||
break;
|
||||
PhysReg reg = instr->operands[1].physReg();
|
||||
|
|
|
@ -118,7 +118,9 @@ bool pred_by_exec_mask(aco_ptr<Instruction>& instr) {
|
|||
}
|
||||
|
||||
if (instr->opcode == aco_opcode::v_readlane_b32 ||
|
||||
instr->opcode == aco_opcode::v_writelane_b32)
|
||||
instr->opcode == aco_opcode::v_readlane_b32_e64 ||
|
||||
instr->opcode == aco_opcode::v_writelane_b32 ||
|
||||
instr->opcode == aco_opcode::v_writelane_b32_e64)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
|
|
|
@ -163,7 +163,7 @@ Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_ne
|
|||
static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
|
||||
{
|
||||
if (index.regClass() == s1)
|
||||
return bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), data, index);
|
||||
return bld.readlane(bld.def(s1), data, index);
|
||||
|
||||
Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
|
||||
|
||||
|
@ -6098,14 +6098,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
|
|||
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
|
||||
if (dst.regClass() == v1) {
|
||||
/* src2 is ignored for writelane. RA assigns the same reg for dst */
|
||||
emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val, lane, src), dst);
|
||||
emit_wqm(ctx, bld.writelane(bld.def(v1), val, lane, src), dst);
|
||||
} else if (dst.regClass() == v2) {
|
||||
Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
|
||||
Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
|
||||
bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
|
||||
bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
|
||||
Temp lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_lo, lane, src_hi));
|
||||
Temp hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_hi, lane, src_hi));
|
||||
Temp lo = emit_wqm(ctx, bld.writelane(bld.def(v1), val_lo, lane, src_hi));
|
||||
Temp hi = emit_wqm(ctx, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
|
||||
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
|
||||
emit_split_vector(ctx, dst, 2);
|
||||
} else {
|
||||
|
|
|
@ -481,7 +481,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
|
|||
|
||||
if (cluster_size == 64) {
|
||||
for (unsigned i = 0; i < src.size(); i++)
|
||||
bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
|
||||
bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
|
||||
emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
|
||||
}
|
||||
} else if (cluster_size == 32) {
|
||||
|
@ -519,8 +519,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
|
|||
if (ctx->program->wave_size == 64) {
|
||||
/* fill in the gap in row 2 */
|
||||
for (unsigned i = 0; i < src.size(); i++) {
|
||||
bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
|
||||
bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u));
|
||||
bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
|
||||
bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1));
|
||||
}
|
||||
}
|
||||
std::swap(tmp, vtmp);
|
||||
|
@ -531,8 +531,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
|
|||
if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take case of this overwise */
|
||||
if (ctx->program->chip_class < GFX10)
|
||||
assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
|
||||
bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{tmp+i}, v1),
|
||||
identity[i], Operand(0u));
|
||||
bld.writelane(Definition(PhysReg{tmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{tmp+i}, v1));
|
||||
}
|
||||
}
|
||||
/* fall through */
|
||||
|
@ -562,7 +561,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
|
|||
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0u));
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffffffffu));
|
||||
for (unsigned i = 0; i < src.size(); i++)
|
||||
bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
|
||||
bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
|
||||
emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
|
||||
}
|
||||
} else {
|
||||
|
@ -581,7 +580,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
|
|||
|
||||
if (op == aco_opcode::p_reduce && dst.regClass().type() == RegType::sgpr) {
|
||||
for (unsigned k = 0; k < src.size(); k++) {
|
||||
bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{dst.physReg() + k}, s1),
|
||||
bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1),
|
||||
Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1));
|
||||
}
|
||||
} else if (!(dst.physReg() == tmp) && !dst_written) {
|
||||
|
@ -911,21 +910,20 @@ void lower_to_hw_instr(Program* program)
|
|||
case aco_opcode::p_spill:
|
||||
{
|
||||
assert(instr->operands[0].regClass() == v1.as_linear());
|
||||
for (unsigned i = 0; i < instr->operands[2].size(); i++) {
|
||||
bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1, instr->operands[0].physReg()),
|
||||
for (unsigned i = 0; i < instr->operands[2].size(); i++)
|
||||
bld.writelane(bld.def(v1, instr->operands[0].physReg()),
|
||||
Operand(PhysReg{instr->operands[2].physReg() + i}, s1),
|
||||
Operand(instr->operands[1].constantValue() + i));
|
||||
}
|
||||
Operand(instr->operands[1].constantValue() + i),
|
||||
instr->operands[0]);
|
||||
break;
|
||||
}
|
||||
case aco_opcode::p_reload:
|
||||
{
|
||||
assert(instr->operands[0].regClass() == v1.as_linear());
|
||||
for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
|
||||
bld.vop3(aco_opcode::v_readlane_b32,
|
||||
bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
|
||||
instr->operands[0], Operand(instr->operands[1].constantValue() + i));
|
||||
}
|
||||
for (unsigned i = 0; i < instr->definitions[0].size(); i++)
|
||||
bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
|
||||
instr->operands[0],
|
||||
Operand(instr->operands[1].constantValue() + i));
|
||||
break;
|
||||
}
|
||||
case aco_opcode::p_as_uniform:
|
||||
|
|
|
@ -592,6 +592,8 @@ for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SMEM:
|
|||
VOP2 = {
|
||||
# GFX6, GFX7, GFX8, GFX9, GFX10, name, input/output modifiers
|
||||
(0x00, 0x00, 0x00, 0x00, 0x01, "v_cndmask_b32", False),
|
||||
(0x01, 0x01, -1, -1, -1, "v_readlane_b32", False),
|
||||
(0x02, 0x02, -1, -1, -1, "v_writelane_b32", False),
|
||||
(0x03, 0x03, 0x01, 0x01, 0x03, "v_add_f32", True),
|
||||
(0x04, 0x04, 0x02, 0x02, 0x04, "v_sub_f32", True),
|
||||
(0x05, 0x05, 0x03, 0x03, 0x05, "v_subrev_f32", True),
|
||||
|
@ -984,8 +986,8 @@ VOP3 = {
|
|||
( -1, -1, 0x276, 0x276, -1, "v_interp_p2_legacy_f16", True, True),
|
||||
( -1, -1, -1, 0x277, 0x35a, "v_interp_p2_f16", True, True),
|
||||
(0x12b, 0x12b, 0x288, 0x288, 0x362, "v_ldexp_f32", False, True),
|
||||
(0x101, 0x101, 0x289, 0x289, 0x360, "v_readlane_b32", False, False),
|
||||
(0x102, 0x102, 0x28a, 0x28a, 0x361, "v_writelane_b32", False, False),
|
||||
( -1, -1, 0x289, 0x289, 0x360, "v_readlane_b32_e64", False, False),
|
||||
( -1, -1, 0x28a, 0x28a, 0x361, "v_writelane_b32_e64", False, False),
|
||||
(0x122, 0x122, 0x28b, 0x28b, 0x364, "v_bcnt_u32_b32", False, False),
|
||||
(0x123, 0x123, 0x28c, 0x28c, 0x365, "v_mbcnt_lo_u32_b32", False, False),
|
||||
(0x124, 0x124, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32", False, False),
|
||||
|
|
|
@ -421,14 +421,19 @@ bool can_use_VOP3(aco_ptr<Instruction>& instr)
|
|||
return instr->opcode != aco_opcode::v_madmk_f32 &&
|
||||
instr->opcode != aco_opcode::v_madak_f32 &&
|
||||
instr->opcode != aco_opcode::v_madmk_f16 &&
|
||||
instr->opcode != aco_opcode::v_madak_f16;
|
||||
instr->opcode != aco_opcode::v_madak_f16 &&
|
||||
instr->opcode != aco_opcode::v_readlane_b32 &&
|
||||
instr->opcode != aco_opcode::v_writelane_b32 &&
|
||||
instr->opcode != aco_opcode::v_readfirstlane_b32;
|
||||
}
|
||||
|
||||
bool can_apply_sgprs(aco_ptr<Instruction>& instr)
|
||||
{
|
||||
return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
|
||||
instr->opcode != aco_opcode::v_readlane_b32 &&
|
||||
instr->opcode != aco_opcode::v_writelane_b32;
|
||||
instr->opcode != aco_opcode::v_readlane_b32_e64 &&
|
||||
instr->opcode != aco_opcode::v_writelane_b32 &&
|
||||
instr->opcode != aco_opcode::v_writelane_b32_e64;
|
||||
}
|
||||
|
||||
void to_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
|
@ -458,6 +463,7 @@ bool can_accept_constant(aco_ptr<Instruction>& instr, unsigned operand)
|
|||
case aco_opcode::v_interp_p2_f32:
|
||||
case aco_opcode::v_mac_f32:
|
||||
case aco_opcode::v_writelane_b32:
|
||||
case aco_opcode::v_writelane_b32_e64:
|
||||
case aco_opcode::v_cndmask_b32:
|
||||
return operand != 2;
|
||||
case aco_opcode::s_addk_i32:
|
||||
|
@ -466,6 +472,7 @@ bool can_accept_constant(aco_ptr<Instruction>& instr, unsigned operand)
|
|||
case aco_opcode::p_extract_vector:
|
||||
case aco_opcode::p_split_vector:
|
||||
case aco_opcode::v_readlane_b32:
|
||||
case aco_opcode::v_readlane_b32_e64:
|
||||
case aco_opcode::v_readfirstlane_b32:
|
||||
return operand != 0;
|
||||
default:
|
||||
|
@ -494,7 +501,8 @@ bool valu_can_accept_literal(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned
|
|||
|
||||
bool valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
|
||||
{
|
||||
if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32)
|
||||
if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64 ||
|
||||
instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64)
|
||||
return operand != 1;
|
||||
return true;
|
||||
}
|
||||
|
@ -633,7 +641,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
|
|||
}
|
||||
if (info.is_constant() && can_accept_constant(instr, i)) {
|
||||
perfwarn(instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get());
|
||||
if (i == 0) {
|
||||
if (i == 0 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) {
|
||||
instr->operands[i] = Operand(info.val);
|
||||
continue;
|
||||
} else if (!instr->isVOP3() && can_swap_operands(instr)) {
|
||||
|
|
|
@ -1509,7 +1509,8 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
|
|||
/* handle definitions which must have the same register as an operand */
|
||||
if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
|
||||
instr->opcode == aco_opcode::v_mac_f32 ||
|
||||
instr->opcode == aco_opcode::v_writelane_b32) {
|
||||
instr->opcode == aco_opcode::v_writelane_b32 ||
|
||||
instr->opcode == aco_opcode::v_writelane_b32_e64) {
|
||||
instr->definitions[0].setFixed(instr->operands[2].physReg());
|
||||
} else if (instr->opcode == aco_opcode::s_addk_i32 ||
|
||||
instr->opcode == aco_opcode::s_mulk_i32) {
|
||||
|
|
|
@ -132,12 +132,21 @@ void validate(Program* program, FILE * output)
|
|||
check(instr->definitions[0].getTemp().type() == RegType::vgpr ||
|
||||
(int) instr->format & (int) Format::VOPC ||
|
||||
instr->opcode == aco_opcode::v_readfirstlane_b32 ||
|
||||
instr->opcode == aco_opcode::v_readlane_b32,
|
||||
instr->opcode == aco_opcode::v_readlane_b32 ||
|
||||
instr->opcode == aco_opcode::v_readlane_b32_e64,
|
||||
"Wrong Definition type for VALU instruction", instr.get());
|
||||
unsigned num_sgpr = 0;
|
||||
unsigned sgpr_idx = instr->operands.size();
|
||||
for (unsigned i = 0; i < instr->operands.size(); i++)
|
||||
{
|
||||
for (unsigned i = 0; i < instr->operands.size(); i++) {
|
||||
if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
|
||||
instr->opcode == aco_opcode::v_readlane_b32 ||
|
||||
instr->opcode == aco_opcode::v_readlane_b32_e64 ||
|
||||
instr->opcode == aco_opcode::v_writelane_b32 ||
|
||||
instr->opcode == aco_opcode::v_writelane_b32_e64) {
|
||||
check(!instr->operands[i].isLiteral(), "No literal allowed on VALU instruction", instr.get());
|
||||
check(i == 1 || (instr->operands[i].isTemp() && instr->operands[i].regClass() == v1), "Wrong Operand type for VALU instruction", instr.get());
|
||||
continue;
|
||||
}
|
||||
if (instr->operands[i].isTemp() && instr->operands[i].regClass().type() == RegType::sgpr) {
|
||||
check(i != 1 || (int) instr->format & (int) Format::VOP3A, "Wrong source position for SGPR argument", instr.get());
|
||||
|
||||
|
|
Loading…
Reference in New Issue