aco/lower_to_hw: remove gfx6/7 subdword paths

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28836>
This commit is contained in:
Georg Lehmann 2024-04-20 09:10:53 +02:00 committed by Marge Bot
parent 6ecbda83f8
commit d4084f7f09
2 changed files with 16 additions and 285 deletions

View File

@ -1371,43 +1371,6 @@ do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* pres
bld.sop1(aco_opcode::s_mov_b32, def, op);
} else if (def.regClass() == s2) {
bld.sop1(aco_opcode::s_mov_b64, def, op);
} else if (def.regClass().is_subdword() && ctx->program->gfx_level < GFX8) {
if (op.physReg().byte()) {
assert(def.physReg().byte() == 0);
bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand::c32(op.physReg().byte() * 8), op);
} else if (def.physReg().byte()) {
assert(op.physReg().byte() == 0);
/* preserve the target's lower half */
uint32_t bits = def.physReg().byte() * 8;
PhysReg lo_reg = PhysReg(def.physReg().reg());
Definition lo_half =
Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte()));
Definition dst =
Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes()));
if (def.physReg().reg() == op.physReg().reg()) {
bld.vop2(aco_opcode::v_and_b32, lo_half, Operand::c32((1 << bits) - 1u),
Operand(lo_reg, lo_half.regClass()));
if (def.physReg().byte() == 1) {
bld.vop2(aco_opcode::v_mul_u32_u24, dst, Operand::c32((1 << bits) + 1u), op);
} else if (def.physReg().byte() == 2) {
bld.vop3(aco_opcode::v_cvt_pk_u16_u32, dst, Operand(lo_reg, v2b), op);
} else if (def.physReg().byte() == 3) {
bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1),
Operand::c32((1 << bits) + 1u));
bld.vop3(aco_opcode::v_mul_lo_u32, dst, Operand(scratch_sgpr, s1), op);
}
} else {
lo_half.setFixed(lo_half.physReg().advance(4 - def.physReg().byte()));
bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand::c32(32 - bits),
Operand(lo_reg, lo_half.regClass()));
bld.vop3(aco_opcode::v_alignbyte_b32, dst, op,
Operand(lo_half.physReg(), lo_half.regClass()),
Operand::c32(4 - def.physReg().byte()));
}
} else {
bld.vop1(aco_opcode::v_mov_b32, def, op);
}
} else if (def.regClass() == v1b && ctx->program->gfx_level >= GFX11) {
uint8_t swiz[] = {4, 5, 6, 7};
swiz[def.physReg().byte()] = op.physReg().byte();
@ -1568,6 +1531,8 @@ do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool prese
void
do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi)
{
assert(ctx->program->gfx_level >= GFX8);
if (lo.isConstant() && hi.isConstant()) {
copy_constant(ctx, bld, def, Operand::c32(lo.constantValue() | (hi.constantValue() << 16)));
return;
@ -1651,35 +1616,12 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
emit_v_mov_b16(bld, def_hi, hi);
else
emit_v_mov_b16(bld, def_lo, lo);
return;
} else if (ctx->program->gfx_level >= GFX8) {
} else {
if (lo.physReg().reg() == def.physReg().reg())
bld.vop1_sdwa(aco_opcode::v_mov_b32, def_hi, hi);
else
bld.vop1_sdwa(aco_opcode::v_mov_b32, def_lo, lo);
return;
}
/* alignbyte needs the operands in the following way:
* | xx hi | lo xx | >> 2 byte */
if (lo.physReg().byte() != hi.physReg().byte()) {
/* | xx lo | hi xx | => | lo hi | lo hi | */
assert(lo.physReg().byte() == 0 && hi.physReg().byte() == 2);
bld.vop3(aco_opcode::v_alignbyte_b32, def, lo, hi, Operand::c32(2u));
lo = Operand(def_hi.physReg(), v2b);
hi = Operand(def_lo.physReg(), v2b);
} else if (lo.physReg().byte() == 0) {
/* | xx hi | xx lo | => | xx hi | lo 00 | */
bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), lo);
lo = Operand(def_hi.physReg(), v2b);
} else {
/* | hi xx | lo xx | => | 00 hi | lo xx | */
assert(hi.physReg().byte() == 2);
bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), hi);
hi = Operand(def_lo.physReg(), v2b);
}
/* perform the alignbyte */
bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
}
void
@ -1833,53 +1775,6 @@ handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
}
}
/* on GFX6/7, we need some small workarounds as there is no
* SDWA instruction to do partial register writes */
if (ctx->program->gfx_level < GFX8 && it->second.bytes < 4) {
if (it->first.byte() == 0 && it->second.op.physReg().byte() == 0 && !it->second.is_used &&
pi->opcode == aco_opcode::p_split_vector) {
/* Other operations might overwrite the high bits, so change all users
* of the high bits to the new target where they are still available.
* This mechanism depends on also emitting dead definitions. */
PhysReg reg_hi = it->second.op.physReg().advance(it->second.bytes);
while (reg_hi != PhysReg(it->second.op.physReg().reg() + 1)) {
std::map<PhysReg, copy_operation>::iterator other = copy_map.begin();
for (other = copy_map.begin(); other != copy_map.end(); other++) {
/* on GFX6/7, if the high bits are used as operand, they cannot be a target */
if (other->second.op.physReg() == reg_hi) {
other->second.op.setFixed(it->first.advance(reg_hi.byte()));
break; /* break because an operand can only be used once */
}
}
reg_hi = reg_hi.advance(it->second.bytes);
}
} else if (it->first.byte()) {
assert(pi->opcode == aco_opcode::p_create_vector);
/* on GFX6/7, if we target an upper half where the lower half hasn't yet been handled,
* move to the target operand's high bits. This is safe to do as it cannot be an operand
*/
PhysReg lo = PhysReg(it->first.reg());
std::map<PhysReg, copy_operation>::iterator other = copy_map.find(lo);
if (other != copy_map.end()) {
assert(other->second.bytes == it->first.byte());
PhysReg new_reg_hi = other->second.op.physReg().advance(it->first.byte());
it->second.def = Definition(new_reg_hi, it->second.def.regClass());
it->second.is_used = 0;
other->second.bytes += it->second.bytes;
other->second.def.setTemp(Temp(other->second.def.tempId(),
RegClass::get(RegType::vgpr, other->second.bytes)));
other->second.op.setTemp(Temp(other->second.op.tempId(),
RegClass::get(RegType::vgpr, other->second.bytes)));
/* if the new target's high bits are also a target, change uses */
std::map<PhysReg, copy_operation>::iterator target = copy_map.find(new_reg_hi);
if (target != copy_map.end()) {
for (unsigned i = 0; i < it->second.bytes; i++)
target->second.uses[i]++;
}
}
}
}
/* find portions where the target reg is not used as operand for any other copy */
if (it->second.is_used) {
if (it->second.op.isConstant() || skip_partial_copies) {
@ -1899,8 +1794,7 @@ handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
* a partial copy allows further copies, it should be done instead. */
bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
/* on GFX6/7, we can only do copies with full registers */
if (partial_copy || ctx->program->gfx_level <= GFX7)
if (partial_copy)
break;
for (uint16_t i = 0; i < copy.second.bytes; i++) {
/* distance might underflow */
@ -2026,8 +1920,7 @@ handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
}
/* GFX6-7 can only swap full registers */
if (ctx->program->gfx_level <= GFX7)
swap.bytes = align(swap.bytes, 4);
assert (ctx->program->gfx_level > GFX7 || (swap.bytes % 4) == 0);
do_swap(ctx, bld, swap, preserve_scc, pi);
@ -2588,8 +2481,7 @@ lower_to_hw_instr(Program* program)
bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst,
bld.def(s1, scc), op, Operand::c32((bits << 16) | offset));
}
} else if ((dst.regClass() == v1 && op.physReg().byte() == 0) ||
ctx.program->gfx_level <= GFX7) {
} else if (dst.regClass() == v1 && op.physReg().byte() == 0) {
assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0);
if (offset == (32 - bits) && op.regClass() != s1) {
bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst,
@ -2667,10 +2559,9 @@ lower_to_hw_instr(Program* program)
Operand(dst.physReg(), s1), Operand::c32(offset));
}
} else if (dst.regClass() == v1 || !has_sdwa) {
if (offset == (dst.bytes() * 8u - bits) &&
(dst.regClass() == v1 || program->gfx_level <= GFX7)) {
if (offset == (dst.bytes() * 8u - bits) && dst.regClass() == v1) {
bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op);
} else if (offset == 0 && (dst.regClass() == v1 || program->gfx_level <= GFX7)) {
} else if (offset == 0 && dst.regClass() == v1) {
bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
} else if (has_sdwa && (op.regClass() != s1 || program->gfx_level >= GFX9)) {
bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa().dst_sel =

View File

@ -16,8 +16,6 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
PhysReg v1_hi{257};
PhysReg v1_b1{257};
PhysReg v1_b3{257};
PhysReg v2_lo{258};
PhysReg v3_lo{259};
v0_hi.reg_b += 2;
v1_hi.reg_b += 2;
v0_b1.reg_b += 1;
@ -25,164 +23,6 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
v0_b3.reg_b += 3;
v1_b3.reg_b += 3;
for (unsigned i = GFX6; i <= GFX7; i++) {
if (!setup_cs(NULL, (amd_gfx_level)i))
continue;
//~gfx[67]>> p_unit_test 0
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v1_lo, v2b),
Operand(v1_lo, v2b), Operand(v0_lo, v2b));
//~gfx[67]! p_unit_test 1
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v1), Operand(v1_lo, v2b),
Operand(v0_lo, v2b));
//~gfx[67]! p_unit_test 2
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
//~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[2][0:16]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v6b), Operand(v1_lo, v2b),
Operand(v0_lo, v2b), Operand(v2_lo, v2b));
//~gfx[67]! p_unit_test 3
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
//~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[2][0:16]
//~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[3][0:16], %0:v[1][16:32], 2
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v2), Operand(v1_lo, v2b),
Operand(v0_lo, v2b), Operand(v2_lo, v2b), Operand(v3_lo, v2b));
//~gfx[67]! p_unit_test 4
//~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16]
//~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[2][0:16], %0:v[1][16:32], 2
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[3][0:16], %0:v[0][16:32], 2
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v2), Operand(v1_lo, v2b),
Operand(v2_lo, v2b), Operand(v0_lo, v2b), Operand(v3_lo, v2b));
//~gfx[67]! p_unit_test 5
//~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16]
//~gfx[67]! v2b: %0:v[0][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v0_lo, v2b),
Operand(v0_lo, v1));
//~gfx[67]! p_unit_test 6
//~gfx[67]! v2b: %0:v[2][0:16] = v_mov_b32 %0:v[1][0:16]
//~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16]
//~gfx[67]! v2b: %0:v[0][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u));
bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v0_lo, v2b),
Definition(v2_lo, v2b), Operand(v0_lo, v6b));
//~gfx[67]! p_unit_test 7
//~gfx[67]! v2b: %0:v[2][0:16] = v_mov_b32 %0:v[1][0:16]
//~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16]
//~gfx[67]! v2b: %0:v[0][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
//~gfx[67]! v2b: %0:v[3][0:16] = v_lshrrev_b32 16, %0:v[2][16:32]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u));
bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v0_lo, v2b),
Definition(v2_lo, v2b), Definition(v3_lo, v2b), Operand(v0_lo, v2));
//~gfx[67]! p_unit_test 8
//~gfx[67]! v2b: %0:v[2][0:16] = v_lshrrev_b32 16, %0:v[0][16:32]
//~gfx[67]! v2b: %0:v[3][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u));
bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v2_lo, v2b),
Definition(v0_lo, v2b), Definition(v3_lo, v2b), Operand(v0_lo, v2));
//~gfx[67]! p_unit_test 9
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Definition(v1_lo, v1b),
Operand(v1_lo, v1b), Operand(v0_lo, v1b));
//~gfx[67]! p_unit_test 10
//~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
//~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
//~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u));
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v2b), Operand(v1_lo, v1b),
Operand(v0_lo, v1b));
//~gfx[67]! p_unit_test 11
//~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
//~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
//~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
//~gfx[67]! v3b: %0:v[0][0:24] = v_alignbyte_b32 %0:v[2][0:8], %0:v[0][16:32], 2
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u));
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v3b), Operand(v1_lo, v1b),
Operand(v0_lo, v1b), Operand(v2_lo, v1b));
//~gfx[67]! p_unit_test 12
//~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
//~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
//~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
//~gfx[67]! v3b: %0:v[0][0:24] = v_alignbyte_b32 %0:v[2][0:8], %0:v[0][16:32], 2
//~gfx[67]! v3b: %0:v[0][8:32] = v_lshlrev_b32 8, %0:v[0][0:24]
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[3][0:8], %0:v[0][8:32], 1
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u));
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v1), Operand(v1_lo, v1b),
Operand(v0_lo, v1b), Operand(v2_lo, v1b), Operand(v3_lo, v1b));
//~gfx[67]! p_unit_test 13
//~gfx[67]! v1b: %0:v[0][0:8] = v_and_b32 0xff, %0:v[0][0:8]
//~gfx[67]! v2b: %0:v[0][0:16] = v_mul_u32_u24 0x101, %0:v[0][0:8]
//~gfx[67]! v2b: %0:v[0][0:16] = v_and_b32 0xffff, %0:v[0][0:16]
//~gfx[67]! v3b: %0:v[0][0:24] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[0][0:8]
//~gfx[67]! v3b: %0:v[0][0:24] = v_and_b32 0xffffff, %0:v[0][0:24]
//~gfx[67]! s1: %0:m0 = s_mov_b32 0x1000001
//~gfx[67]! v1: %0:v[0] = v_mul_lo_u32 %0:m0, %0:v[0][0:8]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u));
Instruction* pseudo =
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v1), Operand(v0_lo, v1b),
Operand(v0_lo, v1b), Operand(v0_lo, v1b), Operand(v0_lo, v1b));
pseudo->pseudo().scratch_sgpr = m0;
//~gfx[67]! p_unit_test 14
//~gfx[67]! v1b: %0:v[1][0:8] = v_mov_b32 %0:v[0][0:8]
//~gfx[67]! v1b: %0:v[0][0:8] = v_lshrrev_b32 8, %0:v[1][8:16]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u));
bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v1b), Definition(v0_lo, v1b),
Operand(v0_lo, v2b));
//~gfx[67]! p_unit_test 15
//~gfx[67]! v1b: %0:v[1][0:8] = v_mov_b32 %0:v[0][0:8]
//~gfx[67]! v1b: %0:v[0][0:8] = v_lshrrev_b32 8, %0:v[1][8:16]
//~gfx[67]! v1b: %0:v[2][0:8] = v_lshrrev_b32 16, %0:v[1][16:24]
//~gfx[67]! v1b: %0:v[3][0:8] = v_lshrrev_b32 24, %0:v[1][24:32]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u));
bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v1b), Definition(v0_lo, v1b),
Definition(v2_lo, v1b), Definition(v3_lo, v1b), Operand(v0_lo, v1));
//~gfx[67]! s_endpgm
finish_to_hw_instr_test();
}
for (amd_gfx_level lvl : {GFX8, GFX9, GFX11}) {
if (!setup_cs(NULL, lvl))
continue;
@ -621,23 +461,23 @@ BEGIN_TEST(to_hw_instr.extract)
//>> p_unit_test 4
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
//~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 0, 8
//~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(0)
//~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c00
//~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060000
//~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04
EXT(0, 0)
if (lvl != GFX7)
EXT(0, 0)
//~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(2)
//~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c02
//~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060202
//~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04
if (lvl != GFX7)
EXT(0, 2)
//~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 8, 8
//~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(1)
//~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c01
//~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060801
EXT(1, 0)
if (lvl != GFX7)
EXT(1, 0)
//~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(3)
//~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c03
//~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060903
@ -721,10 +561,10 @@ BEGIN_TEST(to_hw_instr.insert)
//>> p_unit_test 2
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
//~gfx7! v2b: %_:v[0][0:16] = v_bfe_u32 %_:v[1][0:16], 0, 8
//~gfx(8|9)! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:ubyte0
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060c00
INS(0, 0, 0)
if (lvl != GFX7)
INS(0, 0, 0)
//~gfx(8|9)! v2b: %0:v[0][16:32] = v_mov_b32 %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:ubyte0
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc000504
if (lvl != GFX7)
@ -737,12 +577,12 @@ BEGIN_TEST(to_hw_instr.insert)
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc020504
if (lvl != GFX7)
INS(0, 2, 2)
//~gfx7! v2b: %_:v[0][0:16] = v_lshlrev_b32 8, %_:v[1][0:16]
//~gfx8! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][0:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte0
//~gfx8! v2b: %0:v[0][0:16] = v_and_b32 0xffffff00, %0:v[1]
//~gfx9! v2b: %0:v[0][0:16] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte0
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x706000c
INS(1, 0, 0)
if (lvl != GFX7)
INS(1, 0, 0)
//~gfx8! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][0:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte0
//~gfx8! v2b: %0:v[0][16:32] = v_and_b32 0xff00ffff, %0:v[1]
//~gfx9! v2b: %0:v[0][16:32] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte0