mirror of https://gitlab.freedesktop.org/mesa/mesa
aco/lower_to_hw: remove gfx6/7 subdword paths
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28836>
This commit is contained in:
parent
6ecbda83f8
commit
d4084f7f09
|
@ -1371,43 +1371,6 @@ do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* pres
|
|||
bld.sop1(aco_opcode::s_mov_b32, def, op);
|
||||
} else if (def.regClass() == s2) {
|
||||
bld.sop1(aco_opcode::s_mov_b64, def, op);
|
||||
} else if (def.regClass().is_subdword() && ctx->program->gfx_level < GFX8) {
|
||||
if (op.physReg().byte()) {
|
||||
assert(def.physReg().byte() == 0);
|
||||
bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand::c32(op.physReg().byte() * 8), op);
|
||||
} else if (def.physReg().byte()) {
|
||||
assert(op.physReg().byte() == 0);
|
||||
/* preserve the target's lower half */
|
||||
uint32_t bits = def.physReg().byte() * 8;
|
||||
PhysReg lo_reg = PhysReg(def.physReg().reg());
|
||||
Definition lo_half =
|
||||
Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte()));
|
||||
Definition dst =
|
||||
Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes()));
|
||||
|
||||
if (def.physReg().reg() == op.physReg().reg()) {
|
||||
bld.vop2(aco_opcode::v_and_b32, lo_half, Operand::c32((1 << bits) - 1u),
|
||||
Operand(lo_reg, lo_half.regClass()));
|
||||
if (def.physReg().byte() == 1) {
|
||||
bld.vop2(aco_opcode::v_mul_u32_u24, dst, Operand::c32((1 << bits) + 1u), op);
|
||||
} else if (def.physReg().byte() == 2) {
|
||||
bld.vop3(aco_opcode::v_cvt_pk_u16_u32, dst, Operand(lo_reg, v2b), op);
|
||||
} else if (def.physReg().byte() == 3) {
|
||||
bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1),
|
||||
Operand::c32((1 << bits) + 1u));
|
||||
bld.vop3(aco_opcode::v_mul_lo_u32, dst, Operand(scratch_sgpr, s1), op);
|
||||
}
|
||||
} else {
|
||||
lo_half.setFixed(lo_half.physReg().advance(4 - def.physReg().byte()));
|
||||
bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand::c32(32 - bits),
|
||||
Operand(lo_reg, lo_half.regClass()));
|
||||
bld.vop3(aco_opcode::v_alignbyte_b32, dst, op,
|
||||
Operand(lo_half.physReg(), lo_half.regClass()),
|
||||
Operand::c32(4 - def.physReg().byte()));
|
||||
}
|
||||
} else {
|
||||
bld.vop1(aco_opcode::v_mov_b32, def, op);
|
||||
}
|
||||
} else if (def.regClass() == v1b && ctx->program->gfx_level >= GFX11) {
|
||||
uint8_t swiz[] = {4, 5, 6, 7};
|
||||
swiz[def.physReg().byte()] = op.physReg().byte();
|
||||
|
@ -1568,6 +1531,8 @@ do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool prese
|
|||
void
|
||||
do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi)
|
||||
{
|
||||
assert(ctx->program->gfx_level >= GFX8);
|
||||
|
||||
if (lo.isConstant() && hi.isConstant()) {
|
||||
copy_constant(ctx, bld, def, Operand::c32(lo.constantValue() | (hi.constantValue() << 16)));
|
||||
return;
|
||||
|
@ -1651,35 +1616,12 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera
|
|||
emit_v_mov_b16(bld, def_hi, hi);
|
||||
else
|
||||
emit_v_mov_b16(bld, def_lo, lo);
|
||||
return;
|
||||
} else if (ctx->program->gfx_level >= GFX8) {
|
||||
} else {
|
||||
if (lo.physReg().reg() == def.physReg().reg())
|
||||
bld.vop1_sdwa(aco_opcode::v_mov_b32, def_hi, hi);
|
||||
else
|
||||
bld.vop1_sdwa(aco_opcode::v_mov_b32, def_lo, lo);
|
||||
return;
|
||||
}
|
||||
|
||||
/* alignbyte needs the operands in the following way:
|
||||
* | xx hi | lo xx | >> 2 byte */
|
||||
if (lo.physReg().byte() != hi.physReg().byte()) {
|
||||
/* | xx lo | hi xx | => | lo hi | lo hi | */
|
||||
assert(lo.physReg().byte() == 0 && hi.physReg().byte() == 2);
|
||||
bld.vop3(aco_opcode::v_alignbyte_b32, def, lo, hi, Operand::c32(2u));
|
||||
lo = Operand(def_hi.physReg(), v2b);
|
||||
hi = Operand(def_lo.physReg(), v2b);
|
||||
} else if (lo.physReg().byte() == 0) {
|
||||
/* | xx hi | xx lo | => | xx hi | lo 00 | */
|
||||
bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), lo);
|
||||
lo = Operand(def_hi.physReg(), v2b);
|
||||
} else {
|
||||
/* | hi xx | lo xx | => | 00 hi | lo xx | */
|
||||
assert(hi.physReg().byte() == 2);
|
||||
bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), hi);
|
||||
hi = Operand(def_lo.physReg(), v2b);
|
||||
}
|
||||
/* perform the alignbyte */
|
||||
bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -1833,53 +1775,6 @@ handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
|
|||
}
|
||||
}
|
||||
|
||||
/* on GFX6/7, we need some small workarounds as there is no
|
||||
* SDWA instruction to do partial register writes */
|
||||
if (ctx->program->gfx_level < GFX8 && it->second.bytes < 4) {
|
||||
if (it->first.byte() == 0 && it->second.op.physReg().byte() == 0 && !it->second.is_used &&
|
||||
pi->opcode == aco_opcode::p_split_vector) {
|
||||
/* Other operations might overwrite the high bits, so change all users
|
||||
* of the high bits to the new target where they are still available.
|
||||
* This mechanism depends on also emitting dead definitions. */
|
||||
PhysReg reg_hi = it->second.op.physReg().advance(it->second.bytes);
|
||||
while (reg_hi != PhysReg(it->second.op.physReg().reg() + 1)) {
|
||||
std::map<PhysReg, copy_operation>::iterator other = copy_map.begin();
|
||||
for (other = copy_map.begin(); other != copy_map.end(); other++) {
|
||||
/* on GFX6/7, if the high bits are used as operand, they cannot be a target */
|
||||
if (other->second.op.physReg() == reg_hi) {
|
||||
other->second.op.setFixed(it->first.advance(reg_hi.byte()));
|
||||
break; /* break because an operand can only be used once */
|
||||
}
|
||||
}
|
||||
reg_hi = reg_hi.advance(it->second.bytes);
|
||||
}
|
||||
} else if (it->first.byte()) {
|
||||
assert(pi->opcode == aco_opcode::p_create_vector);
|
||||
/* on GFX6/7, if we target an upper half where the lower half hasn't yet been handled,
|
||||
* move to the target operand's high bits. This is save to do as it cannot be an operand
|
||||
*/
|
||||
PhysReg lo = PhysReg(it->first.reg());
|
||||
std::map<PhysReg, copy_operation>::iterator other = copy_map.find(lo);
|
||||
if (other != copy_map.end()) {
|
||||
assert(other->second.bytes == it->first.byte());
|
||||
PhysReg new_reg_hi = other->second.op.physReg().advance(it->first.byte());
|
||||
it->second.def = Definition(new_reg_hi, it->second.def.regClass());
|
||||
it->second.is_used = 0;
|
||||
other->second.bytes += it->second.bytes;
|
||||
other->second.def.setTemp(Temp(other->second.def.tempId(),
|
||||
RegClass::get(RegType::vgpr, other->second.bytes)));
|
||||
other->second.op.setTemp(Temp(other->second.op.tempId(),
|
||||
RegClass::get(RegType::vgpr, other->second.bytes)));
|
||||
/* if the new target's high bits are also a target, change uses */
|
||||
std::map<PhysReg, copy_operation>::iterator target = copy_map.find(new_reg_hi);
|
||||
if (target != copy_map.end()) {
|
||||
for (unsigned i = 0; i < it->second.bytes; i++)
|
||||
target->second.uses[i]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* find portions where the target reg is not used as operand for any other copy */
|
||||
if (it->second.is_used) {
|
||||
if (it->second.op.isConstant() || skip_partial_copies) {
|
||||
|
@ -1899,8 +1794,7 @@ handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
|
|||
* a partial copy allows further copies, it should be done instead. */
|
||||
bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
|
||||
for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
|
||||
/* on GFX6/7, we can only do copies with full registers */
|
||||
if (partial_copy || ctx->program->gfx_level <= GFX7)
|
||||
if (partial_copy)
|
||||
break;
|
||||
for (uint16_t i = 0; i < copy.second.bytes; i++) {
|
||||
/* distance might underflow */
|
||||
|
@ -2026,8 +1920,7 @@ handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
|
|||
}
|
||||
|
||||
/* GFX6-7 can only swap full registers */
|
||||
if (ctx->program->gfx_level <= GFX7)
|
||||
swap.bytes = align(swap.bytes, 4);
|
||||
assert (ctx->program->gfx_level > GFX7 || (swap.bytes % 4) == 0);
|
||||
|
||||
do_swap(ctx, bld, swap, preserve_scc, pi);
|
||||
|
||||
|
@ -2588,8 +2481,7 @@ lower_to_hw_instr(Program* program)
|
|||
bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst,
|
||||
bld.def(s1, scc), op, Operand::c32((bits << 16) | offset));
|
||||
}
|
||||
} else if ((dst.regClass() == v1 && op.physReg().byte() == 0) ||
|
||||
ctx.program->gfx_level <= GFX7) {
|
||||
} else if (dst.regClass() == v1 && op.physReg().byte() == 0) {
|
||||
assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0);
|
||||
if (offset == (32 - bits) && op.regClass() != s1) {
|
||||
bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst,
|
||||
|
@ -2667,10 +2559,9 @@ lower_to_hw_instr(Program* program)
|
|||
Operand(dst.physReg(), s1), Operand::c32(offset));
|
||||
}
|
||||
} else if (dst.regClass() == v1 || !has_sdwa) {
|
||||
if (offset == (dst.bytes() * 8u - bits) &&
|
||||
(dst.regClass() == v1 || program->gfx_level <= GFX7)) {
|
||||
if (offset == (dst.bytes() * 8u - bits) && dst.regClass() == v1) {
|
||||
bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op);
|
||||
} else if (offset == 0 && (dst.regClass() == v1 || program->gfx_level <= GFX7)) {
|
||||
} else if (offset == 0 && dst.regClass() == v1) {
|
||||
bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
|
||||
} else if (has_sdwa && (op.regClass() != s1 || program->gfx_level >= GFX9)) {
|
||||
bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa().dst_sel =
|
||||
|
|
|
@ -16,8 +16,6 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
|||
PhysReg v1_hi{257};
|
||||
PhysReg v1_b1{257};
|
||||
PhysReg v1_b3{257};
|
||||
PhysReg v2_lo{258};
|
||||
PhysReg v3_lo{259};
|
||||
v0_hi.reg_b += 2;
|
||||
v1_hi.reg_b += 2;
|
||||
v0_b1.reg_b += 1;
|
||||
|
@ -25,164 +23,6 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
|||
v0_b3.reg_b += 3;
|
||||
v1_b3.reg_b += 3;
|
||||
|
||||
for (unsigned i = GFX6; i <= GFX7; i++) {
|
||||
if (!setup_cs(NULL, (amd_gfx_level)i))
|
||||
continue;
|
||||
|
||||
//~gfx[67]>> p_unit_test 0
|
||||
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
|
||||
//~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
|
||||
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
|
||||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v1_lo, v2b),
|
||||
Operand(v1_lo, v2b), Operand(v0_lo, v2b));
|
||||
|
||||
//~gfx[67]! p_unit_test 1
|
||||
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
|
||||
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2
|
||||
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
|
||||
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v1), Operand(v1_lo, v2b),
|
||||
Operand(v0_lo, v2b));
|
||||
|
||||
//~gfx[67]! p_unit_test 2
|
||||
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
|
||||
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2
|
||||
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
|
||||
//~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[2][0:16]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
|
||||
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v6b), Operand(v1_lo, v2b),
|
||||
Operand(v0_lo, v2b), Operand(v2_lo, v2b));
|
||||
|
||||
//~gfx[67]! p_unit_test 3
|
||||
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
|
||||
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2
|
||||
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
|
||||
//~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[2][0:16]
|
||||
//~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[3][0:16], %0:v[1][16:32], 2
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
|
||||
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v2), Operand(v1_lo, v2b),
|
||||
Operand(v0_lo, v2b), Operand(v2_lo, v2b), Operand(v3_lo, v2b));
|
||||
|
||||
//~gfx[67]! p_unit_test 4
|
||||
//~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16]
|
||||
//~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[2][0:16], %0:v[1][16:32], 2
|
||||
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
|
||||
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[3][0:16], %0:v[0][16:32], 2
|
||||
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
|
||||
//~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
|
||||
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
|
||||
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v2), Operand(v1_lo, v2b),
|
||||
Operand(v2_lo, v2b), Operand(v0_lo, v2b), Operand(v3_lo, v2b));
|
||||
|
||||
//~gfx[67]! p_unit_test 5
|
||||
//~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16]
|
||||
//~gfx[67]! v2b: %0:v[0][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
|
||||
bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v0_lo, v2b),
|
||||
Operand(v0_lo, v1));
|
||||
|
||||
//~gfx[67]! p_unit_test 6
|
||||
//~gfx[67]! v2b: %0:v[2][0:16] = v_mov_b32 %0:v[1][0:16]
|
||||
//~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16]
|
||||
//~gfx[67]! v2b: %0:v[0][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u));
|
||||
bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v0_lo, v2b),
|
||||
Definition(v2_lo, v2b), Operand(v0_lo, v6b));
|
||||
|
||||
//~gfx[67]! p_unit_test 7
|
||||
//~gfx[67]! v2b: %0:v[2][0:16] = v_mov_b32 %0:v[1][0:16]
|
||||
//~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16]
|
||||
//~gfx[67]! v2b: %0:v[0][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
|
||||
//~gfx[67]! v2b: %0:v[3][0:16] = v_lshrrev_b32 16, %0:v[2][16:32]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u));
|
||||
bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v0_lo, v2b),
|
||||
Definition(v2_lo, v2b), Definition(v3_lo, v2b), Operand(v0_lo, v2));
|
||||
|
||||
//~gfx[67]! p_unit_test 8
|
||||
//~gfx[67]! v2b: %0:v[2][0:16] = v_lshrrev_b32 16, %0:v[0][16:32]
|
||||
//~gfx[67]! v2b: %0:v[3][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
|
||||
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
|
||||
//~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
|
||||
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u));
|
||||
bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v2_lo, v2b),
|
||||
Definition(v0_lo, v2b), Definition(v3_lo, v2b), Operand(v0_lo, v2));
|
||||
|
||||
//~gfx[67]! p_unit_test 9
|
||||
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
|
||||
//~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
|
||||
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u));
|
||||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Definition(v1_lo, v1b),
|
||||
Operand(v1_lo, v1b), Operand(v0_lo, v1b));
|
||||
|
||||
//~gfx[67]! p_unit_test 10
|
||||
//~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
|
||||
//~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
|
||||
//~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u));
|
||||
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v2b), Operand(v1_lo, v1b),
|
||||
Operand(v0_lo, v1b));
|
||||
|
||||
//~gfx[67]! p_unit_test 11
|
||||
//~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
|
||||
//~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
|
||||
//~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
|
||||
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
|
||||
//~gfx[67]! v3b: %0:v[0][0:24] = v_alignbyte_b32 %0:v[2][0:8], %0:v[0][16:32], 2
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u));
|
||||
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v3b), Operand(v1_lo, v1b),
|
||||
Operand(v0_lo, v1b), Operand(v2_lo, v1b));
|
||||
|
||||
//~gfx[67]! p_unit_test 12
|
||||
//~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
|
||||
//~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
|
||||
//~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
|
||||
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
|
||||
//~gfx[67]! v3b: %0:v[0][0:24] = v_alignbyte_b32 %0:v[2][0:8], %0:v[0][16:32], 2
|
||||
//~gfx[67]! v3b: %0:v[0][8:32] = v_lshlrev_b32 8, %0:v[0][0:24]
|
||||
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[3][0:8], %0:v[0][8:32], 1
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u));
|
||||
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v1), Operand(v1_lo, v1b),
|
||||
Operand(v0_lo, v1b), Operand(v2_lo, v1b), Operand(v3_lo, v1b));
|
||||
|
||||
//~gfx[67]! p_unit_test 13
|
||||
//~gfx[67]! v1b: %0:v[0][0:8] = v_and_b32 0xff, %0:v[0][0:8]
|
||||
//~gfx[67]! v2b: %0:v[0][0:16] = v_mul_u32_u24 0x101, %0:v[0][0:8]
|
||||
//~gfx[67]! v2b: %0:v[0][0:16] = v_and_b32 0xffff, %0:v[0][0:16]
|
||||
//~gfx[67]! v3b: %0:v[0][0:24] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[0][0:8]
|
||||
//~gfx[67]! v3b: %0:v[0][0:24] = v_and_b32 0xffffff, %0:v[0][0:24]
|
||||
//~gfx[67]! s1: %0:m0 = s_mov_b32 0x1000001
|
||||
//~gfx[67]! v1: %0:v[0] = v_mul_lo_u32 %0:m0, %0:v[0][0:8]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u));
|
||||
Instruction* pseudo =
|
||||
bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v1), Operand(v0_lo, v1b),
|
||||
Operand(v0_lo, v1b), Operand(v0_lo, v1b), Operand(v0_lo, v1b));
|
||||
pseudo->pseudo().scratch_sgpr = m0;
|
||||
|
||||
//~gfx[67]! p_unit_test 14
|
||||
//~gfx[67]! v1b: %0:v[1][0:8] = v_mov_b32 %0:v[0][0:8]
|
||||
//~gfx[67]! v1b: %0:v[0][0:8] = v_lshrrev_b32 8, %0:v[1][8:16]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u));
|
||||
bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v1b), Definition(v0_lo, v1b),
|
||||
Operand(v0_lo, v2b));
|
||||
|
||||
//~gfx[67]! p_unit_test 15
|
||||
//~gfx[67]! v1b: %0:v[1][0:8] = v_mov_b32 %0:v[0][0:8]
|
||||
//~gfx[67]! v1b: %0:v[0][0:8] = v_lshrrev_b32 8, %0:v[1][8:16]
|
||||
//~gfx[67]! v1b: %0:v[2][0:8] = v_lshrrev_b32 16, %0:v[1][16:24]
|
||||
//~gfx[67]! v1b: %0:v[3][0:8] = v_lshrrev_b32 24, %0:v[1][24:32]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u));
|
||||
bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v1b), Definition(v0_lo, v1b),
|
||||
Definition(v2_lo, v1b), Definition(v3_lo, v1b), Operand(v0_lo, v1));
|
||||
|
||||
//~gfx[67]! s_endpgm
|
||||
|
||||
finish_to_hw_instr_test();
|
||||
}
|
||||
|
||||
for (amd_gfx_level lvl : {GFX8, GFX9, GFX11}) {
|
||||
if (!setup_cs(NULL, lvl))
|
||||
continue;
|
||||
|
@ -621,23 +461,23 @@ BEGIN_TEST(to_hw_instr.extract)
|
|||
|
||||
//>> p_unit_test 4
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
|
||||
//~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 0, 8
|
||||
//~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(0)
|
||||
//~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c00
|
||||
//~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060000
|
||||
//~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04
|
||||
EXT(0, 0)
|
||||
if (lvl != GFX7)
|
||||
EXT(0, 0)
|
||||
//~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(2)
|
||||
//~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c02
|
||||
//~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060202
|
||||
//~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04
|
||||
if (lvl != GFX7)
|
||||
EXT(0, 2)
|
||||
//~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 8, 8
|
||||
//~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(1)
|
||||
//~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c01
|
||||
//~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060801
|
||||
EXT(1, 0)
|
||||
if (lvl != GFX7)
|
||||
EXT(1, 0)
|
||||
//~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(3)
|
||||
//~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c03
|
||||
//~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060903
|
||||
|
@ -721,10 +561,10 @@ BEGIN_TEST(to_hw_instr.insert)
|
|||
|
||||
//>> p_unit_test 2
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
|
||||
//~gfx7! v2b: %_:v[0][0:16] = v_bfe_u32 %_:v[1][0:16], 0, 8
|
||||
//~gfx(8|9)! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:ubyte0
|
||||
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060c00
|
||||
INS(0, 0, 0)
|
||||
if (lvl != GFX7)
|
||||
INS(0, 0, 0)
|
||||
//~gfx(8|9)! v2b: %0:v[0][16:32] = v_mov_b32 %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:ubyte0
|
||||
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc000504
|
||||
if (lvl != GFX7)
|
||||
|
@ -737,12 +577,12 @@ BEGIN_TEST(to_hw_instr.insert)
|
|||
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc020504
|
||||
if (lvl != GFX7)
|
||||
INS(0, 2, 2)
|
||||
//~gfx7! v2b: %_:v[0][0:16] = v_lshlrev_b32 8, %_:v[1][0:16]
|
||||
//~gfx8! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][0:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte0
|
||||
//~gfx8! v2b: %0:v[0][0:16] = v_and_b32 0xffffff00, %0:v[1]
|
||||
//~gfx9! v2b: %0:v[0][0:16] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte0
|
||||
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x706000c
|
||||
INS(1, 0, 0)
|
||||
if (lvl != GFX7)
|
||||
INS(1, 0, 0)
|
||||
//~gfx8! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][0:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte0
|
||||
//~gfx8! v2b: %0:v[0][16:32] = v_and_b32 0xff00ffff, %0:v[1]
|
||||
//~gfx9! v2b: %0:v[0][16:32] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte0
|
||||
|
|
Loading…
Reference in New Issue