aco: refactor GFX6_7 subdword copy lowering

The new code uses v_alignbyte_b32, which leads
to shorter code and preserves the operand's
registers.

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7189>
This commit is contained in:
Daniel Schürmann 2020-10-16 15:12:28 +02:00 committed by Marge Bot
parent 06b41ca589
commit 40bfb08828
2 changed files with 43 additions and 60 deletions

View File

@ -1022,30 +1022,28 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool
if (op.physReg().byte()) {
assert(def.physReg().byte() == 0);
bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand(op.physReg().byte() * 8), op);
} else if (def.physReg().byte() == 2) {
} else if (def.physReg().byte()) {
assert(op.physReg().byte() == 0);
/* preserve the target's lower half */
def = Definition(def.physReg().advance(-2), v1);
bld.vop2(aco_opcode::v_and_b32, Definition(op.physReg(), v1), Operand(0xFFFFu), op);
if (def.physReg().reg() != op.physReg().reg())
bld.vop2(aco_opcode::v_and_b32, def, Operand(0xFFFFu), Operand(def.physReg(), v2b));
bld.vop2(aco_opcode::v_cvt_pk_u16_u32, def, Operand(def.physReg(), v2b), op);
} else if (def.physReg().byte()) {
unsigned bits = def.physReg().byte() * 8;
assert(op.physReg().byte() == 0);
def = Definition(def.physReg().advance(-def.physReg().byte()), v1);
bld.vop2(aco_opcode::v_and_b32, def, Operand((1 << bits) - 1u), Operand(def.physReg(), op.regClass()));
uint32_t bits = def.physReg().byte() * 8;
PhysReg lo_reg = PhysReg(def.physReg().reg());
Definition lo_half = Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte()));
Definition dst = Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes()));
if (def.physReg().reg() == op.physReg().reg()) {
if (bits < 24) {
bld.vop2(aco_opcode::v_mul_u32_u24, def, Operand((1 << bits) + 1u), op);
} else {
bld.vop2(aco_opcode::v_and_b32, lo_half, Operand((1 << bits) - 1u), Operand(lo_reg, lo_half.regClass()));
if (def.physReg().byte() == 1) {
bld.vop2(aco_opcode::v_mul_u32_u24, dst, Operand((1 << bits) + 1u), op);
} else if (def.physReg().byte() == 2) {
bld.vop2(aco_opcode::v_cvt_pk_u16_u32, dst, Operand(lo_reg, v2b), op);
} else if (def.physReg().byte() == 3) {
bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand((1 << bits) + 1u));
bld.vop3(aco_opcode::v_mul_lo_u32, def, Operand(scratch_sgpr, s1), op);
bld.vop3(aco_opcode::v_mul_lo_u32, dst, Operand(scratch_sgpr, s1), op);
}
} else {
bld.vop2(aco_opcode::v_lshlrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
bld.vop2(aco_opcode::v_or_b32, def, Operand(def.physReg(), op.regClass()), op);
bld.vop2(aco_opcode::v_lshrrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
lo_half.setFixed(lo_half.physReg().advance(4 - def.physReg().byte()));
bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand(32 - bits), Operand(lo_reg, lo_half.regClass()));
bld.vop3(aco_opcode::v_alignbyte_b32, dst, op, Operand(lo_half.physReg(), lo_half.regClass()), Operand(4 - def.physReg().byte()));
}
} else {
bld.vop1(aco_opcode::v_mov_b32, def, op);

View File

@ -57,9 +57,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
Operand(v1_lo, v2b), Operand(v0_lo, v2b));
//~gfx[67]! p_unit_test 1
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16]
//~gfx[67]! v1: %0:v[1] = v_and_b32 0xffff, %0:v[1][0:16]
//~gfx[67]! v1: %0:v[1] = v_cvt_pk_u16_u32 %0:v[1][0:16], %0:v[0][0:16]
//~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16]
//~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[0][0:16], %0:v[1][16:32], 2
//~gfx[67]! v1: %0:v[0] = v_mov_b32 %0:v[1]
bld.pseudo(aco_opcode::p_unit_test, Operand(1u));
bld.pseudo(aco_opcode::p_create_vector,
@ -67,9 +66,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
Operand(v1_lo, v2b), Operand(v0_lo, v2b));
//~gfx[67]! p_unit_test 2
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16]
//~gfx[67]! v1: %0:v[1] = v_and_b32 0xffff, %0:v[1][0:16]
//~gfx[67]! v1: %0:v[1] = v_cvt_pk_u16_u32 %0:v[1][0:16], %0:v[0][0:16]
//~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16]
//~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[0][0:16], %0:v[1][16:32], 2
//~gfx[67]! v1: %0:v[0] = v_mov_b32 %0:v[1]
//~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[2][0:16]
bld.pseudo(aco_opcode::p_unit_test, Operand(2u));
@ -78,9 +76,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
Operand(v0_lo, v2b), Operand(v2_lo, v2b));
//~gfx[67]! p_unit_test 3
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16]
//~gfx[67]! v1: %0:v[1] = v_and_b32 0xffff, %0:v[1][0:16]
//~gfx[67]! v1: %0:v[1] = v_cvt_pk_u16_u32 %0:v[1][0:16], %0:v[0][0:16]
//~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16]
//~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[0][0:16], %0:v[1][16:32], 2
//~gfx[67]! v1: %0:v[0] = v_mov_b32 %0:v[1]
//~gfx[67]! v1: %0:v[2] = v_and_b32 0xffff, %0:v[2][0:16]
//~gfx[67]! v1: %0:v[3] = v_and_b32 0xffff, %0:v[3][0:16]
@ -92,12 +89,10 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
Operand(v2_lo, v2b), Operand(v3_lo, v2b));
//~gfx[67]! p_unit_test 4
//~gfx[67]! v1: %0:v[2] = v_and_b32 0xffff, %0:v[2][0:16]
//~gfx[67]! v1: %0:v[1] = v_and_b32 0xffff, %0:v[1][0:16]
//~gfx[67]! v1: %0:v[1] = v_cvt_pk_u16_u32 %0:v[1][0:16], %0:v[2][0:16]
//~gfx[67]! v1: %0:v[3] = v_and_b32 0xffff, %0:v[3][0:16]
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16]
//~gfx[67]! v1: %0:v[0] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[3][0:16]
//~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16]
//~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[2][0:16], %0:v[1][16:32], 2
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[3][0:16], %0:v[0][16:32], 2
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
@ -157,10 +152,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
Operand(v1_lo, v1b), Operand(v0_lo, v1b));
//~gfx[67]! p_unit_test 10
//~gfx[67]! v1: %0:v[1] = v_and_b32 0xff, %0:v[1][0:8]
//~gfx[67]! v1: %0:v[0] = v_lshlrev_b32 8, %0:v[0][0:8]
//~gfx[67]! v1: %0:v[1] = v_or_b32 %0:v[1][0:8], %0:v[0][0:8]
//~gfx[67]! v1: %0:v[0] = v_lshrrev_b32 8, %0:v[0][0:8]
//~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
//~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
//~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
bld.pseudo(aco_opcode::p_unit_test, Operand(10u));
bld.pseudo(aco_opcode::p_create_vector,
@ -168,32 +161,24 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
Operand(v1_lo, v1b), Operand(v0_lo, v1b));
//~gfx[67]! p_unit_test 11
//~gfx[67]! v1: %0:v[1] = v_and_b32 0xff, %0:v[1][0:8]
//~gfx[67]! v1: %0:v[0] = v_lshlrev_b32 8, %0:v[0][0:8]
//~gfx[67]! v1: %0:v[1] = v_or_b32 %0:v[1][0:8], %0:v[0][0:8]
//~gfx[67]! v1: %0:v[0] = v_lshrrev_b32 8, %0:v[0][0:8]
//~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
//~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
//~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
//~gfx[67]! v1: %0:v[2] = v_and_b32 0xffff, %0:v[2][0:8]
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16]
//~gfx[67]! v1: %0:v[0] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[2][0:8]
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
//~gfx[67]! v3b: %0:v[0][0:24] = v_alignbyte_b32 %0:v[2][0:8], %0:v[0][16:32], 2
bld.pseudo(aco_opcode::p_unit_test, Operand(11u));
bld.pseudo(aco_opcode::p_create_vector,
Definition(v0_lo, v3b), Operand(v1_lo, v1b),
Operand(v0_lo, v1b), Operand(v2_lo, v1b));
//~gfx[67]! p_unit_test 12
//~gfx[67]! v1: %0:v[1] = v_and_b32 0xff, %0:v[1][0:8]
//~gfx[67]! v1: %0:v[0] = v_lshlrev_b32 8, %0:v[0][0:8]
//~gfx[67]! v1: %0:v[1] = v_or_b32 %0:v[1][0:8], %0:v[0][0:8]
//~gfx[67]! v1: %0:v[0] = v_lshrrev_b32 8, %0:v[0][0:8]
//~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
//~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
//~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
//~gfx[67]! v1: %0:v[2] = v_and_b32 0xffff, %0:v[2][0:8]
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16]
//~gfx[67]! v1: %0:v[0] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[2][0:8]
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffffff, %0:v[0][0:8]
//~gfx[67]! v1: %0:v[3] = v_lshlrev_b32 24, %0:v[3][0:8]
//~gfx[67]! v1: %0:v[0] = v_or_b32 %0:v[0][0:8], %0:v[3][0:8]
//~gfx[67]! v1: %0:v[3] = v_lshrrev_b32 24, %0:v[3][0:8]
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
//~gfx[67]! v3b: %0:v[0][0:24] = v_alignbyte_b32 %0:v[2][0:8], %0:v[0][16:32], 2
//~gfx[67]! v3b: %0:v[0][8:32] = v_lshlrev_b32 8, %0:v[0][0:24]
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[3][0:8], %0:v[0][8:32], 1
bld.pseudo(aco_opcode::p_unit_test, Operand(12u));
bld.pseudo(aco_opcode::p_create_vector,
Definition(v0_lo, v1),
@ -201,11 +186,11 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
Operand(v2_lo, v1b), Operand(v3_lo, v1b));
//~gfx[67]! p_unit_test 13
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xff, %0:v[0][0:8]
//~gfx[67]! v1: %0:v[0] = v_mul_u32_u24 0x101, %0:v[0][0:8]
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:8]
//~gfx[67]! v1: %0:v[0] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[0][0:8]
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffffff, %0:v[0][0:8]
//~gfx[67]! v1b: %0:v[0][0:8] = v_and_b32 0xff, %0:v[0][0:8]
//~gfx[67]! v2b: %0:v[0][0:16] = v_mul_u32_u24 0x101, %0:v[0][0:8]
//~gfx[67]! v2b: %0:v[0][0:16] = v_and_b32 0xffff, %0:v[0][0:16]
//~gfx[67]! v3b: %0:v[0][0:24] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[0][0:8]
//~gfx[67]! v3b: %0:v[0][0:24] = v_and_b32 0xffffff, %0:v[0][0:24]
//~gfx[67]! s1: %0:m0 = s_mov_b32 0x1000001
//~gfx[67]! v1: %0:v[0] = v_mul_lo_u32 %0:m0, %0:v[0][0:8]
bld.pseudo(aco_opcode::p_unit_test, Operand(13u));