aco: refactor GFX6_7 subdword copy lowering
The new code uses v_alignbyte_b32, which leads to shorter code and preserves the operand's registers.

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7189>
This commit is contained in:
parent
06b41ca589
commit
40bfb08828
|
@ -1022,30 +1022,28 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool
|
||||||
if (op.physReg().byte()) {
|
if (op.physReg().byte()) {
|
||||||
assert(def.physReg().byte() == 0);
|
assert(def.physReg().byte() == 0);
|
||||||
bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand(op.physReg().byte() * 8), op);
|
bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand(op.physReg().byte() * 8), op);
|
||||||
} else if (def.physReg().byte() == 2) {
|
} else if (def.physReg().byte()) {
|
||||||
assert(op.physReg().byte() == 0);
|
assert(op.physReg().byte() == 0);
|
||||||
/* preserve the target's lower half */
|
/* preserve the target's lower half */
|
||||||
def = Definition(def.physReg().advance(-2), v1);
|
uint32_t bits = def.physReg().byte() * 8;
|
||||||
bld.vop2(aco_opcode::v_and_b32, Definition(op.physReg(), v1), Operand(0xFFFFu), op);
|
PhysReg lo_reg = PhysReg(def.physReg().reg());
|
||||||
if (def.physReg().reg() != op.physReg().reg())
|
Definition lo_half = Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte()));
|
||||||
bld.vop2(aco_opcode::v_and_b32, def, Operand(0xFFFFu), Operand(def.physReg(), v2b));
|
Definition dst = Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes()));
|
||||||
bld.vop2(aco_opcode::v_cvt_pk_u16_u32, def, Operand(def.physReg(), v2b), op);
|
|
||||||
} else if (def.physReg().byte()) {
|
|
||||||
unsigned bits = def.physReg().byte() * 8;
|
|
||||||
assert(op.physReg().byte() == 0);
|
|
||||||
def = Definition(def.physReg().advance(-def.physReg().byte()), v1);
|
|
||||||
bld.vop2(aco_opcode::v_and_b32, def, Operand((1 << bits) - 1u), Operand(def.physReg(), op.regClass()));
|
|
||||||
if (def.physReg().reg() == op.physReg().reg()) {
|
if (def.physReg().reg() == op.physReg().reg()) {
|
||||||
if (bits < 24) {
|
bld.vop2(aco_opcode::v_and_b32, lo_half, Operand((1 << bits) - 1u), Operand(lo_reg, lo_half.regClass()));
|
||||||
bld.vop2(aco_opcode::v_mul_u32_u24, def, Operand((1 << bits) + 1u), op);
|
if (def.physReg().byte() == 1) {
|
||||||
} else {
|
bld.vop2(aco_opcode::v_mul_u32_u24, dst, Operand((1 << bits) + 1u), op);
|
||||||
|
} else if (def.physReg().byte() == 2) {
|
||||||
|
bld.vop2(aco_opcode::v_cvt_pk_u16_u32, dst, Operand(lo_reg, v2b), op);
|
||||||
|
} else if (def.physReg().byte() == 3) {
|
||||||
bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand((1 << bits) + 1u));
|
bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand((1 << bits) + 1u));
|
||||||
bld.vop3(aco_opcode::v_mul_lo_u32, def, Operand(scratch_sgpr, s1), op);
|
bld.vop3(aco_opcode::v_mul_lo_u32, dst, Operand(scratch_sgpr, s1), op);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
bld.vop2(aco_opcode::v_lshlrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
|
lo_half.setFixed(lo_half.physReg().advance(4 - def.physReg().byte()));
|
||||||
bld.vop2(aco_opcode::v_or_b32, def, Operand(def.physReg(), op.regClass()), op);
|
bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand(32 - bits), Operand(lo_reg, lo_half.regClass()));
|
||||||
bld.vop2(aco_opcode::v_lshrrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
|
bld.vop3(aco_opcode::v_alignbyte_b32, dst, op, Operand(lo_half.physReg(), lo_half.regClass()), Operand(4 - def.physReg().byte()));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
bld.vop1(aco_opcode::v_mov_b32, def, op);
|
bld.vop1(aco_opcode::v_mov_b32, def, op);
|
||||||
|
|
|
@ -57,9 +57,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
||||||
Operand(v1_lo, v2b), Operand(v0_lo, v2b));
|
Operand(v1_lo, v2b), Operand(v0_lo, v2b));
|
||||||
|
|
||||||
//~gfx[67]! p_unit_test 1
|
//~gfx[67]! p_unit_test 1
|
||||||
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16]
|
//~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16]
|
||||||
//~gfx[67]! v1: %0:v[1] = v_and_b32 0xffff, %0:v[1][0:16]
|
//~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[0][0:16], %0:v[1][16:32], 2
|
||||||
//~gfx[67]! v1: %0:v[1] = v_cvt_pk_u16_u32 %0:v[1][0:16], %0:v[0][0:16]
|
|
||||||
//~gfx[67]! v1: %0:v[0] = v_mov_b32 %0:v[1]
|
//~gfx[67]! v1: %0:v[0] = v_mov_b32 %0:v[1]
|
||||||
bld.pseudo(aco_opcode::p_unit_test, Operand(1u));
|
bld.pseudo(aco_opcode::p_unit_test, Operand(1u));
|
||||||
bld.pseudo(aco_opcode::p_create_vector,
|
bld.pseudo(aco_opcode::p_create_vector,
|
||||||
|
@ -67,9 +66,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
||||||
Operand(v1_lo, v2b), Operand(v0_lo, v2b));
|
Operand(v1_lo, v2b), Operand(v0_lo, v2b));
|
||||||
|
|
||||||
//~gfx[67]! p_unit_test 2
|
//~gfx[67]! p_unit_test 2
|
||||||
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16]
|
//~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16]
|
||||||
//~gfx[67]! v1: %0:v[1] = v_and_b32 0xffff, %0:v[1][0:16]
|
//~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[0][0:16], %0:v[1][16:32], 2
|
||||||
//~gfx[67]! v1: %0:v[1] = v_cvt_pk_u16_u32 %0:v[1][0:16], %0:v[0][0:16]
|
|
||||||
//~gfx[67]! v1: %0:v[0] = v_mov_b32 %0:v[1]
|
//~gfx[67]! v1: %0:v[0] = v_mov_b32 %0:v[1]
|
||||||
//~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[2][0:16]
|
//~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[2][0:16]
|
||||||
bld.pseudo(aco_opcode::p_unit_test, Operand(2u));
|
bld.pseudo(aco_opcode::p_unit_test, Operand(2u));
|
||||||
|
@ -78,9 +76,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
||||||
Operand(v0_lo, v2b), Operand(v2_lo, v2b));
|
Operand(v0_lo, v2b), Operand(v2_lo, v2b));
|
||||||
|
|
||||||
//~gfx[67]! p_unit_test 3
|
//~gfx[67]! p_unit_test 3
|
||||||
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16]
|
//~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16]
|
||||||
//~gfx[67]! v1: %0:v[1] = v_and_b32 0xffff, %0:v[1][0:16]
|
//~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[0][0:16], %0:v[1][16:32], 2
|
||||||
//~gfx[67]! v1: %0:v[1] = v_cvt_pk_u16_u32 %0:v[1][0:16], %0:v[0][0:16]
|
|
||||||
//~gfx[67]! v1: %0:v[0] = v_mov_b32 %0:v[1]
|
//~gfx[67]! v1: %0:v[0] = v_mov_b32 %0:v[1]
|
||||||
//~gfx[67]! v1: %0:v[2] = v_and_b32 0xffff, %0:v[2][0:16]
|
//~gfx[67]! v1: %0:v[2] = v_and_b32 0xffff, %0:v[2][0:16]
|
||||||
//~gfx[67]! v1: %0:v[3] = v_and_b32 0xffff, %0:v[3][0:16]
|
//~gfx[67]! v1: %0:v[3] = v_and_b32 0xffff, %0:v[3][0:16]
|
||||||
|
@ -92,12 +89,10 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
||||||
Operand(v2_lo, v2b), Operand(v3_lo, v2b));
|
Operand(v2_lo, v2b), Operand(v3_lo, v2b));
|
||||||
|
|
||||||
//~gfx[67]! p_unit_test 4
|
//~gfx[67]! p_unit_test 4
|
||||||
//~gfx[67]! v1: %0:v[2] = v_and_b32 0xffff, %0:v[2][0:16]
|
//~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16]
|
||||||
//~gfx[67]! v1: %0:v[1] = v_and_b32 0xffff, %0:v[1][0:16]
|
//~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[2][0:16], %0:v[1][16:32], 2
|
||||||
//~gfx[67]! v1: %0:v[1] = v_cvt_pk_u16_u32 %0:v[1][0:16], %0:v[2][0:16]
|
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
|
||||||
//~gfx[67]! v1: %0:v[3] = v_and_b32 0xffff, %0:v[3][0:16]
|
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[3][0:16], %0:v[0][16:32], 2
|
||||||
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16]
|
|
||||||
//~gfx[67]! v1: %0:v[0] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[3][0:16]
|
|
||||||
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
|
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
|
||||||
//~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
|
//~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
|
||||||
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
|
//~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
|
||||||
|
@ -157,10 +152,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
||||||
Operand(v1_lo, v1b), Operand(v0_lo, v1b));
|
Operand(v1_lo, v1b), Operand(v0_lo, v1b));
|
||||||
|
|
||||||
//~gfx[67]! p_unit_test 10
|
//~gfx[67]! p_unit_test 10
|
||||||
//~gfx[67]! v1: %0:v[1] = v_and_b32 0xff, %0:v[1][0:8]
|
//~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
|
||||||
//~gfx[67]! v1: %0:v[0] = v_lshlrev_b32 8, %0:v[0][0:8]
|
//~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
|
||||||
//~gfx[67]! v1: %0:v[1] = v_or_b32 %0:v[1][0:8], %0:v[0][0:8]
|
|
||||||
//~gfx[67]! v1: %0:v[0] = v_lshrrev_b32 8, %0:v[0][0:8]
|
|
||||||
//~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
|
//~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
|
||||||
bld.pseudo(aco_opcode::p_unit_test, Operand(10u));
|
bld.pseudo(aco_opcode::p_unit_test, Operand(10u));
|
||||||
bld.pseudo(aco_opcode::p_create_vector,
|
bld.pseudo(aco_opcode::p_create_vector,
|
||||||
|
@ -168,32 +161,24 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
||||||
Operand(v1_lo, v1b), Operand(v0_lo, v1b));
|
Operand(v1_lo, v1b), Operand(v0_lo, v1b));
|
||||||
|
|
||||||
//~gfx[67]! p_unit_test 11
|
//~gfx[67]! p_unit_test 11
|
||||||
//~gfx[67]! v1: %0:v[1] = v_and_b32 0xff, %0:v[1][0:8]
|
//~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
|
||||||
//~gfx[67]! v1: %0:v[0] = v_lshlrev_b32 8, %0:v[0][0:8]
|
//~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
|
||||||
//~gfx[67]! v1: %0:v[1] = v_or_b32 %0:v[1][0:8], %0:v[0][0:8]
|
|
||||||
//~gfx[67]! v1: %0:v[0] = v_lshrrev_b32 8, %0:v[0][0:8]
|
|
||||||
//~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
|
//~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
|
||||||
//~gfx[67]! v1: %0:v[2] = v_and_b32 0xffff, %0:v[2][0:8]
|
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
|
||||||
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16]
|
//~gfx[67]! v3b: %0:v[0][0:24] = v_alignbyte_b32 %0:v[2][0:8], %0:v[0][16:32], 2
|
||||||
//~gfx[67]! v1: %0:v[0] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[2][0:8]
|
|
||||||
bld.pseudo(aco_opcode::p_unit_test, Operand(11u));
|
bld.pseudo(aco_opcode::p_unit_test, Operand(11u));
|
||||||
bld.pseudo(aco_opcode::p_create_vector,
|
bld.pseudo(aco_opcode::p_create_vector,
|
||||||
Definition(v0_lo, v3b), Operand(v1_lo, v1b),
|
Definition(v0_lo, v3b), Operand(v1_lo, v1b),
|
||||||
Operand(v0_lo, v1b), Operand(v2_lo, v1b));
|
Operand(v0_lo, v1b), Operand(v2_lo, v1b));
|
||||||
|
|
||||||
//~gfx[67]! p_unit_test 12
|
//~gfx[67]! p_unit_test 12
|
||||||
//~gfx[67]! v1: %0:v[1] = v_and_b32 0xff, %0:v[1][0:8]
|
//~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
|
||||||
//~gfx[67]! v1: %0:v[0] = v_lshlrev_b32 8, %0:v[0][0:8]
|
//~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
|
||||||
//~gfx[67]! v1: %0:v[1] = v_or_b32 %0:v[1][0:8], %0:v[0][0:8]
|
|
||||||
//~gfx[67]! v1: %0:v[0] = v_lshrrev_b32 8, %0:v[0][0:8]
|
|
||||||
//~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
|
//~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
|
||||||
//~gfx[67]! v1: %0:v[2] = v_and_b32 0xffff, %0:v[2][0:8]
|
//~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
|
||||||
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:16]
|
//~gfx[67]! v3b: %0:v[0][0:24] = v_alignbyte_b32 %0:v[2][0:8], %0:v[0][16:32], 2
|
||||||
//~gfx[67]! v1: %0:v[0] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[2][0:8]
|
//~gfx[67]! v3b: %0:v[0][8:32] = v_lshlrev_b32 8, %0:v[0][0:24]
|
||||||
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffffff, %0:v[0][0:8]
|
//~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[3][0:8], %0:v[0][8:32], 1
|
||||||
//~gfx[67]! v1: %0:v[3] = v_lshlrev_b32 24, %0:v[3][0:8]
|
|
||||||
//~gfx[67]! v1: %0:v[0] = v_or_b32 %0:v[0][0:8], %0:v[3][0:8]
|
|
||||||
//~gfx[67]! v1: %0:v[3] = v_lshrrev_b32 24, %0:v[3][0:8]
|
|
||||||
bld.pseudo(aco_opcode::p_unit_test, Operand(12u));
|
bld.pseudo(aco_opcode::p_unit_test, Operand(12u));
|
||||||
bld.pseudo(aco_opcode::p_create_vector,
|
bld.pseudo(aco_opcode::p_create_vector,
|
||||||
Definition(v0_lo, v1),
|
Definition(v0_lo, v1),
|
||||||
|
@ -201,11 +186,11 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
||||||
Operand(v2_lo, v1b), Operand(v3_lo, v1b));
|
Operand(v2_lo, v1b), Operand(v3_lo, v1b));
|
||||||
|
|
||||||
//~gfx[67]! p_unit_test 13
|
//~gfx[67]! p_unit_test 13
|
||||||
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xff, %0:v[0][0:8]
|
//~gfx[67]! v1b: %0:v[0][0:8] = v_and_b32 0xff, %0:v[0][0:8]
|
||||||
//~gfx[67]! v1: %0:v[0] = v_mul_u32_u24 0x101, %0:v[0][0:8]
|
//~gfx[67]! v2b: %0:v[0][0:16] = v_mul_u32_u24 0x101, %0:v[0][0:8]
|
||||||
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffff, %0:v[0][0:8]
|
//~gfx[67]! v2b: %0:v[0][0:16] = v_and_b32 0xffff, %0:v[0][0:16]
|
||||||
//~gfx[67]! v1: %0:v[0] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[0][0:8]
|
//~gfx[67]! v3b: %0:v[0][0:24] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[0][0:8]
|
||||||
//~gfx[67]! v1: %0:v[0] = v_and_b32 0xffffff, %0:v[0][0:8]
|
//~gfx[67]! v3b: %0:v[0][0:24] = v_and_b32 0xffffff, %0:v[0][0:24]
|
||||||
//~gfx[67]! s1: %0:m0 = s_mov_b32 0x1000001
|
//~gfx[67]! s1: %0:m0 = s_mov_b32 0x1000001
|
||||||
//~gfx[67]! v1: %0:v[0] = v_mul_lo_u32 %0:m0, %0:v[0][0:8]
|
//~gfx[67]! v1: %0:v[0] = v_mul_lo_u32 %0:m0, %0:v[0][0:8]
|
||||||
bld.pseudo(aco_opcode::p_unit_test, Operand(13u));
|
bld.pseudo(aco_opcode::p_unit_test, Operand(13u));
|
||||||
|
|
Loading…
Reference in New Issue