From cf083f1d02b8d46a310f4ade96992a9b2f28965f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Fri, 16 Oct 2020 13:29:28 +0200 Subject: [PATCH] aco: use do_pack() for self-intersecting operations. This improves the code for GFX8+, but is slightly worse for GFX6_7. Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_lower_to_hw_instr.cpp | 9 ++++--- src/amd/compiler/tests/test_to_hw_instr.cpp | 28 ++++++++++----------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 1f76362578d..6a60714bf9a 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1349,9 +1349,12 @@ void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi); if (other != copy_map.end() && other->second.bytes == 2) { /* check if the target register is otherwise unused */ - // TODO: also do this for self-intersecting registers - bool unused_lo = !it->second.is_used; - bool unused_hi = !other->second.is_used; + bool unused_lo = !it->second.is_used || + (it->second.is_used == 0x0101 && + other->second.op.physReg() == it->first); + bool unused_hi = !other->second.is_used || + (other->second.is_used == 0x0101 && + it->second.op.physReg() == reg_hi); if (unused_lo && unused_hi) { Operand lo = it->second.op; Operand hi = other->second.op; diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp index bd517b6da29..bca1ae22397 100644 --- a/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -57,18 +57,18 @@ BEGIN_TEST(to_hw_instr.swap_subdword) Operand(v1_lo, v2b), Operand(v0_lo, v2b)); //~gfx[67]! p_unit_test 1 - //~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16] - //~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[0][0:16], %0:v[1][16:32], 2 - //~gfx[67]! 
v1: %0:v[0] = v_mov_b32 %0:v[1] + //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16] + //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2 + //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2 bld.pseudo(aco_opcode::p_unit_test, Operand(1u)); bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v1), Operand(v1_lo, v2b), Operand(v0_lo, v2b)); //~gfx[67]! p_unit_test 2 - //~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16] - //~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[0][0:16], %0:v[1][16:32], 2 - //~gfx[67]! v1: %0:v[0] = v_mov_b32 %0:v[1] + //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16] + //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2 + //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2 //~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[2][0:16] bld.pseudo(aco_opcode::p_unit_test, Operand(2u)); bld.pseudo(aco_opcode::p_create_vector, @@ -76,9 +76,9 @@ BEGIN_TEST(to_hw_instr.swap_subdword) Operand(v0_lo, v2b), Operand(v2_lo, v2b)); //~gfx[67]! p_unit_test 3 - //~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16] - //~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[0][0:16], %0:v[1][16:32], 2 - //~gfx[67]! v1: %0:v[0] = v_mov_b32 %0:v[1] + //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16] + //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2 + //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2 //~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[2][0:16] //~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[3][0:16], %0:v[1][16:32], 2 bld.pseudo(aco_opcode::p_unit_test, Operand(3u)); @@ -228,7 +228,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) continue; //~gfx[89]>> p_unit_test 0 - //~gfx[89]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2 + //~gfx8! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2 + //~gfx9! 
v1: %0:v[0] = v_pack_b32_f16 hi(%0:v[0][16:32]), %0:v[0][0:16] bld.pseudo(aco_opcode::p_unit_test, Operand(0u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), @@ -393,13 +394,10 @@ BEGIN_TEST(to_hw_instr.subdword_constant) Definition(v0_lo, v2b), Definition(v0_hi, v2b), Operand((uint16_t)0x4205), Operand(v1_hi, v2b)); - //TODO: optimize this with GFX10. do_pack_2x16() isn't used in this case //! p_unit_test 2 - //~gfx9! v2b: %_:v[0][16:32] = v_mov_b32 %_:v[0][0:16] dst_preserve - //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0] + //~gfx9! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16] //~gfx9! v1: %_:v[0] = v_or_b32 0x4205, %_:v[0] - //~gfx10! v2b: %_:v[0][16:32] = v_mov_b32 %_:v[0][0:16] dst_preserve - //~gfx10! v2b: %_:v[0][0:16] = v_pack_b32_f16 0x4205, hi(%_:v[0][16:32]) + //~gfx10! v1: %0:v[0] = v_pack_b32_f16 0x4205, %0:v[0][0:16] bld.pseudo(aco_opcode::p_unit_test, Operand(2u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),