From e68e6c75ca14c768b50d9ad85c37b2443699dcbb Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 13 May 2022 15:11:48 +0100 Subject: [PATCH] aco: use v_perm_b32 to copy 0xff00/0x00ff/0xff/0x00 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rhys Perry Reviewed-by: Timur Kristóf Part-of: --- src/amd/compiler/aco_lower_to_hw_instr.cpp | 10 +++++ src/amd/compiler/tests/test_to_hw_instr.cpp | 44 +++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 2dc32f65ec1..40798da9597 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1085,6 +1085,10 @@ copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op) assert(dst.regClass() == v1b || dst.regClass() == v2b); bool use_sdwa = ctx->program->gfx_level >= GFX9 && ctx->program->gfx_level < GFX11; + /* We need the v_perm_b32 (VOP3) to be able to take literals, and that's a GFX10+ feature. */ + bool can_use_perm = ctx->program->gfx_level >= GFX10 && + (op.constantEquals(0) || op.constantEquals(0xff) || + op.constantEquals(0xffff) || op.constantEquals(0xff00)); if (dst.regClass() == v1b && use_sdwa) { uint8_t val = op.constantValue(); Operand op32 = Operand::c32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u)); @@ -1118,6 +1122,12 @@ copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op) Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, op, def_hi); instr->vop3().opsel = 2; } + } else if (can_use_perm) { + uint8_t swiz[] = {4, 5, 6, 7}; + swiz[dst.physReg().byte()] = op.constantValue() & 0xff ? bperm_255 : bperm_0; + if (dst.bytes() == 2) + swiz[dst.physReg().byte() + 1] = op.constantValue() >> 8 ? bperm_255 : bperm_0; + create_bperm(bld, swiz, dst, Operand::zero()); } else { uint32_t offset = dst.physReg().byte() * 8u; uint32_t mask = ((1u << (dst.bytes() * 8)) - 1) << offset; diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp index 22587d4edea..67c6bf704bc 100644 --- a/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -464,6 +464,50 @@ BEGIN_TEST(to_hw_instr.subdword_constant) bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v1b), Operand::zero(), Operand::zero(1)); + bld.reset(program->create_and_insert_block()); + program->blocks[0].linear_succs.push_back(1); + program->blocks[1].linear_preds.push_back(0); + + /* Prevent usage of v_pack_b32_f16, so we use v_perm_b32 instead. */ + program->blocks[1].fp_mode.denorm16_64 = fp_denorm_flush; + + //>> p_unit_test 13 + //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0] + //~gfx9! v1: %_:v[0] = v_or_b32 0xff, %_:v[0] + //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 0, %_:v[0], 0x7060c0d + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), + Operand::c16(0x00ff)); + + //! p_unit_test 14 + //~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0] + //~gfx9! v1: %_:v[0] = v_or_b32 0xff000000, %_:v[0] + //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 0, %_:v[0], 0xd0c0504 + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), + Operand::c16(0xff00)); + + //! p_unit_test 15 + //~gfx(9|10)! v2b: %_:v[0][0:16] = v_mov_b32 0 dst_sel:uword0 dst_preserve src0_sel:dword + //~gfx11! v1: %_:v[0] = v_perm_b32 0, %_:v[0], 0x7060c0c + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), + Operand::zero(2)); + + //! p_unit_test 16 + //~gfx(9|10)! v1b: %_:v[0][0:8] = v_mov_b32 -1 dst_sel:ubyte0 dst_preserve src0_sel:dword + //~gfx11! v1: %_:v[0] = v_perm_b32 0, %_:v[0], 0x706050d + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16u)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), + Operand::c8(0xff)); + + //! p_unit_test 17 + //~gfx(9|10)! v1b: %_:v[0][0:8] = v_mov_b32 0 dst_sel:ubyte0 dst_preserve src0_sel:dword + //~gfx11! v1: %_:v[0] = v_perm_b32 0, %_:v[0], 0x706050c + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17u)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), + Operand::zero(1)); + //! s_endpgm finish_to_hw_instr_test();