aco/gfx11: use v_swap_b16

I tested that v_swap_b16 can be encoded as VOP3, because the ISA doc doesn't list
it as a possible VOP3 opcode. VOP3 is necessary to access v128+.

Foz-DB Navi31:
Totals from 32 (0.04% of 79395) affected shaders:
Instrs: 201865 -> 195168 (-3.32%)
CodeSize: 1082220 -> 1031228 (-4.71%); split: -4.71%, +0.00%
Latency: 2258198 -> 2238586 (-0.87%)
InvThroughput: 796731 -> 788934 (-0.98%)
Copies: 34514 -> 29220 (-15.34%)
VALU: 122457 -> 117163 (-4.32%)

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29143>
This commit is contained in:
Georg Lehmann 2024-05-10 21:44:08 +02:00 committed by Marge Bot
parent 5803a40e2f
commit 80b8bbf0c5
5 changed files with 23 additions and 48 deletions

View File

@ -880,21 +880,21 @@ emit_vop3_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
*/
if (instr->definitions.size() == 2 && instr->isVOPC())
assert(ctx.gfx_level <= GFX9 && instr->definitions[1].physReg() == exec);
else if (instr->definitions.size() == 2)
else if (instr->definitions.size() == 2 && instr->opcode != aco_opcode::v_swap_b16)
encoding |= reg(ctx, instr->definitions[1]) << 8;
encoding |= reg(ctx, instr->definitions[0], 8);
out.push_back(encoding);
encoding = 0;
if (instr->opcode == aco_opcode::v_interp_mov_f32) {
encoding = 0x3 & instr->operands[0].constantValue();
} else if (instr->opcode == aco_opcode::v_writelane_b32_e64) {
encoding |= reg(ctx, instr->operands[0]) << 0;
encoding |= reg(ctx, instr->operands[1]) << 9;
/* Encoding src2 works fine with hardware but breaks some disassemblers. */
} else {
for (unsigned i = 0; i < instr->operands.size(); i++)
encoding |= reg(ctx, instr->operands[i]) << (i * 9);
}
unsigned num_ops = instr->operands.size();
/* Encoding implicit sources works fine with hardware but breaks some disassemblers. */
if (instr->opcode == aco_opcode::v_writelane_b32_e64)
num_ops = 2;
else if (instr->opcode == aco_opcode::v_swap_b16)
num_ops = 1;
for (unsigned i = 0; i < num_ops; i++)
encoding |= reg(ctx, instr->operands[i]) << (i * 9);
encoding |= vop3.omod << 27;
for (unsigned i = 0; i < 3; i++)
encoding |= vop3.neg[i] << (29 + i);

View File

@ -669,6 +669,7 @@ get_gfx11_true16_mask(aco_opcode op)
case aco_opcode::v_sin_f16:
case aco_opcode::v_sqrt_f16:
case aco_opcode::v_trunc_f16:
case aco_opcode::v_swap_b16:
case aco_opcode::v_mov_b16: return 0x1 | 0x8;
case aco_opcode::v_add_f16:
case aco_opcode::v_fmaak_f16:

View File

@ -1317,19 +1317,6 @@ copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
}
}
/* Emit a single 16-bit add or subtract in VOP3 form: dst = src0 (+|-) src1.
 *
 * bld:  builder used to create the instruction.
 * dst:  destination definition.
 * src0: first source operand.
 * src1: second source operand.
 * sub:  when true v_sub_u16_e64 is emitted, otherwise v_add_u16_e64.
 *
 * After the instruction is created, its opsel bits are adjusted according to the byte offset of
 * each physical register involved.
 */
void
addsub_subdword_gfx11(Builder& bld, Definition dst, Operand src0, Operand src1, bool sub)
{
/* Pick the opcode from `sub` and build the VOP3 instruction. */
Instruction* instr =
bld.vop3(sub ? aco_opcode::v_sub_u16_e64 : aco_opcode::v_add_u16_e64, dst, src0, src1).instr;
/* One opsel bit per register located at byte offset 2: 0x1 for src0, 0x2 for src1, 0x8 for dst.
 * NOTE(review): presumably these bits select the upper 16-bit half of the register -- confirm
 * against the VOP3 opsel encoding.
 */
if (src0.physReg().byte() == 2)
instr->valu().opsel |= 0x1;
if (src1.physReg().byte() == 2)
instr->valu().opsel |= 0x2;
if (dst.physReg().byte() == 2)
instr->valu().opsel |= 0x8;
}
bool
do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc,
PhysReg scratch_sgpr)
@ -1390,9 +1377,9 @@ swap_subdword_gfx11(Builder& bld, Definition def, Operand op)
if (def.bytes() == 2) {
Operand def_as_op = Operand(def.physReg(), def.regClass());
Definition op_as_def = Definition(op.physReg(), op.regClass());
addsub_subdword_gfx11(bld, def, def_as_op, op, false);
addsub_subdword_gfx11(bld, op_as_def, def_as_op, op, true);
addsub_subdword_gfx11(bld, def, def_as_op, op, true);
Instruction* instr = bld.vop1(aco_opcode::v_swap_b16, def, op_as_def, op, def_as_op);
instr->valu().opsel[0] = op.physReg().byte();
instr->valu().opsel[3] = def.physReg().byte();
} else {
PhysReg op_half = op.physReg();
op_half.reg_b &= ~1;

View File

@ -974,6 +974,7 @@ VOP1 = {
("v_cvt_i32_i16", False, False, dst(1), src(1), op(gfx11=0x6a)),
("v_cvt_u32_u16", False, False, dst(1), src(1), op(gfx11=0x6b)),
("v_mov_b16", True, False, dst(1), src(1), op(gfx11=0x1c)),
("v_swap_b16", False, False, dst(1, 1), src(1, 1), op(gfx11=0x66)),
}
for (name, in_mod, out_mod, defs, ops, num, cls) in default_class(VOP1, InstrClass.Valu32):
insn(name, num, Format.VOP1, cls, in_mod, out_mod, definitions = defs, operands = ops)

View File

@ -53,9 +53,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
//~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
//~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 hi(%0:v[1][16:32]) opsel_hi
//~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 %0:v[0][0:16] opsel_hi
//~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
//~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
//~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
//~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][0:16]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b),
Definition(v1_hi, v2b), Operand(v1_lo, v1), Operand(v0_lo, v2b),
@ -130,13 +128,9 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
//~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
//~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
//~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
//~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
//~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
//~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
//~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
//~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
//~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
//~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
//~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b),
Operand(v1_lo, v3b), Operand(v0_lo, v3b));
@ -157,23 +151,15 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
//~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
//~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
//~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
//~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
//~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16]
//~gfx11! v2b: %0:v[0][16:32] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
//~gfx11! v2b: %0:v[0][16:32], v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][16:32] opsel_hi
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
//~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
//~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16]
//~gfx11! v2b: %0:v[0][16:32] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
//~gfx11! v2b: %0:v[0][16:32], v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][16:32] opsel_hi
//~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
//~gfx[89]! v1b: %0:v[0][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
//~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
//~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
//~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
//~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
//~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x7040506
//~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
//~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
//~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
//~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u));
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Definition(v1_b1, v2b),
Operand(v1_b1, v2b), Operand(v0_b1, v2b));