mirror of https://gitlab.freedesktop.org/mesa/mesa
aco/gfx11: use v_swap_b16
I tested that v_swap_b16 can be encoded as VOP3, because the ISA doc doesn't list it as a possible VOP3 opcode. VOP3 is necessary to access v128+. Foz-DB Navi31: Totals from 32 (0.04% of 79395) affected shaders: Instrs: 201865 -> 195168 (-3.32%) CodeSize: 1082220 -> 1031228 (-4.71%); split: -4.71%, +0.00% Latency: 2258198 -> 2238586 (-0.87%) InvThroughput: 796731 -> 788934 (-0.98%) Copies: 34514 -> 29220 (-15.34%) VALU: 122457 -> 117163 (-4.32%) Reviewed-by: Daniel Schürmann <daniel@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29143>
This commit is contained in:
parent
5803a40e2f
commit
80b8bbf0c5
|
@ -880,21 +880,21 @@ emit_vop3_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction*
|
|||
*/
|
||||
if (instr->definitions.size() == 2 && instr->isVOPC())
|
||||
assert(ctx.gfx_level <= GFX9 && instr->definitions[1].physReg() == exec);
|
||||
else if (instr->definitions.size() == 2)
|
||||
else if (instr->definitions.size() == 2 && instr->opcode != aco_opcode::v_swap_b16)
|
||||
encoding |= reg(ctx, instr->definitions[1]) << 8;
|
||||
encoding |= reg(ctx, instr->definitions[0], 8);
|
||||
out.push_back(encoding);
|
||||
encoding = 0;
|
||||
if (instr->opcode == aco_opcode::v_interp_mov_f32) {
|
||||
encoding = 0x3 & instr->operands[0].constantValue();
|
||||
} else if (instr->opcode == aco_opcode::v_writelane_b32_e64) {
|
||||
encoding |= reg(ctx, instr->operands[0]) << 0;
|
||||
encoding |= reg(ctx, instr->operands[1]) << 9;
|
||||
/* Encoding src2 works fine with hardware but breaks some disassemblers. */
|
||||
} else {
|
||||
for (unsigned i = 0; i < instr->operands.size(); i++)
|
||||
encoding |= reg(ctx, instr->operands[i]) << (i * 9);
|
||||
}
|
||||
|
||||
unsigned num_ops = instr->operands.size();
|
||||
/* Encoding implicit sources works fine with hardware but breaks some disassemblers. */
|
||||
if (instr->opcode == aco_opcode::v_writelane_b32_e64)
|
||||
num_ops = 2;
|
||||
else if (instr->opcode == aco_opcode::v_swap_b16)
|
||||
num_ops = 1;
|
||||
|
||||
for (unsigned i = 0; i < num_ops; i++)
|
||||
encoding |= reg(ctx, instr->operands[i]) << (i * 9);
|
||||
encoding |= vop3.omod << 27;
|
||||
for (unsigned i = 0; i < 3; i++)
|
||||
encoding |= vop3.neg[i] << (29 + i);
|
||||
|
|
|
@ -669,6 +669,7 @@ get_gfx11_true16_mask(aco_opcode op)
|
|||
case aco_opcode::v_sin_f16:
|
||||
case aco_opcode::v_sqrt_f16:
|
||||
case aco_opcode::v_trunc_f16:
|
||||
case aco_opcode::v_swap_b16:
|
||||
case aco_opcode::v_mov_b16: return 0x1 | 0x8;
|
||||
case aco_opcode::v_add_f16:
|
||||
case aco_opcode::v_fmaak_f16:
|
||||
|
|
|
@ -1317,19 +1317,6 @@ copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
|
|||
}
|
||||
}
|
||||
|
||||
void
|
||||
addsub_subdword_gfx11(Builder& bld, Definition dst, Operand src0, Operand src1, bool sub)
|
||||
{
|
||||
Instruction* instr =
|
||||
bld.vop3(sub ? aco_opcode::v_sub_u16_e64 : aco_opcode::v_add_u16_e64, dst, src0, src1).instr;
|
||||
if (src0.physReg().byte() == 2)
|
||||
instr->valu().opsel |= 0x1;
|
||||
if (src1.physReg().byte() == 2)
|
||||
instr->valu().opsel |= 0x2;
|
||||
if (dst.physReg().byte() == 2)
|
||||
instr->valu().opsel |= 0x8;
|
||||
}
|
||||
|
||||
bool
|
||||
do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc,
|
||||
PhysReg scratch_sgpr)
|
||||
|
@ -1390,9 +1377,9 @@ swap_subdword_gfx11(Builder& bld, Definition def, Operand op)
|
|||
if (def.bytes() == 2) {
|
||||
Operand def_as_op = Operand(def.physReg(), def.regClass());
|
||||
Definition op_as_def = Definition(op.physReg(), op.regClass());
|
||||
addsub_subdword_gfx11(bld, def, def_as_op, op, false);
|
||||
addsub_subdword_gfx11(bld, op_as_def, def_as_op, op, true);
|
||||
addsub_subdword_gfx11(bld, def, def_as_op, op, true);
|
||||
Instruction* instr = bld.vop1(aco_opcode::v_swap_b16, def, op_as_def, op, def_as_op);
|
||||
instr->valu().opsel[0] = op.physReg().byte();
|
||||
instr->valu().opsel[3] = def.physReg().byte();
|
||||
} else {
|
||||
PhysReg op_half = op.physReg();
|
||||
op_half.reg_b &= ~1;
|
||||
|
|
|
@ -974,6 +974,7 @@ VOP1 = {
|
|||
("v_cvt_i32_i16", False, False, dst(1), src(1), op(gfx11=0x6a)),
|
||||
("v_cvt_u32_u16", False, False, dst(1), src(1), op(gfx11=0x6b)),
|
||||
("v_mov_b16", True, False, dst(1), src(1), op(gfx11=0x1c)),
|
||||
("v_swap_b16", False, False, dst(1, 1), src(1, 1), op(gfx11=0x66)),
|
||||
}
|
||||
for (name, in_mod, out_mod, defs, ops, num, cls) in default_class(VOP1, InstrClass.Valu32):
|
||||
insn(name, num, Format.VOP1, cls, in_mod, out_mod, definitions = defs, operands = ops)
|
||||
|
|
|
@ -53,9 +53,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
|||
//~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
|
||||
//~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 hi(%0:v[1][16:32]) opsel_hi
|
||||
//~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 %0:v[0][0:16] opsel_hi
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
|
||||
//~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
|
||||
//~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][0:16]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
|
||||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b),
|
||||
Definition(v1_hi, v2b), Operand(v1_lo, v1), Operand(v0_lo, v2b),
|
||||
|
@ -130,13 +128,9 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
|||
//~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
|
||||
//~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
|
||||
//~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
|
||||
//~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
|
||||
//~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
|
||||
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
|
||||
//~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
|
||||
//~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u));
|
||||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b),
|
||||
Operand(v1_lo, v3b), Operand(v0_lo, v3b));
|
||||
|
@ -157,23 +151,15 @@ BEGIN_TEST(to_hw_instr.swap_subdword)
|
|||
//~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
|
||||
//~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
|
||||
//~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
|
||||
//~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
|
||||
//~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16]
|
||||
//~gfx11! v2b: %0:v[0][16:32] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
|
||||
//~gfx11! v2b: %0:v[0][16:32], v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][16:32] opsel_hi
|
||||
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
|
||||
//~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
|
||||
//~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16]
|
||||
//~gfx11! v2b: %0:v[0][16:32] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
|
||||
//~gfx11! v2b: %0:v[0][16:32], v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][16:32] opsel_hi
|
||||
//~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
|
||||
//~gfx[89]! v1b: %0:v[0][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
|
||||
//~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
|
||||
//~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
|
||||
//~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
|
||||
//~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x7040506
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
|
||||
//~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
|
||||
//~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
|
||||
//~gfx11! v2b: %0:v[0][0:16], v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
|
||||
bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u));
|
||||
bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Definition(v1_b1, v2b),
|
||||
Operand(v1_b1, v2b), Operand(v0_b1, v2b));
|
||||
|
|
Loading…
Reference in New Issue