aco/tests: add v_fma_mix tests

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14769>
This commit is contained in:
Rhys Perry 2022-01-27 14:19:21 +00:00 committed by Marge Bot
parent 1092f37805
commit 177b54ebe9
2 changed files with 489 additions and 4 deletions

View File

@ -344,14 +344,14 @@ Temp fsat(Temp src, Builder b)
/* Emits a p_extract pseudo-instruction that selects 16-bit field `idx` of `src`.
 *
 * The definition uses the register class of `src` rather than a fixed v1, so the
 * helper also works on sub-dword (e.g. v2b) temporaries. The final operand is
 * passed as false (presumably the sign-extend flag, i.e. an unsigned extract --
 * confirm against the p_extract definition).
 *
 * Returns the temporary produced by the builder.
 */
Temp ext_ushort(Temp src, unsigned idx, Builder b)
{
   return b.pseudo(aco_opcode::p_extract, b.def(src.regClass()), src, Operand::c32(idx),
                   Operand::c32(16u), Operand::c32(false));
}
/* Emits a p_extract pseudo-instruction that selects 8-bit field `idx` of `src`.
 *
 * The definition uses the register class of `src` rather than a fixed v1, so the
 * helper also works on sub-dword (e.g. v2b) temporaries. The final operand is
 * passed as false (presumably the sign-extend flag, i.e. an unsigned extract --
 * confirm against the p_extract definition).
 *
 * Returns the temporary produced by the builder.
 */
Temp ext_ubyte(Temp src, unsigned idx, Builder b)
{
   return b.pseudo(aco_opcode::p_extract, b.def(src.regClass()), src, Operand::c32(idx),
                   Operand::c32(8u), Operand::c32(false));
}
VkDevice get_vk_device(enum chip_class chip_class)

View File

@ -1158,3 +1158,488 @@ BEGIN_TEST(optimize.casts)
finish_opt_test();
END_TEST
/* Input conversion, basic cases: an f16->f32 conversion feeding fmul/fadd/fma is
 * expected to be folded into a single v_fma_mix_f32 that reads the half operand as
 * lo(%a16).
 *
 * Runs once per chip_class value from GFX9 through GFX10; an iteration is skipped
 * when setup_cs() returns false.
 *
 * NOTE: the //>> and //! lines are expected-output patterns, not ordinary comments
 * (presumably matched by the test framework against the optimized program), so they
 * must stay in step with the writeout() calls they precede.
 */
BEGIN_TEST(optimize.mad_mix.input_conv.basic)
for (unsigned i = GFX9; i <= GFX10; i++) {
//>> v1: %a, v2b: %a16 = p_startpgm
if (!setup_cs("v1 v2b", (chip_class)i))
continue;
/* inputs follow the order of the "v1 v2b" setup string */
Temp a = inputs[0];
Temp a16 = inputs[1];
/* fmul: expected as a * cvt(a16) + (-0) */
//! v1: %res0 = v_fma_mix_f32 %a, lo(%a16), -0
//! p_unit_test 0, %res0
writeout(0, fmul(a, f2f32(a16)));
/* fadd: expected as 1.0 * a + cvt(a16) */
//! v1: %res1 = v_fma_mix_f32 1.0, %a, lo(%a16)
//! p_unit_test 1, %res1
writeout(1, fadd(a, f2f32(a16)));
/* fadd with the converted operand first: operand order is kept */
//! v1: %res2 = v_fma_mix_f32 1.0, lo(%a16), %a
//! p_unit_test 2, %res2
writeout(2, fadd(f2f32(a16), a));
/* fma with the conversion on the addend */
//! v1: %res3 = v_fma_mix_f32 %a, %a, lo(%a16)
//! p_unit_test 3, %res3
writeout(3, fma(a, a, f2f32(a16)));
/* NOTE(review): identical to case 3 -- confirm whether a different operand
 * placement was intended here.
 */
//! v1: %res4 = v_fma_mix_f32 %a, %a, lo(%a16)
//! p_unit_test 4, %res4
writeout(4, fma(a, a, f2f32(a16)));
finish_opt_test();
}
END_TEST
/* Input conversion, precision: checks how the (precise) flag on either the
 * arithmetic instruction or the conversion affects folding.
 *
 * Per the expected output below:
 *  - a precise 32-bit fma with a converted addend is only folded on gfx10;
 *  - 16-bit arithmetic fed by an f32->f16 conversion is never folded, whether the
 *    precise flag sits on the arithmetic or on the conversion.
 *
 * Runs once per chip_class value from GFX9 through GFX10; an iteration is skipped
 * when setup_cs() returns false. The //>>, //! and //~gfxN! lines are
 * expected-output patterns (the ~gfxN prefix presumably restricts a pattern to
 * that variant) and must not be edited as if they were plain comments.
 */
BEGIN_TEST(optimize.mad_mix.input_conv.precision)
for (unsigned i = GFX9; i <= GFX10; i++) {
//>> v1: %a, v2b: %a16 = p_startpgm
if (!setup_cs("v1 v2b", (chip_class)i))
continue;
/* inputs follow the order of the "v1 v2b" setup string */
Temp a = inputs[0];
Temp a16 = inputs[1];
/* precise arithmetic */
//~gfx9! v1: %res0_cvt = v_cvt_f32_f16 %a16
//~gfx9! v1: (precise)%res0 = v_fma_f32 %a, %a, %res0_cvt
//~gfx10! v1: (precise)%res0 = v_fma_mix_f32 %a, %a, lo(%a16)
//! p_unit_test 0, %res0
writeout(0, fma(a, a, f2f32(a16), bld.precise()));
//! v2b: %res1_cvt = v_cvt_f16_f32 %a
//! v2b: (precise)%res1 = v_mul_f16 %a16, %res1_cvt
//! p_unit_test 1, %res1
writeout(1, fmul(a16, f2f16(a), bld.precise()));
//! v2b: %res2_cvt = v_cvt_f16_f32 %a
//! v2b: (precise)%res2 = v_add_f16 %a16, %res2_cvt
//! p_unit_test 2, %res2
writeout(2, fadd(a16, f2f16(a), bld.precise()));
//! v2b: %res3_cvt = v_cvt_f16_f32 %a
//! v2b: (precise)%res3 = v_fma_f16 %a16, %a16, %res3_cvt
//! p_unit_test 3, %res3
writeout(3, fma(a16, a16, f2f16(a), bld.precise()));
/* precise conversions */
//! v2b: (precise)%res4_cvt = v_cvt_f16_f32 %a
//! v2b: %res4 = v_mul_f16 %a16, %res4_cvt
//! p_unit_test 4, %res4
writeout(4, fmul(a16, f2f16(a, bld.precise())));
//! v2b: (precise)%res5_cvt = v_cvt_f16_f32 %a
//! v2b: %res5 = v_add_f16 %a16, %res5_cvt
//! p_unit_test 5, %res5
writeout(5, fadd(a16, f2f16(a, bld.precise())));
//! v2b: (precise)%res6_cvt = v_cvt_f16_f32 %a
//! v2b: %res6 = v_fma_f16 %a16, %a16, %res6_cvt
//! p_unit_test 6, %res6
writeout(6, fma(a16, a16, f2f16(a, bld.precise())));
finish_opt_test();
}
END_TEST
/* Input conversion, modifiers: checks that fneg/fabs applied before or after the
 * f16->f32 conversion end up as neg/abs modifiers on the v_fma_mix_f32 operand, and
 * how 16-bit/8-bit extracts (the "sdwa" cases) interact with the folding.
 *
 * Runs once per chip_class value from GFX9 through GFX10; an iteration is skipped
 * when setup_cs() returns false. The //>> and //! lines are expected-output
 * patterns and must stay paired with the writeout() call that follows them.
 */
BEGIN_TEST(optimize.mad_mix.input_conv.modifiers)
for (unsigned i = GFX9; i <= GFX10; i++) {
//>> v1: %a, v2b: %a16 = p_startpgm
if (!setup_cs("v1 v2b", (chip_class)i))
continue;
/* inputs follow the order of the "v1 v2b" setup string */
Temp a = inputs[0];
Temp a16 = inputs[1];
/* check whether modifiers are preserved when converting to VOP3P */
//! v1: %res0 = v_fma_mix_f32 -%a, lo(%a16), -0
//! p_unit_test 0, %res0
writeout(0, fmul(fneg(a), f2f32(a16)));
//! v1: %res1 = v_fma_mix_f32 |%a|, lo(%a16), -0
//! p_unit_test 1, %res1
writeout(1, fmul(fabs(a), f2f32(a16)));
/* fneg modifiers */
//! v1: %res2 = v_fma_mix_f32 %a, -lo(%a16), -0
//! p_unit_test 2, %res2
writeout(2, fmul(a, fneg(f2f32(a16))));
//! v1: %res3 = v_fma_mix_f32 %a, -lo(%a16), -0
//! p_unit_test 3, %res3
writeout(3, fmul(a, f2f32(fneg(a16))));
/* fabs modifiers */
//! v1: %res4 = v_fma_mix_f32 %a, |lo(%a16)|, -0
//! p_unit_test 4, %res4
writeout(4, fmul(a, fabs(f2f32(a16))));
//! v1: %res5 = v_fma_mix_f32 %a, |lo(%a16)|, -0
//! p_unit_test 5, %res5
writeout(5, fmul(a, f2f32(fabs(a16))));
/* both fabs and fneg modifiers */
//! v1: %res6 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
//! p_unit_test 6, %res6
writeout(6, fmul(a, fneg(f2f32(fabs(a16)))));
/* an outer fabs absorbs the inner one */
//! v1: %res7 = v_fma_mix_f32 %a, |lo(%a16)|, -0
//! p_unit_test 7, %res7
writeout(7, fmul(a, fabs(f2f32(fabs(a16)))));
//! v1: %res8 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
//! p_unit_test 8, %res8
writeout(8, fmul(a, fneg(fabs(f2f32(fabs(a16))))));
//! v1: %res9 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
//! p_unit_test 9, %res9
writeout(9, fmul(a, f2f32(fneg(fabs(a16)))));
/* two negations cancel, leaving only the abs modifier */
//! v1: %res10 = v_fma_mix_f32 %a, |lo(%a16)|, -0
//! p_unit_test 10, %res10
writeout(10, fmul(a, fneg(f2f32(fneg(fabs(a16))))));
/* an outer fabs discards the inner negation */
//! v1: %res11 = v_fma_mix_f32 %a, |lo(%a16)|, -0
//! p_unit_test 11, %res11
writeout(11, fmul(a, fabs(f2f32(fneg(fabs(a16))))));
//! v1: %res12 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
//! p_unit_test 12, %res12
writeout(12, fmul(a, fneg(fabs(f2f32(fneg(fabs(a16)))))));
/* sdwa */
/* a 16-bit extract before the conversion becomes a lo()/hi() operand selection */
//! v1: %res13 = v_fma_mix_f32 lo(%a), %a, -0
//! p_unit_test 13, %res13
writeout(13, fmul(f2f32(ext_ushort(a, 0)), a));
//! v1: %res14 = v_fma_mix_f32 hi(%a), %a, -0
//! p_unit_test 14, %res14
writeout(14, fmul(f2f32(ext_ushort(a, 1)), a));
/* an extract applied after the conversion, or an 8-bit extract, is expected to
 * stay on separate v_cvt/v_mul instructions with sdwa selectors instead of
 * becoming v_fma_mix_f32
 */
//! v1: %res15_cvt = v_cvt_f32_f16 %a dst_sel:uword0 src0_sel:dword
//! v1: %res15 = v_mul_f32 %res15_cvt, %a
//! p_unit_test 15, %res15
writeout(15, fmul(ext_ushort(f2f32(a), 0), a));
//! v1: %res16_cvt = v_cvt_f32_f16 %a
//! v1: %res16 = v_mul_f32 %res16_cvt, %a dst_sel:dword src0_sel:uword1 src1_sel:dword
//! p_unit_test 16, %res16
writeout(16, fmul(ext_ushort(f2f32(a), 1), a));
//! v1: %res17_cvt = v_cvt_f32_f16 %a dst_sel:dword src0_sel:ubyte2
//! v1: %res17 = v_mul_f32 %res17_cvt, %a
//! p_unit_test 17, %res17
writeout(17, fmul(f2f32(ext_ubyte(a, 2)), a));
finish_opt_test();
}
END_TEST
/* Output conversion, basic cases: an f32->f16 conversion of an fmul/fadd/fma result
 * is expected to be folded into a single v_fma_mixlo_f16, including when one of the
 * inputs is itself an f16->f32 conversion.
 *
 * Runs once per chip_class value from GFX9 through GFX10; an iteration is skipped
 * when setup_cs() returns false. The //>> and //! lines are expected-output
 * patterns and must stay paired with the writeout() call that follows them.
 */
BEGIN_TEST(optimize.mad_mix.output_conv.basic)
for (unsigned i = GFX9; i <= GFX10; i++) {
//>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm
if (!setup_cs("v1 v1 v1 v2b v2b", (chip_class)i))
continue;
/* inputs follow the order of the "v1 v1 v1 v2b v2b" setup string */
Temp a = inputs[0];
Temp b = inputs[1];
Temp c = inputs[2];
Temp a16 = inputs[3];
Temp b16 = inputs[4];
/* pure 32-bit inputs */
//! v2b: %res0 = v_fma_mixlo_f16 %a, %b, -0
//! p_unit_test 0, %res0
writeout(0, f2f16(fmul(a, b)));
//! v2b: %res1 = v_fma_mixlo_f16 1.0, %a, %b
//! p_unit_test 1, %res1
writeout(1, f2f16(fadd(a, b)));
//! v2b: %res2 = v_fma_mixlo_f16 %a, %b, %c
//! p_unit_test 2, %res2
writeout(2, f2f16(fma(a, b, c)));
/* input and output conversions folded into the same instruction */
//! v2b: %res3 = v_fma_mixlo_f16 lo(%a16), %b, -0
//! p_unit_test 3, %res3
writeout(3, f2f16(fmul(f2f32(a16), b)));
//! v2b: %res4 = v_fma_mixlo_f16 1.0, %a, lo(%b16)
//! p_unit_test 4, %res4
writeout(4, f2f16(fadd(a, f2f32(b16))));
//! v2b: %res5 = v_fma_mixlo_f16 %a, lo(%b16), %c
//! p_unit_test 5, %res5
writeout(5, f2f16(fma(a, f2f32(b16), c)));
finish_opt_test();
}
END_TEST
/* Output conversion, precision: a 16-bit multiply followed by an f16->f32
 * conversion is expected to stay as two instructions when either the conversion
 * (case 0) or the multiply (case 1) carries the precise flag.
 *
 * Runs once per chip_class value from GFX9 through GFX10; an iteration is skipped
 * when setup_cs() returns false. The //>> and //! lines are expected-output
 * patterns and must stay paired with the writeout() call that follows them.
 */
BEGIN_TEST(optimize.mad_mix.output_conv.precision)
for (unsigned i = GFX9; i <= GFX10; i++) {
//>> v2b: %a16 = p_startpgm
if (!setup_cs("v2b", (chip_class)i))
continue;
Temp a16 = inputs[0];
/* precise flag on the conversion */
//! v2b: %res0_tmp = v_mul_f16 %a16, %a16
//! v1: (precise)%res0 = v_cvt_f32_f16 %res0_tmp
//! p_unit_test 0, %res0
writeout(0, f2f32(fmul(a16, a16), bld.precise()));
/* precise flag on the multiply */
//! v2b: (precise)%res1_tmp = v_mul_f16 %a16, %a16
//! v1: %res1 = v_cvt_f32_f16 %res1_tmp
//! p_unit_test 1, %res1
writeout(1, f2f32(fmul(a16, a16, bld.precise())));
finish_opt_test();
}
END_TEST
/* Output conversion, modifiers: an fneg/fabs or a sub-dword extract sitting between
 * the arithmetic instruction and the conversion is expected to prevent folding the
 * conversion into the arithmetic instruction.
 *
 * Runs once per chip_class value from GFX9 through GFX10; an iteration is skipped
 * when setup_cs() returns false. The //>> and //! lines are expected-output
 * patterns and must stay paired with the writeout() call that follows them.
 *
 * The patterns refer to the program inputs through the %a/%b/%a16/%b16 captures
 * bound by the p_startpgm pattern rather than through raw temporary ids, so every
 * case actually checks which inputs are consumed.
 */
BEGIN_TEST(optimize.mad_mix.output_conv.modifiers)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, v2b: %a16, v2b: %b16 = p_startpgm
      if (!setup_cs("v1 v1 v2b v2b", (chip_class)i))
         continue;

      /* inputs follow the order of the "v1 v1 v2b v2b" setup string */
      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp a16 = inputs[2];
      Temp b16 = inputs[3];

      /* fneg/fabs */
      //! v1: %res0_add = v_add_f32 %a, %b
      //! v2b: %res0 = v_cvt_f16_f32 |%res0_add|
      //! p_unit_test 0, %res0
      writeout(0, f2f16(fabs(fadd(a, b))));

      //! v1: %res1_add = v_add_f32 %a, %b
      //! v2b: %res1 = v_cvt_f16_f32 -%res1_add
      //! p_unit_test 1, %res1
      writeout(1, f2f16(fneg(fadd(a, b))));

      //! v2b: %res2_add = v_add_f16 %a16, %b16
      //! v1: %res2 = v_cvt_f32_f16 |%res2_add|
      //! p_unit_test 2, %res2
      writeout(2, f2f32(fabs(fadd(a16, b16))));

      //! v2b: %res3_add = v_add_f16 %a16, %b16
      //! v1: %res3 = v_cvt_f32_f16 -%res3_add
      //! p_unit_test 3, %res3
      writeout(3, f2f32(fneg(fadd(a16, b16))));

      /* sdwa */
      /* extract applied to the converted result: the conversion still folds and the
       * extract stays a separate p_extract
       */
      //! v2b: %res4_add = v_fma_mixlo_f16 1.0, %a, %b
      //! v2b: %res4 = p_extract %res4_add, 0, 8, 0
      //! p_unit_test 4, %res4
      writeout(4, ext_ubyte(f2f16(fadd(a, b)), 0));

      /* extract applied before the conversion: it becomes an sdwa selector on the add
       * and the conversion stays separate
       */
      //! v1: %res5_add = v_add_f32 %a, %b dst_sel:uword0 src0_sel:dword src1_sel:dword
      //! v2b: %res5 = v_cvt_f16_f32 %res5_add
      //! p_unit_test 5, %res5
      writeout(5, f2f16(ext_ushort(fadd(a, b), 0)));

      finish_opt_test();
   }
END_TEST
/* Mul+add fusion, basic cases: an fmul feeding an fadd, where one operand is an
 * f16->f32 conversion, is expected to become a single v_fma_mix_f32 (or
 * v_fma_mixlo_f16 when the sum is converted to f16), with neg/abs modifiers on the
 * product moved onto the multiplicands.
 *
 * Runs once per chip_class value from GFX9 through GFX10; an iteration is skipped
 * when setup_cs() returns false. The //>> and //! lines are expected-output
 * patterns and must stay paired with the writeout() call that follows them.
 */
BEGIN_TEST(optimize.mad_mix.fma.basic)
for (unsigned i = GFX9; i <= GFX10; i++) {
//>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %c16 = p_startpgm
if (!setup_cs("v1 v1 v1 v2b v2b", (chip_class)i))
continue;
/* inputs follow the order of the "v1 v1 v1 v2b v2b" setup string */
Temp a = inputs[0];
Temp b = inputs[1];
Temp c = inputs[2];
Temp a16 = inputs[3];
Temp c16 = inputs[4];
/* conversion on a multiplicand, then on the addend */
//! v1: %res0 = v_fma_mix_f32 lo(%a16), %b, %c
//! p_unit_test 0, %res0
writeout(0, fadd(fmul(f2f32(a16), b), c));
//! v1: %res1 = v_fma_mix_f32 %a, %b, lo(%c16)
//! p_unit_test 1, %res1
writeout(1, fadd(fmul(a, b), f2f32(c16)));
/* omod/clamp check */
/* 0x40000000 is the bit pattern of 2.0f; the multiply by it is expected to become
 * the "*2" output modifier on the add, and mul+add are then left unfused
 */
//! v1: %res2_mul = v_fma_mix_f32 lo(%a16), %b, -0
//! v1: %res2 = v_add_f32 %res2_mul, %c *2
//! p_unit_test 2, %res2
writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000), fadd(fmul(f2f32(a16), b), c)));
/* neg/abs modifiers */
//! v1: %res3 = v_fma_mix_f32 -lo(%a16), %b, |lo(%c16)|
//! p_unit_test 3, %res3
writeout(3, fadd(fmul(fneg(f2f32(a16)), b), fabs(f2f32(c16))));
/* abs of the product is expressed as abs on both multiplicands */
//! v1: %res4 = v_fma_mix_f32 |%a|, |%b|, lo(%c16)
//! p_unit_test 4, %res4
writeout(4, fadd(fabs(fmul(fneg(a), fneg(b))), f2f32(c16)));
/* neg of the product is expressed as neg on one multiplicand */
//! v1: %res5 = v_fma_mix_f32 %a, -%b, lo(%c16)
//! p_unit_test 5, %res5
writeout(5, fadd(fneg(fmul(a, b)), f2f32(c16)));
//! v1: %res6 = v_fma_mix_f32 |%a|, -|%b|, lo(%c16)
//! p_unit_test 6, %res6
writeout(6, fadd(fneg(fabs(fmul(fneg(a), fneg(b)))), f2f32(c16)));
/* output conversions */
//! v2b: %res7 = v_fma_mixlo_f16 %a, %b, %c
//! p_unit_test 7, %res7
writeout(7, f2f16(fadd(fmul(a, b), c)));
finish_opt_test();
}
END_TEST
/* Mul+add fusion, precision: checks when a precise fmul/fadd pair may be fused into
 * v_fma_mix_f32 and that 16-bit arithmetic is never widened to 32-bit.
 *
 * Per the expected output below, a pair with a precise multiply or a precise add is
 * fused on gfx9 but kept as two instructions on gfx10.
 *
 * Runs once per chip_class value from GFX9 through GFX10; an iteration is skipped
 * when setup_cs() returns false. The //>>, //! and //~gfxN! lines are
 * expected-output patterns (the ~gfxN prefix presumably restricts a pattern to
 * that variant) and must not be edited as if they were plain comments.
 */
BEGIN_TEST(optimize.mad_mix.fma.precision)
for (unsigned i = GFX9; i <= GFX10; i++) {
//>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm
if (!setup_cs("v1 v1 v1 v2b v2b", (chip_class)i))
continue;
/* inputs follow the order of the "v1 v1 v1 v2b v2b" setup string */
Temp a = inputs[0];
Temp b = inputs[1];
Temp c = inputs[2];
Temp a16 = inputs[3];
Temp b16 = inputs[4];
/* the optimization is precise for 32-bit on GFX9 */
//~gfx9! v1: %res0 = v_fma_mix_f32 lo(%a16), %b, %c
//~gfx10! v1: (precise)%res0_tmp = v_fma_mix_f32 lo(%a16), %b, -0
//~gfx10! v1: %res0 = v_add_f32 %res0_tmp, %c
//! p_unit_test 0, %res0
writeout(0, fadd(fmul(f2f32(a16), b, bld.precise()), c));
//~gfx9! v1: (precise)%res1 = v_fma_mix_f32 lo(%a16), %b, %c
//~gfx10! v1: %res1_tmp = v_fma_mix_f32 lo(%a16), %b, -0
//~gfx10! v1: (precise)%res1 = v_add_f32 %res1_tmp, %c
//! p_unit_test 1, %res1
writeout(1, fadd(fmul(f2f32(a16), b), c, bld.precise()));
/* never promote 16-bit arithmetic to 32-bit */
//! v2b: %res2_tmp = v_cvt_f16_f32 %a
//! v2b: %res2 = v_add_f16 %res2_tmp, %b16
//! p_unit_test 2, %res2
writeout(2, fadd(f2f16(a), b16));
//! v2b: %res3_tmp = v_cvt_f16_f32 %a
//! v2b: %res3 = v_mul_f16 %res3_tmp, %b16
//! p_unit_test 3, %res3
writeout(3, fmul(f2f16(a), b16));
//! v2b: %res4_tmp = v_mul_f16 %a16, %b16
//! v1: %res4 = v_cvt_f32_f16 %res4_tmp
//! p_unit_test 4, %res4
writeout(4, f2f32(fmul(a16, b16)));
//! v2b: %res5_tmp = v_add_f16 %a16, %b16
//! v1: %res5 = v_cvt_f32_f16 %res5_tmp
//! p_unit_test 5, %res5
writeout(5, f2f32(fadd(a16, b16)));
/* the 32-bit multiply folds its output conversion, but the 16-bit add stays 16-bit */
//! v2b: %res6_tmp = v_fma_mixlo_f16 %a, %b, -0
//! v2b: %res6 = v_add_f16 %res6_tmp, %a16
//! p_unit_test 6, %res6
writeout(6, fadd(f2f16(fmul(a, b)), a16));
/* the 16-bit multiply stays 16-bit; only the conversion folds into the 32-bit add */
//! v2b: %res7_tmp = v_mul_f16 %a16, %b16
//! v1: %res7 = v_fma_mix_f32 1.0, lo(%res7_tmp), %c
//! p_unit_test 7, %res7
writeout(7, fadd(f2f32(fmul(a16, b16)), c));
finish_opt_test();
}
END_TEST
/* Clamp: fsat applied around a mixed-precision multiply is expected to become the
 * clamp modifier on the v_fma_mix_f32 / v_fma_mixlo_f16 instruction, whether the
 * saturate is applied before or after the output conversion.
 *
 * Runs once per chip_class value from GFX9 through GFX10; an iteration is skipped
 * when setup_cs() returns false. The //>> and //! lines are expected-output
 * patterns and must stay paired with the writeout() call that follows them.
 */
BEGIN_TEST(optimize.mad_mix.clamp)
for (unsigned i = GFX9; i <= GFX10; i++) {
//>> v1: %a, v2b: %a16 = p_startpgm
if (!setup_cs("v1 v2b", (chip_class)i))
continue;
/* inputs follow the order of the "v1 v2b" setup string */
Temp a = inputs[0];
Temp a16 = inputs[1];
/* saturate on a multiply with a folded input conversion */
//! v1: %res0 = v_fma_mix_f32 lo(%a16), %a, -0 clamp
//! p_unit_test 0, %res0
writeout(0, fsat(fmul(f2f32(a16), a)));
/* saturate before the output conversion */
//! v2b: %res1 = v_fma_mixlo_f16 %a, %a, -0 clamp
//! p_unit_test 1, %res1
writeout(1, f2f16(fsat(fmul(a, a))));
/* saturate after the output conversion */
//! v2b: %res2 = v_fma_mixlo_f16 %a, %a, -0 clamp
//! p_unit_test 2, %res2
writeout(2, fsat(f2f16(fmul(a, a))));
finish_opt_test();
}
END_TEST
/* Casts: checks that conversions are not folded across a size-changing
 * reinterpretation (u2u16 / bld.as_uniform) placed between the conversion and the
 * arithmetic instruction, i.e. where the value's width no longer matches what the
 * conversion produced or expects.
 *
 * Runs once per chip_class value from GFX9 through GFX10; an iteration is skipped
 * when setup_cs() returns false. The //>> and //! lines are expected-output
 * patterns and must stay paired with the writeout() call that follows them.
 */
BEGIN_TEST(optimize.mad_mix.cast)
for (unsigned i = GFX9; i <= GFX10; i++) {
//>> v1: %a, v2b: %a16 = p_startpgm
if (!setup_cs("v1 v2b", (chip_class)i))
continue;
/* inputs follow the order of the "v1 v2b" setup string */
Temp a = inputs[0];
Temp a16 = inputs[1];
/* The optimizer copy-propagates v2b=p_extract_vector(v1, 0) and p_as_uniform, so the
 * optimizer has to check compatibility.
 */
/* cases 0-1: a cast between an input conversion and the multiply */
//! v1: %res0_cvt = v_cvt_f32_f16 %a16
//! v2b: %res0 = v_mul_f16 %res0_cvt, %a16
//! p_unit_test 0, %res0
writeout(0, fmul(u2u16(f2f32(a16)), a16));
//! v2b: %res1_cvt = v_cvt_f16_f32 %a
//! v1: %res1 = v_mul_f32 %res1_cvt, %a
//! p_unit_test 1, %res1
writeout(1, fmul(bld.as_uniform(f2f16(a)), a));
/* cases 2-3: a cast between the multiply and an output conversion */
//! v2b: %res2_mul = v_mul_f16 %a16, %a16
//! v2b: %res2 = v_cvt_f16_f32 %res2_mul
//! p_unit_test 2, %res2
writeout(2, f2f16(bld.as_uniform(fmul(a16, a16))));
//! v1: %res3_mul = v_mul_f32 %a, %a
//! v1: %res3 = v_cvt_f32_f16 %res3_mul
//! p_unit_test 3, %res3
writeout(3, f2f32(u2u16(fmul(a, a))));
/* cases 4-5: a cast between a mixed-precision multiply and a saturate; the clamp
 * is expected to stay a separate v_med3 instead of becoming a clamp modifier
 */
//! v1: %res4_mul = v_fma_mix_f32 lo(%a16), %a, -0
//! v2b: %res4 = v_med3_f16 0, 1.0, %res4_mul
//! p_unit_test 4, %res4
writeout(4, fsat(u2u16(fmul(f2f32(a16), a))));
//! v2b: %res5_mul = v_fma_mixlo_f16 %a, %a, -0
//! v1: %res5 = v_med3_f32 0, 1.0, %res5_mul
//! p_unit_test 5, %res5
writeout(5, fsat(bld.as_uniform(f2f16(fmul(a, a)))));
/* cases 6-7: a cast on the multiply result that feeds an add; the multiply and
 * the add are expected to stay separate
 */
//! v1: %res6_mul = v_mul_f32 %a, %a
//! v1: %res6 = v_fma_mix_f32 1.0, lo(%res6_mul), %a
//! p_unit_test 6, %res6
writeout(6, fadd(f2f32(u2u16(fmul(a, a))), a));
//! v2b: %res7_mul = v_mul_f16 %a16, %a16
//! v1: %res7 = v_fma_mix_f32 1.0, %res7_mul, lo(%a16)
//! p_unit_test 7, %res7
writeout(7, fadd(bld.as_uniform(fmul(a16, a16)), f2f32(a16)));
/* opsel_hi should be obtained from the original opcode, not the operand regclass */
//! v1: %res8 = v_fma_mix_f32 lo(%a16), %a16, -0
//! p_unit_test 8, %res8
writeout(8, fmul(f2f32(a16), a16));
finish_opt_test();
}
END_TEST