swr/rast: simdlib better separation of core vs knights avx512
Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
parent
e1091b0861
commit
07062daae9
|
@ -285,7 +285,7 @@ lib_LTLIBRARIES += libswrKNL.la
|
|||
libswrKNL_la_CXXFLAGS = \
|
||||
$(PTHREAD_CFLAGS) \
|
||||
$(SWR_KNL_CXXFLAGS) \
|
||||
-DKNOB_ARCH=KNOB_ARCH_AVX512 -DAVX512F_STRICT \
|
||||
-DKNOB_ARCH=KNOB_ARCH_AVX512 -DSIMD_ARCH_KNIGHTS \
|
||||
$(COMMON_CXXFLAGS)
|
||||
|
||||
libswrKNL_la_SOURCES = \
|
||||
|
|
|
@ -69,11 +69,19 @@ COMMON_CXX_SOURCES := \
|
|||
rasterizer/common/simdlib_128_avx.inl \
|
||||
rasterizer/common/simdlib_128_avx2.inl \
|
||||
rasterizer/common/simdlib_128_avx512.inl \
|
||||
rasterizer/common/simdlib_128_avx512_core.inl \
|
||||
rasterizer/common/simdlib_128_avx512_knights.inl \
|
||||
rasterizer/common/simdlib_256_avx.inl \
|
||||
rasterizer/common/simdlib_256_avx2.inl \
|
||||
rasterizer/common/simdlib_256_avx512.inl \
|
||||
rasterizer/common/simdlib_256_avx512_core.inl \
|
||||
rasterizer/common/simdlib_256_avx512_knights.inl \
|
||||
rasterizer/common/simdlib_512_avx512.inl \
|
||||
rasterizer/common/simdlib_512_avx512_core.inl \
|
||||
rasterizer/common/simdlib_512_avx512_knights.inl \
|
||||
rasterizer/common/simdlib_512_avx512_masks.inl \
|
||||
rasterizer/common/simdlib_512_avx512_masks_core.inl \
|
||||
rasterizer/common/simdlib_512_avx512_masks_knights.inl \
|
||||
rasterizer/common/simdlib_512_emu.inl \
|
||||
rasterizer/common/simdlib_512_emu_masks.inl \
|
||||
rasterizer/common/simdlib_interface.hpp \
|
||||
|
|
|
@ -55,6 +55,11 @@ namespace SIMDImpl
|
|||
{
|
||||
#define __SIMD_LIB_AVX512_HPP__
|
||||
#include "simdlib_128_avx512.inl"
|
||||
#if defined(SIMD_ARCH_KNIGHTS)
|
||||
#include "simdlib_128_avx512_knights.inl"
|
||||
#else // optimize for core
|
||||
#include "simdlib_128_avx512_core.inl"
|
||||
#endif // defined(SIMD_ARCH_KNIGHTS)
|
||||
#undef __SIMD_LIB_AVX512_HPP__
|
||||
}; // struct AVX2Impl
|
||||
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
|
||||
|
@ -105,6 +110,11 @@ namespace SIMDImpl
|
|||
{
|
||||
#define __SIMD_LIB_AVX512_HPP__
|
||||
#include "simdlib_256_avx512.inl"
|
||||
#if defined(SIMD_ARCH_KNIGHTS)
|
||||
#include "simdlib_256_avx512_knights.inl"
|
||||
#else // optimize for core
|
||||
#include "simdlib_256_avx512_core.inl"
|
||||
#endif // defined(SIMD_ARCH_KNIGHTS)
|
||||
#undef __SIMD_LIB_AVX512_HPP__
|
||||
}; // struct AVX2Impl
|
||||
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
|
||||
|
@ -150,13 +160,20 @@ namespace SIMDImpl
|
|||
|
||||
|
||||
#if SIMD_ARCH >= SIMD_ARCH_AVX512
|
||||
struct AVX512Impl
|
||||
struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
|
||||
{
|
||||
#define __SIMD_LIB_AVX512_HPP__
|
||||
#include "simdlib_512_avx512.inl"
|
||||
#include "simdlib_512_avx512_masks.inl"
|
||||
#if defined(SIMD_ARCH_KNIGHTS)
|
||||
#include "simdlib_512_avx512_knights.inl"
|
||||
#include "simdlib_512_avx512_masks_knights.inl"
|
||||
#else // optimize for core
|
||||
#include "simdlib_512_avx512_core.inl"
|
||||
#include "simdlib_512_avx512_masks_core.inl"
|
||||
#endif // defined(SIMD_ARCH_KNIGHTS)
|
||||
#undef __SIMD_LIB_AVX512_HPP__
|
||||
}; // struct AVX512Impl
|
||||
}; // struct AVX512Impl
|
||||
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
|
||||
|
||||
struct Traits : SIMDImpl::Traits
|
||||
|
|
|
@ -78,34 +78,6 @@ public:
|
|||
}
|
||||
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
|
||||
|
||||
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
|
||||
static SIMDINLINE Double SIMDCALL op(Double a) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
|
||||
}
|
||||
#if !defined(AVX512F_STRICT)
|
||||
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
|
||||
#endif
|
||||
|
||||
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
|
||||
template<int ImmT> \
|
||||
static SIMDINLINE Double SIMDCALL op(Double a) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
|
||||
}
|
||||
#if !defined(AVX512F_STRICT)
|
||||
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
|
||||
#endif
|
||||
|
||||
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
|
||||
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
|
||||
}
|
||||
#if !defined(AVX512F_STRICT)
|
||||
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
|
||||
#endif
|
||||
|
||||
#define SIMD_DWRAPPER_2I(op) \
|
||||
template<int ImmT>\
|
||||
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
|
||||
|
@ -119,11 +91,6 @@ public:
|
|||
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
|
||||
}
|
||||
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
|
||||
#if !defined(AVX512F_STRICT)
|
||||
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
|
||||
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
|
||||
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
|
||||
#endif
|
||||
|
||||
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
|
||||
template<int ImmT> \
|
||||
|
@ -132,11 +99,6 @@ public:
|
|||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
|
||||
}
|
||||
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
|
||||
#if !defined(AVX512F_STRICT)
|
||||
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
|
||||
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
|
||||
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
|
||||
#endif
|
||||
|
||||
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
|
||||
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
|
||||
|
@ -144,11 +106,6 @@ public:
|
|||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
|
||||
}
|
||||
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
|
||||
#if !defined(AVX512F_STRICT)
|
||||
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
|
||||
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
|
||||
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
|
||||
#endif
|
||||
|
||||
#define SIMD_IWRAPPER_2I(op) \
|
||||
template<int ImmT>\
|
||||
|
@ -182,12 +139,8 @@ SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
|
|||
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
|
||||
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
|
||||
|
||||
#if !defined(AVX512F_STRICT)
|
||||
|
||||
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
|
||||
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
|
||||
|
||||
#endif
|
||||
// SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
|
||||
// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
|
||||
|
||||
// return (a * b) & 0xFFFFFFFF
|
||||
//
|
||||
|
@ -196,12 +149,8 @@ SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uin
|
|||
SIMD_IWRAPPER_2_32(mullo_epi32);
|
||||
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
|
||||
|
||||
#if !defined(AVX512F_STRICT)
|
||||
|
||||
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
|
||||
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
|
||||
|
||||
#endif
|
||||
// SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
|
||||
// SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Logical operations
|
||||
|
@ -253,14 +202,10 @@ SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
|
|||
//-----------------------------------------------------------------------
|
||||
// Blend / shuffle / permute operations
|
||||
//-----------------------------------------------------------------------
|
||||
#if !defined(AVX512F_STRICT)
|
||||
|
||||
SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
|
||||
SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
|
||||
SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
|
||||
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
|
||||
|
||||
#endif
|
||||
// SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
|
||||
// SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
|
||||
// SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
|
||||
// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
|
||||
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
|
||||
|
||||
//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
|
||||
|
@ -278,16 +223,12 @@ SIMD_IWRAPPER_1I_32(shuffle_epi32);
|
|||
SIMD_IWRAPPER_2_32(unpackhi_epi32);
|
||||
SIMD_IWRAPPER_2_32(unpacklo_epi32);
|
||||
|
||||
#if !defined(AVX512F_STRICT)
|
||||
|
||||
SIMD_IWRAPPER_2_16(unpackhi_epi16);
|
||||
SIMD_IWRAPPER_2_64(unpackhi_epi64);
|
||||
SIMD_IWRAPPER_2_8(unpackhi_epi8);
|
||||
SIMD_IWRAPPER_2_16(unpacklo_epi16);
|
||||
SIMD_IWRAPPER_2_64(unpacklo_epi64);
|
||||
SIMD_IWRAPPER_2_8(unpacklo_epi8);
|
||||
|
||||
#endif
|
||||
// SIMD_IWRAPPER_2_16(unpackhi_epi16);
|
||||
// SIMD_IWRAPPER_2_64(unpackhi_epi64);
|
||||
// SIMD_IWRAPPER_2_8(unpackhi_epi8);
|
||||
// SIMD_IWRAPPER_2_16(unpacklo_epi16);
|
||||
// SIMD_IWRAPPER_2_64(unpacklo_epi64);
|
||||
// SIMD_IWRAPPER_2_8(unpacklo_epi8);
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Load / store operations
|
||||
|
@ -338,16 +279,12 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, In
|
|||
static_cast<int>(ScaleT)));
|
||||
}
|
||||
|
||||
#if !defined(AVX512F_STRICT)
|
||||
|
||||
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
|
||||
{
|
||||
__mmask64 m = 0xffffull;
|
||||
return static_cast<uint32_t>(
|
||||
_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
|
||||
}
|
||||
|
||||
#endif
|
||||
// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
|
||||
// {
|
||||
// __mmask64 m = 0xffffull;
|
||||
// return static_cast<uint32_t>(
|
||||
// _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
|
||||
// }
|
||||
|
||||
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
|
||||
{
|
||||
|
@ -366,6 +303,11 @@ static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
|
|||
_mm512_mask_storeu_epi32(p, __mmask16(0xf), __conv(a));
|
||||
}
|
||||
|
||||
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
|
||||
{
|
||||
return castsi_ps(__conv(_mm512_maskz_set1_epi32(__mmask16(mask & 0xf), -1)));
|
||||
}
|
||||
|
||||
//=======================================================================
|
||||
// Legacy interface (available only in SIMD256 width)
|
||||
//=======================================================================
|
||||
|
|
|
@ -0,0 +1,193 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD128 AVX (512) implementation for Core processors
|
||||
//
|
||||
// Since this implementation inherits from the AVX (2) implementation,
|
||||
// the only operations below are ones that replace AVX (2) operations.
|
||||
// These use native AVX512 instructions with masking to enable a larger
|
||||
// register set.
|
||||
//============================================================================
|
||||
|
||||
#define SIMD_WRAPPER_1_(op, intrin, mask) \
|
||||
static SIMDINLINE Float SIMDCALL op(Float a) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
|
||||
}
|
||||
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
|
||||
|
||||
#define SIMD_WRAPPER_1I_(op, intrin, mask) \
|
||||
template<int ImmT> \
|
||||
static SIMDINLINE Float SIMDCALL op(Float a) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
|
||||
}
|
||||
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
|
||||
|
||||
#define SIMD_WRAPPER_2_(op, intrin, mask) \
|
||||
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
|
||||
}
|
||||
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
|
||||
|
||||
#define SIMD_WRAPPER_2I(op) \
|
||||
template<int ImmT>\
|
||||
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
|
||||
}
|
||||
|
||||
#define SIMD_WRAPPER_3_(op, intrin, mask) \
|
||||
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
|
||||
}
|
||||
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
|
||||
|
||||
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
|
||||
static SIMDINLINE Double SIMDCALL op(Double a) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
|
||||
}
|
||||
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
|
||||
|
||||
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
|
||||
template<int ImmT> \
|
||||
static SIMDINLINE Double SIMDCALL op(Double a) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
|
||||
}
|
||||
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
|
||||
|
||||
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
|
||||
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
|
||||
}
|
||||
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
|
||||
|
||||
#define SIMD_DWRAPPER_2I(op) \
|
||||
template<int ImmT>\
|
||||
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));\
|
||||
}
|
||||
|
||||
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
|
||||
static SIMDINLINE Integer SIMDCALL op(Integer a) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
|
||||
}
|
||||
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
|
||||
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
|
||||
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
|
||||
|
||||
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
|
||||
template<int ImmT> \
|
||||
static SIMDINLINE Integer SIMDCALL op(Integer a) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
|
||||
}
|
||||
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
|
||||
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
|
||||
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
|
||||
|
||||
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
|
||||
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
|
||||
}
|
||||
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
|
||||
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
|
||||
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
|
||||
|
||||
#define SIMD_IWRAPPER_2I(op) \
|
||||
template<int ImmT>\
|
||||
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
|
||||
}
|
||||
|
||||
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
|
||||
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
|
||||
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
|
||||
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
|
||||
SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
|
||||
SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
|
||||
SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
|
||||
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
|
||||
SIMD_IWRAPPER_2_16(unpackhi_epi16);
|
||||
SIMD_IWRAPPER_2_64(unpackhi_epi64);
|
||||
SIMD_IWRAPPER_2_8(unpackhi_epi8);
|
||||
SIMD_IWRAPPER_2_16(unpacklo_epi16);
|
||||
SIMD_IWRAPPER_2_64(unpacklo_epi64);
|
||||
SIMD_IWRAPPER_2_8(unpacklo_epi8);
|
||||
|
||||
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
|
||||
{
|
||||
__mmask64 m = 0xffffull;
|
||||
return static_cast<uint32_t>(
|
||||
_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
|
||||
}
|
||||
|
||||
#undef SIMD_WRAPPER_1_
|
||||
#undef SIMD_WRAPPER_1
|
||||
#undef SIMD_WRAPPER_1I_
|
||||
#undef SIMD_WRAPPER_1I
|
||||
#undef SIMD_WRAPPER_2_
|
||||
#undef SIMD_WRAPPER_2
|
||||
#undef SIMD_WRAPPER_2I
|
||||
#undef SIMD_WRAPPER_3_
|
||||
#undef SIMD_WRAPPER_3
|
||||
#undef SIMD_DWRAPPER_1_
|
||||
#undef SIMD_DWRAPPER_1
|
||||
#undef SIMD_DWRAPPER_1I_
|
||||
#undef SIMD_DWRAPPER_1I
|
||||
#undef SIMD_DWRAPPER_2_
|
||||
#undef SIMD_DWRAPPER_2
|
||||
#undef SIMD_DWRAPPER_2I
|
||||
#undef SIMD_IWRAPPER_1_
|
||||
#undef SIMD_IWRAPPER_1_8
|
||||
#undef SIMD_IWRAPPER_1_16
|
||||
#undef SIMD_IWRAPPER_1_32
|
||||
#undef SIMD_IWRAPPER_1_64
|
||||
#undef SIMD_IWRAPPER_1I_
|
||||
#undef SIMD_IWRAPPER_1I_8
|
||||
#undef SIMD_IWRAPPER_1I_16
|
||||
#undef SIMD_IWRAPPER_1I_32
|
||||
#undef SIMD_IWRAPPER_1I_64
|
||||
#undef SIMD_IWRAPPER_2_
|
||||
#undef SIMD_IWRAPPER_2_8
|
||||
#undef SIMD_IWRAPPER_2_16
|
||||
#undef SIMD_IWRAPPER_2_32
|
||||
#undef SIMD_IWRAPPER_2_64
|
||||
#undef SIMD_IWRAPPER_2I
|
||||
//#undef SIMD_IWRAPPER_2I_8
|
||||
//#undef SIMD_IWRAPPER_2I_16
|
||||
//#undef SIMD_IWRAPPER_2I_32
|
||||
//#undef SIMD_IWRAPPER_2I_64
|
|
@ -0,0 +1,35 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD128 AVX (512) implementation for Knights Family
|
||||
//
|
||||
// Since this implementation inherits from the AVX512Base implementation,
|
||||
// the only operations below are ones that replace AVX512F / AVX512CD operations.
|
||||
// These use native AVX512 instructions with masking to enable a larger
|
||||
// register set.
|
||||
//============================================================================
|
||||
|
|
@ -78,34 +78,6 @@ public:
|
|||
}
|
||||
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xff))
|
||||
|
||||
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
|
||||
static SIMDINLINE Double SIMDCALL op(Double a) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
|
||||
}
|
||||
#if !defined(AVX512F_STRICT)
|
||||
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
|
||||
#endif
|
||||
|
||||
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
|
||||
template<int ImmT> \
|
||||
static SIMDINLINE Double SIMDCALL op(Double a) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
|
||||
}
|
||||
#if !defined(AVX512F_STRICT)
|
||||
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
|
||||
#endif
|
||||
|
||||
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
|
||||
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
|
||||
{\
|
||||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
|
||||
}
|
||||
#if !defined(AVX512F_STRICT)
|
||||
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
|
||||
#endif
|
||||
|
||||
#define SIMD_DWRAPPER_2I(op) \
|
||||
template<int ImmT>\
|
||||
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
|
||||
|
@ -119,11 +91,6 @@ public:
|
|||
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
|
||||
}
|
||||
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))
|
||||
#if !defined(AVX512F_STRICT)
|
||||
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
|
||||
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
|
||||
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
|
||||
#endif
|
||||
|
||||
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
|
||||
template<int ImmT> \
|
||||
|
@ -132,11 +99,6 @@ public:
|
|||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
|
||||
}
|
||||
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))
|
||||
#if !defined(AVX512F_STRICT)
|
||||
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
|
||||
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
|
||||
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
|
||||
#endif
|
||||
|
||||
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
|
||||
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
|
||||
|
@ -144,11 +106,6 @@ public:
|
|||
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
|
||||
}
|
||||
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))
|
||||
#if !defined(AVX512F_STRICT)
|
||||
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
|
||||
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
|
||||
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
|
||||
#endif
|
||||
|
||||
#define SIMD_IWRAPPER_2I(op) \
|
||||
template<int ImmT>\
|
||||
|
@ -182,12 +139,8 @@ SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
|
|||
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
|
||||
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
|
||||
|
||||
#if !defined(AVX512F_STRICT)
|
||||
|
||||
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
|
||||
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
|
||||
|
||||
#endif
|
||||
// SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
|
||||
// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
|
||||
|
||||
// return (a * b) & 0xFFFFFFFF
|
||||
//
|
||||
|
@ -196,12 +149,8 @@ SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uin
|
|||
SIMD_IWRAPPER_2_32(mullo_epi32);
|
||||
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
|
||||
|
||||
#if !defined(AVX512F_STRICT)
|
||||
|
||||
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
|
||||
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
|
||||
|
||||
#endif
|
||||
// SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
|
||||
// SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Logical operations
|
||||
|
@ -253,14 +202,10 @@ SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
|
|||
//-----------------------------------------------------------------------
|
||||
// Blend / shuffle / permute operations
|
||||
//-----------------------------------------------------------------------
|
||||
#if !defined(AVX512F_STRICT)
|
||||
|
||||
SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
|
||||
SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
|
||||
SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
|
||||
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
|
||||
|
||||
#endif
|
||||
// SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
|
||||
// SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
|
||||
// SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
|
||||
// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
|
||||
|
||||
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
|
||||
|
||||
|
@ -279,16 +224,12 @@ SIMD_IWRAPPER_1I_32(shuffle_epi32);
|
|||
SIMD_IWRAPPER_2_32(unpackhi_epi32);
|
||||
SIMD_IWRAPPER_2_32(unpacklo_epi32);
|
||||
|
||||
#if !defined(AVX512F_STRICT)
|
||||
|
||||
SIMD_IWRAPPER_2_16(unpackhi_epi16);
|
||||
SIMD_IWRAPPER_2_64(unpackhi_epi64);
|
||||
SIMD_IWRAPPER_2_8(unpackhi_epi8);
|
||||
SIMD_IWRAPPER_2_16(unpacklo_epi16);
|
||||
SIMD_IWRAPPER_2_64(unpacklo_epi64);
|
||||
SIMD_IWRAPPER_2_8(unpacklo_epi8);
|
||||
|
||||
#endif
|
||||
// SIMD_IWRAPPER_2_16(unpackhi_epi16);
|
||||
// SIMD_IWRAPPER_2_64(unpackhi_epi64);
|
||||
// SIMD_IWRAPPER_2_8(unpackhi_epi8);
|
||||
// SIMD_IWRAPPER_2_16(unpacklo_epi16);
|
||||
// SIMD_IWRAPPER_2_64(unpacklo_epi64);
|
||||
// SIMD_IWRAPPER_2_8(unpacklo_epi8);
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
// Load / store operations
|
||||
|
@ -339,16 +280,12 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, In
|
|||
static_cast<int>(ScaleT)));
|
||||
}
|
||||
|
||||
#if !defined(AVX512F_STRICT)
|
||||
|
||||
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
|
||||
{
|
||||
__mmask64 m = 0xffffffffull;
|
||||
return static_cast<uint32_t>(
|
||||
_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
|
||||
}
|
||||
|
||||
#endif
|
||||
// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
|
||||
// {
|
||||
// __mmask64 m = 0xffffffffull;
|
||||
// return static_cast<uint32_t>(
|
||||
// _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
|
||||
// }
|
||||
|
||||
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
|
||||
{
|
||||
|
@ -367,6 +304,11 @@ static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
|
|||
_mm512_mask_storeu_epi32(p, __mmask16(0xff), __conv(a));
|
||||
}
|
||||
|
||||
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
|
||||
{
|
||||
return castsi_ps(__conv(_mm512_maskz_set1_epi32(__mmask16(mask & 0xff), -1)));
|
||||
}
|
||||
|
||||
//=======================================================================
|
||||
// Legacy interface (available only in SIMD256 width)
|
||||
//=======================================================================
|
||||
|
@ -380,30 +322,10 @@ static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
|
|||
#undef SIMD_WRAPPER_2I
|
||||
#undef SIMD_WRAPPER_3_
|
||||
#undef SIMD_WRAPPER_3
|
||||
#undef SIMD_DWRAPPER_1_
|
||||
#undef SIMD_DWRAPPER_1
|
||||
#undef SIMD_DWRAPPER_1I_
|
||||
#undef SIMD_DWRAPPER_1I
|
||||
#undef SIMD_DWRAPPER_2_
|
||||
#undef SIMD_DWRAPPER_2
|
||||
#undef SIMD_DWRAPPER_2I
|
||||
#undef SIMD_IWRAPPER_1_
|
||||
#undef SIMD_IWRAPPER_1_8
|
||||
#undef SIMD_IWRAPPER_1_16
|
||||
#undef SIMD_IWRAPPER_1_32
|
||||
#undef SIMD_IWRAPPER_1_64
|
||||
#undef SIMD_IWRAPPER_1I_
|
||||
#undef SIMD_IWRAPPER_1I_8
|
||||
#undef SIMD_IWRAPPER_1I_16
|
||||
#undef SIMD_IWRAPPER_1I_32
|
||||
#undef SIMD_IWRAPPER_1I_64
|
||||
#undef SIMD_IWRAPPER_2_
|
||||
#undef SIMD_IWRAPPER_2_8
|
||||
#undef SIMD_IWRAPPER_2_16
|
||||
#undef SIMD_IWRAPPER_2_32
|
||||
#undef SIMD_IWRAPPER_2_64
|
||||
#undef SIMD_IWRAPPER_2I
|
||||
//#undef SIMD_IWRAPPER_2I_8
|
||||
//#undef SIMD_IWRAPPER_2I_16
|
||||
//#undef SIMD_IWRAPPER_2I_32
|
||||
//#undef SIMD_IWRAPPER_2I_64
|
||||
|
|
|
@ -0,0 +1,127 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD256 AVX (512) implementation for Core processors
|
||||
//
|
||||
// Since this implementation inherits from the AVX (2) implementation,
|
||||
// the only operations below are ones that replace AVX (2) operations.
|
||||
// These use native AVX512 instructions with masking to enable a larger
|
||||
// register set.
|
||||
//============================================================================
|
||||
|
||||
// Unary Double wrapper generator. Defines `op(Double)` by forwarding to the
// zero-masking intrinsic _mm512_maskz_<intrin>; `mask` selects which 64-bit
// lanes are computed (the maskz form zeroes the unselected lanes).
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
    static SIMDINLINE Double SIMDCALL op(Double a) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
    }

// Shorthand for ops whose name equals the intrinsic suffix. Mask 0xf enables
// the low four 64-bit lanes, i.e. 256 bits of the 512-bit register.
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
|
||||
|
||||
// Unary Double wrapper generator with a compile-time immediate. Defines
// `op<ImmT>(Double)` and passes ImmT as the trailing argument of the
// zero-masking intrinsic _mm512_maskz_<intrin>.
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
    template <int ImmT> \
    static SIMDINLINE Double SIMDCALL op(Double a) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
    }

// Shorthand for ops whose name equals the intrinsic suffix; mask 0xf enables
// the low four 64-bit lanes.
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
|
||||
|
||||
// Binary Double wrapper generator. Defines `op(Double, Double)` by forwarding
// both operands to the zero-masking intrinsic _mm512_maskz_<intrin> under
// the given lane mask.
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
    }

// Shorthand for ops whose name equals the intrinsic suffix; mask 0xf enables
// the low four 64-bit lanes.
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
|
||||
|
||||
// Unary Integer wrapper generator. Defines `op(Integer)` by forwarding to the
// zero-masking intrinsic _mm512_maskz_<intrin> under the given lane mask.
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
    }

// Element-width shorthands. Each mask enables the low 256 bits' worth of
// lanes: 32 x 8-bit, 16 x 16-bit, or 4 x 64-bit.
#define SIMD_IWRAPPER_1_8(op)  SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
|
||||
|
||||
// Unary Integer wrapper generator with a compile-time immediate. Defines
// `op<ImmT>(Integer)` and passes ImmT as the trailing argument of the
// zero-masking intrinsic _mm512_maskz_<intrin>.
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
    }

// Element-width shorthands. Each mask enables the low 256 bits' worth of
// lanes: 32 x 8-bit, 16 x 16-bit, or 4 x 64-bit.
#define SIMD_IWRAPPER_1I_8(op)  SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
|
||||
|
||||
// Binary Integer wrapper generator. Defines `op(Integer, Integer)` by
// forwarding both operands to the zero-masking intrinsic
// _mm512_maskz_<intrin> under the given lane mask.
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    { \
        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
    }

// Element-width shorthands. Each mask enables the low 256 bits' worth of
// lanes: 32 x 8-bit, 16 x 16-bit, or 4 x 64-bit.
#define SIMD_IWRAPPER_2_8(op)  SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
|
||||
|
||||
|
||||
// Integer arithmetic.
SIMD_IWRAPPER_2_8(add_epi8);    // a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8);   // ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2_64(sub_epi64);  // a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8);   // (b > a) ? 0 : (a - b) (uint8)

// Narrowing packs. See the documentation of the matching _mm256_* and
// _mm512_* intrinsics for details.
SIMD_IWRAPPER_2_8(packs_epi16);    // int16  --> int8
SIMD_IWRAPPER_2_16(packs_epi32);   // int32  --> int16
SIMD_IWRAPPER_2_8(packus_epi16);   // uint16 --> uint8
SIMD_IWRAPPER_2_16(packus_epi32);  // uint32 --> uint16

// Interleaves, listed by half and then by element width.
SIMD_IWRAPPER_2_8(unpacklo_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
|
||||
|
||||
// Gather the top bit of each of the low 32 bytes of `a` into a 32-bit value:
// result bit i is set when (byte i & 0x80) is non-zero.
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
    // Only the low 32 byte lanes take part in the test.
    __mmask64 activeBytes = 0xffffffffull;
    const __mmask64 topBits =
        _mm512_mask_test_epi8_mask(activeBytes, __conv(a), _mm512_set1_epi8(0x80));
    return static_cast<uint32_t>(topBits);
}
|
||||
|
||||
// Retire the helper macros used by this file so they do not leak into
// whatever is included next.

// Double wrappers.
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2I

// Integer wrappers.
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2_
|
|
@ -0,0 +1,35 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD256 AVX (512) implementation for Knights Family
|
||||
//
|
||||
// Since this implementation inherits from the AVX (2) implementation,
|
||||
// the only operations below are ones that replace AVX (2) operations.
|
||||
// These use native AVX512 instructions with masking to enable a larger
|
||||
// register set.
|
||||
//============================================================================
|
||||
|
|
@ -25,7 +25,7 @@
|
|||
#endif
|
||||
|
||||
#if defined(__GNUC__) && !defined( __clang__) && !defined(__INTEL_COMPILER)
|
||||
// gcc missing these intrinsics
|
||||
// gcc as of 7.1 was missing these intrinsics
|
||||
#ifndef _mm512_cmpneq_ps_mask
|
||||
#define _mm512_cmpneq_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_NEQ_UQ)
|
||||
#endif
|
||||
|
@ -37,14 +37,13 @@
|
|||
#ifndef _mm512_cmplt_pd_mask
|
||||
#define _mm512_cmplt_pd_mask(a,b) _mm512_cmp_pd_mask((a),(b),_CMP_LT_OS)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD16 AVX512 (F) implementation
|
||||
// SIMD16 AVX512 (F) implementation (compatible with Knights and Core
|
||||
// processors)
|
||||
//
|
||||
// TODO: Optimize for KNL / KNH or for SKX??
|
||||
// For now probably optimizing more for KNL as that's where
|
||||
// immediate customers are.
|
||||
//============================================================================
|
||||
|
||||
static const int TARGET_SIMD_WIDTH = 16;
|
||||
|
@ -153,34 +152,11 @@ using SIMD256T = SIMD256Impl::AVX2Impl;
|
|||
}
|
||||
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
|
||||
|
||||
#define SIMD_EMU_IWRAPPER_2(op) \
|
||||
static SIMDINLINE \
|
||||
Integer SIMDCALL op(Integer a, Integer b)\
|
||||
{\
|
||||
return Integer\
|
||||
{\
|
||||
SIMD256T::op(a.v8[0], b.v8[0]),\
|
||||
SIMD256T::op(a.v8[1], b.v8[1]),\
|
||||
};\
|
||||
}
|
||||
|
||||
private:
|
||||
static SIMDINLINE Integer vmask(__mmask8 m)
|
||||
{
|
||||
return _mm512_maskz_set1_epi64(m, -1LL);
|
||||
}
|
||||
static SIMDINLINE Integer vmask(__mmask16 m)
|
||||
{
|
||||
return _mm512_maskz_set1_epi32(m, -1);
|
||||
}
|
||||
static SIMDINLINE Integer vmask(__mmask32 m)
|
||||
{
|
||||
return _mm512_maskz_set1_epi16(m, -1);
|
||||
}
|
||||
static SIMDINLINE Integer vmask(__mmask64 m)
|
||||
{
|
||||
return _mm512_maskz_set1_epi8(m, -1);
|
||||
}
|
||||
|
||||
public:
|
||||
//-----------------------------------------------------------------------
|
||||
|
@ -236,21 +212,10 @@ SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // return (~a) & b (int)
|
|||
SIMD_IWRAPPER_2_(or_si, or_si512); // return a | b (int)
|
||||
SIMD_IWRAPPER_2_(xor_si, xor_si512); // return a ^ b (int)
|
||||
|
||||
#if defined(AVX512F_STRICT)
|
||||
|
||||
SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int)
|
||||
SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int)
|
||||
SIMD_WRAPPERI_2_(or_ps, or_epi32); // return a | b (float treated as int)
|
||||
SIMD_WRAPPERI_2_(xor_ps, xor_epi32); // return a ^ b (float treated as int)
|
||||
|
||||
#else
|
||||
|
||||
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
|
||||
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
|
||||
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
|
||||
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
|
||||
|
||||
#endif
|
||||
// SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
|
||||
// SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
|
||||
// SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
|
||||
// SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
|
@ -260,6 +225,17 @@ SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
|
|||
SIMD_IWRAPPER_2(sllv_epi32);
|
||||
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
|
||||
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
|
||||
|
||||
#if 0
|
||||
SIMD_IWRAPPER_1I_(srli_si, srli_si512); // return a >> (ImmT*8) (uint)
|
||||
|
||||
template<int ImmT> // same as srli_si, but with Float cast to int
|
||||
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
|
||||
{
|
||||
return castsi_ps(srli_si<ImmT>(castps_si(a)));
|
||||
}
|
||||
#endif
|
||||
|
||||
SIMD_IWRAPPER_2(srlv_epi32);
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
|
@ -461,17 +437,10 @@ static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
|
|||
return _mm512_inserti64x4(a, b, imm);
|
||||
}
|
||||
|
||||
#if !defined(AVX512F_STRICT)
|
||||
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16
|
||||
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32
|
||||
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16
|
||||
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32
|
||||
#else
|
||||
SIMD_EMU_IWRAPPER_2(packs_epi16)
|
||||
SIMD_EMU_IWRAPPER_2(packs_epi32)
|
||||
SIMD_EMU_IWRAPPER_2(packus_epi16)
|
||||
SIMD_EMU_IWRAPPER_2(packus_epi32)
|
||||
#endif
|
||||
// SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16
|
||||
// SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32
|
||||
// SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16
|
||||
// SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32
|
||||
|
||||
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
|
||||
{
|
||||
|
@ -704,4 +673,4 @@ static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
|
|||
#undef SIMD_IWRAPPER_2
|
||||
#undef SIMD_IWRAPPER_2_
|
||||
#undef SIMD_IWRAPPER_2I
|
||||
#undef SIMD_EMU_IWRAPPER_2
|
||||
|
||||
|
|
|
@ -0,0 +1,181 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD16 AVX512 (F) implementation for Core processors
|
||||
//
|
||||
//============================================================================
|
||||
|
||||
// Unary Float wrapper generator: defines `op(Float)` as a direct call to
// `intrin`. Here `intrin` is the complete intrinsic name; no _mm512_ prefix
// is pasted on.
#define SIMD_WRAPPER_1_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a) \
    { \
        return intrin(a); \
    }

// Shorthand that derives the intrinsic name as _mm512_<op>.
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op)
|
||||
|
||||
// Binary Float wrapper generator: defines `op(Float, Float)` as a call to
// _mm512_<intrin>. Unlike SIMD_WRAPPER_1_, the _mm512_ prefix is added here.
#define SIMD_WRAPPER_2_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    { \
        return _mm512_##intrin(a, b); \
    }

// Shorthand for ops whose name equals the intrinsic suffix.
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
|
||||
|
||||
// Binary Float wrapper generator for operations carried out by an integer
// intrinsic: both operands go through _mm512_castps_si512, are passed to
// _mm512_<intrin>, and the result comes back through _mm512_castsi512_ps.
#define SIMD_WRAPPERI_2_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    { \
        return _mm512_castsi512_ps( \
            _mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
    }
|
||||
|
||||
// Binary Double wrapper generator: defines `op(Double, Double)` as a call to
// _mm512_<op>.
#define SIMD_DWRAPPER_2(op) \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    { \
        return _mm512_##op(a, b); \
    }
|
||||
|
||||
// Binary Float wrapper generator with a compile-time immediate: defines
// `op<ImmT>(Float, Float)` and passes ImmT as the last argument of
// _mm512_<intrin>.
#define SIMD_WRAPPER_2I_(op, intrin) \
    template <int ImmT> \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    { \
        return _mm512_##intrin(a, b, ImmT); \
    }

// Shorthand for ops whose name equals the intrinsic suffix.
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
|
||||
|
||||
// Binary Double wrapper generator with a compile-time immediate: defines
// `op<ImmT>(Double, Double)` and passes ImmT as the last argument of
// _mm512_<intrin>.
#define SIMD_DWRAPPER_2I_(op, intrin) \
    template <int ImmT> \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    { \
        return _mm512_##intrin(a, b, ImmT); \
    }

// Shorthand for ops whose name equals the intrinsic suffix.
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
|
||||
|
||||
// Ternary Float wrapper generator: defines `op(Float, Float, Float)` as a
// call to _mm512_<op>.
#define SIMD_WRAPPER_3(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
    { \
        return _mm512_##op(a, b, c); \
    }
|
||||
|
||||
// Unary Integer wrapper generators. All three define `op(...)` as a call to
// _mm512_<op> returning this implementation's Integer; they differ only in
// the argument type they accept.

// Argument is this implementation's Integer.
#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    { \
        return _mm512_##op(a); \
    }

// Argument is SIMD256Impl::Integer.
#define SIMD_IWRAPPER_1_8(op) \
    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \
    { \
        return _mm512_##op(a); \
    }

// Argument is SIMD128Impl::Integer.
#define SIMD_IWRAPPER_1_4(op) \
    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \
    { \
        return _mm512_##op(a); \
    }
|
||||
|
||||
// Unary Integer wrapper generator with a compile-time immediate: defines
// `op<ImmT>(Integer)` as `intrin(a, ImmT)`. Here `intrin` is the complete
// intrinsic name; no _mm512_ prefix is pasted on.
#define SIMD_IWRAPPER_1I_(op, intrin) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    { \
        return intrin(a, ImmT); \
    }

// Shorthand that derives the intrinsic name as _mm512_<op>.
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
|
||||
|
||||
// Binary Integer wrapper generator: defines `op(Integer, Integer)` as a call
// to _mm512_<intrin>.
#define SIMD_IWRAPPER_2_(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    { \
        return _mm512_##intrin(a, b); \
    }

// Shorthand for ops whose name equals the intrinsic suffix.
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
|
||||
|
||||
// Binary Integer wrapper generator for comparisons: defines
// `op(Integer, Integer)` as `cmp(a, b)`, where `cmp` is the complete name of
// the callable to invoke.
#define SIMD_IWRAPPER_2_CMP(op, cmp) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    { \
        return cmp(a, b); \
    }
|
||||
|
||||
// Binary Integer wrapper generator for operations carried out by a Float
// intrinsic: both operands go through castsi_ps, are passed to
// _mm512_<intrin>, and the result comes back through castps_si.
#define SIMD_IFWRAPPER_2(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    { \
        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
    }
|
||||
|
||||
// Binary Integer wrapper generator with a compile-time immediate: defines
// `op<ImmT>(Integer, Integer)` and passes ImmT as the last argument of
// _mm512_<intrin>.
#define SIMD_IWRAPPER_2I_(op, intrin) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    { \
        return _mm512_##intrin(a, b, ImmT); \
    }

// Shorthand for ops whose name equals the intrinsic suffix.
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
|
||||
|
||||
private:
|
||||
// Mask-to-vector helpers. Each overload turns a k-mask into a vector whose
// selected elements are all-ones and whose other elements are zero; the
// element width follows the mask type.
// NOTE(review): there is no __mmask16 (32-bit element) overload in this
// file, although the Knights variant has one. Confirm that is intentional.

// 8 x 64-bit elements.
static SIMDINLINE Integer vmask(__mmask8 m)
{
    return _mm512_maskz_set1_epi64(m, -1LL);
}

// 32 x 16-bit elements.
static SIMDINLINE Integer vmask(__mmask32 m)
{
    return _mm512_maskz_set1_epi16(m, -1);
}

// 64 x 8-bit elements.
static SIMDINLINE Integer vmask(__mmask64 m)
{
    return _mm512_maskz_set1_epi8(m, -1);
}
|
||||
|
||||
public:
|
||||
// Bitwise logic on Float operands (float bits treated as integers), mapped
// straight onto the _mm512_*_ps intrinsics.
SIMD_WRAPPER_2(and_ps);     // a & b
SIMD_WRAPPER_2(andnot_ps);  // (~a) & b
SIMD_WRAPPER_2(or_ps);      // a | b
SIMD_WRAPPER_2(xor_ps);     // a ^ b

// Narrowing packs, mapped straight onto the 512-bit intrinsics.
SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm512_packus_epi32
|
||||
|
||||
// Retire every helper macro defined in this file so none of them leaks into
// files included afterwards. Each name is listed exactly once. This list is
// a superset of the previous one: it adds the macros that were defined above
// but never undefined (SIMD_DWRAPPER_2I_, SIMD_IWRAPPER_1_8,
// SIMD_IWRAPPER_1_4, SIMD_IWRAPPER_2_CMP, SIMD_IWRAPPER_2I_).

// Float / Double wrappers.
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_2I
#undef SIMD_DWRAPPER_2I_
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_3
// SIMD_WRAPPER_3_ is not defined in this file; its #undef is carried over
// from the previous list and is a no-op when the macro is not defined.
#undef SIMD_WRAPPER_3_

// Integer wrappers.
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_4
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_CMP
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2I
|
||||
|
|
@ -0,0 +1,183 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
//============================================================================
|
||||
// SIMD16 AVX512 (F) implementation for Knights Family Processors
|
||||
//
|
||||
//============================================================================
|
||||
|
||||
// Target SIMD width for this implementation (16, matching the "SIMD16" in
// the banner above).
// NOTE(review): simdlib_512_avx512.inl appears to declare TARGET_SIMD_WIDTH
// and SIMD256T as well. If this file is included into the same struct as
// that one, these two lines are redeclarations; confirm against simdlib.hpp.
static const int TARGET_SIMD_WIDTH = 16;
|
||||
// Alias for the 256-bit AVX2 implementation type.
using SIMD256T = SIMD256Impl::AVX2Impl;
|
||||
|
||||
// Unary Float wrapper generator: defines `op(Float)` as a direct call to
// `intrin`. Here `intrin` is the complete intrinsic name; no _mm512_ prefix
// is pasted on.
#define SIMD_WRAPPER_1_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a) \
    { \
        return intrin(a); \
    }

// Shorthand that derives the intrinsic name as _mm512_<op>.
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op)
|
||||
|
||||
// Binary Float wrapper generator: defines `op(Float, Float)` as a call to
// _mm512_<intrin>. Unlike SIMD_WRAPPER_1_, the _mm512_ prefix is added here.
#define SIMD_WRAPPER_2_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    { \
        return _mm512_##intrin(a, b); \
    }

// Shorthand for ops whose name equals the intrinsic suffix.
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
|
||||
|
||||
// Binary Float wrapper generator for operations carried out by an integer
// intrinsic: both operands go through _mm512_castps_si512, are passed to
// _mm512_<intrin>, and the result comes back through _mm512_castsi512_ps.
#define SIMD_WRAPPERI_2_(op, intrin) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    { \
        return _mm512_castsi512_ps( \
            _mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
    }
|
||||
|
||||
// Binary Double wrapper generator: defines `op(Double, Double)` as a call to
// _mm512_<op>.
#define SIMD_DWRAPPER_2(op) \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    { \
        return _mm512_##op(a, b); \
    }
|
||||
|
||||
// Binary Float wrapper generator with a compile-time immediate: defines
// `op<ImmT>(Float, Float)` and passes ImmT as the last argument of
// _mm512_<intrin>.
#define SIMD_WRAPPER_2I_(op, intrin) \
    template <int ImmT> \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
    { \
        return _mm512_##intrin(a, b, ImmT); \
    }

// Shorthand for ops whose name equals the intrinsic suffix.
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
|
||||
|
||||
// Binary Double wrapper generator with a compile-time immediate: defines
// `op<ImmT>(Double, Double)` and passes ImmT as the last argument of
// _mm512_<intrin>.
#define SIMD_DWRAPPER_2I_(op, intrin) \
    template <int ImmT> \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
    { \
        return _mm512_##intrin(a, b, ImmT); \
    }

// Shorthand for ops whose name equals the intrinsic suffix.
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
|
||||
|
||||
// Ternary Float wrapper generator: defines `op(Float, Float, Float)` as a
// call to _mm512_<op>.
#define SIMD_WRAPPER_3(op) \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
    { \
        return _mm512_##op(a, b, c); \
    }
|
||||
|
||||
// Unary Integer wrapper generators. All three define `op(...)` as a call to
// _mm512_<op> returning this implementation's Integer; they differ only in
// the argument type they accept.

// Argument is this implementation's Integer.
#define SIMD_IWRAPPER_1(op) \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    { \
        return _mm512_##op(a); \
    }

// Argument is SIMD256Impl::Integer.
#define SIMD_IWRAPPER_1_8(op) \
    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \
    { \
        return _mm512_##op(a); \
    }

// Argument is SIMD128Impl::Integer.
#define SIMD_IWRAPPER_1_4(op) \
    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \
    { \
        return _mm512_##op(a); \
    }
|
||||
|
||||
// Unary Integer wrapper generator with a compile-time immediate: defines
// `op<ImmT>(Integer)` as `intrin(a, ImmT)`. Here `intrin` is the complete
// intrinsic name; no _mm512_ prefix is pasted on.
#define SIMD_IWRAPPER_1I_(op, intrin) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a) \
    { \
        return intrin(a, ImmT); \
    }

// Shorthand that derives the intrinsic name as _mm512_<op>.
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
|
||||
|
||||
// Binary Integer wrapper generator: defines `op(Integer, Integer)` as a call
// to _mm512_<intrin>.
#define SIMD_IWRAPPER_2_(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    { \
        return _mm512_##intrin(a, b); \
    }

// Shorthand for ops whose name equals the intrinsic suffix.
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
|
||||
|
||||
// Binary Integer wrapper generator for comparisons: defines
// `op(Integer, Integer)` as `cmp(a, b)`, where `cmp` is the complete name of
// the callable to invoke.
#define SIMD_IWRAPPER_2_CMP(op, cmp) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    { \
        return cmp(a, b); \
    }
|
||||
|
||||
// Binary Integer wrapper generator for operations carried out by a Float
// intrinsic: both operands go through castsi_ps, are passed to
// _mm512_<intrin>, and the result comes back through castps_si.
#define SIMD_IFWRAPPER_2(op, intrin) \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    { \
        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
    }
|
||||
|
||||
// Binary Integer wrapper generator with a compile-time immediate: defines
// `op<ImmT>(Integer, Integer)` and passes ImmT as the last argument of
// _mm512_<intrin>.
#define SIMD_IWRAPPER_2I_(op, intrin) \
    template <int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
    { \
        return _mm512_##intrin(a, b, ImmT); \
    }

// Shorthand for ops whose name equals the intrinsic suffix.
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
|
||||
|
||||
private:
|
||||
// Mask-to-vector helpers. Each overload turns a k-mask into a vector whose
// selected elements are all-ones and whose other elements are zero; the
// element width follows the mask type.
// NOTE(review): the Intel Intrinsics Guide lists _mm512_maskz_set1_epi16 and
// _mm512_maskz_set1_epi8 under AVX512BW. Confirm the __mmask32 / __mmask64
// overloads are usable (or never instantiated) on a Knights build.

// 8 x 64-bit elements.
static SIMDINLINE Integer vmask(__mmask8 m)
{
    return _mm512_maskz_set1_epi64(m, -1LL);
}

// 16 x 32-bit elements.
static SIMDINLINE Integer vmask(__mmask16 m)
{
    return _mm512_maskz_set1_epi32(m, -1);
}

// 32 x 16-bit elements.
static SIMDINLINE Integer vmask(__mmask32 m)
{
    return _mm512_maskz_set1_epi16(m, -1);
}

// 64 x 8-bit elements.
static SIMDINLINE Integer vmask(__mmask64 m)
{
    return _mm512_maskz_set1_epi8(m, -1);
}
|
||||
|
||||
public:
|
||||
// Bitwise logic on Float operands (float bits treated as integers). These are
// routed through the 32-bit integer logic intrinsics, with register casts on
// the way in and out, rather than through the _mm512_*_ps forms.
SIMD_WRAPPERI_2_(and_ps, and_epi32);        // a & b
SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32);  // (~a) & b
SIMD_WRAPPERI_2_(or_ps, or_epi32);          // a | b
SIMD_WRAPPERI_2_(xor_ps, xor_epi32);        // a ^ b
|
||||
|
||||
// Retire every helper macro defined in this file so none of them leaks into
// files included afterwards. Each name is listed exactly once. This list is
// a superset of the previous one: it adds the macros that were defined above
// but never undefined (SIMD_DWRAPPER_2I_, SIMD_IWRAPPER_1_8,
// SIMD_IWRAPPER_1_4, SIMD_IWRAPPER_2_CMP, SIMD_IWRAPPER_2I_).

// Float / Double wrappers.
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_2I
#undef SIMD_DWRAPPER_2I_
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_3
// SIMD_WRAPPER_3_ is not defined in this file; its #undef is carried over
// from the previous list and is a no-op when the macro is not defined.
#undef SIMD_WRAPPER_3_

// Integer wrappers.
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_4
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_CMP
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2I
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
// Implement mask-enabled SIMD functions
|
|
@ -0,0 +1,27 @@
|
|||
/****************************************************************************
|
||||
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
****************************************************************************/
|
||||
#if !defined(__SIMD_LIB_AVX512_HPP__)
|
||||
#error Do not include this file directly, use "simdlib.hpp" instead.
|
||||
#endif
|
||||
|
||||
// Implement mask-enabled SIMD functions
|
|
@ -262,7 +262,7 @@ namespace SIMDImpl
|
|||
|
||||
namespace SIMD512Impl
|
||||
{
|
||||
#if !defined(__AVX512F__)
|
||||
#if !(defined(__AVX512F__) || defined(_MM_K0_REG))
|
||||
// Define AVX512 types if not included via immintrin.h.
|
||||
// All data members of these types are ONLY to be viewed
|
||||
// in a debugger. Do NOT access them via code!
|
||||
|
|
Loading…
Reference in New Issue