swr/rast: simdlib better separation of core vs knights avx512

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
Tim Rowley 2017-07-24 16:13:12 -05:00
parent e1091b0861
commit 07062daae9
15 changed files with 911 additions and 245 deletions

View File

@ -285,7 +285,7 @@ lib_LTLIBRARIES += libswrKNL.la
libswrKNL_la_CXXFLAGS = \
$(PTHREAD_CFLAGS) \
$(SWR_KNL_CXXFLAGS) \
-DKNOB_ARCH=KNOB_ARCH_AVX512 -DAVX512F_STRICT \
-DKNOB_ARCH=KNOB_ARCH_AVX512 -DSIMD_ARCH_KNIGHTS \
$(COMMON_CXXFLAGS)
libswrKNL_la_SOURCES = \

View File

@ -69,11 +69,19 @@ COMMON_CXX_SOURCES := \
rasterizer/common/simdlib_128_avx.inl \
rasterizer/common/simdlib_128_avx2.inl \
rasterizer/common/simdlib_128_avx512.inl \
rasterizer/common/simdlib_128_avx512_core.inl \
rasterizer/common/simdlib_128_avx512_knights.inl \
rasterizer/common/simdlib_256_avx.inl \
rasterizer/common/simdlib_256_avx2.inl \
rasterizer/common/simdlib_256_avx512.inl \
rasterizer/common/simdlib_256_avx512_core.inl \
rasterizer/common/simdlib_256_avx512_knights.inl \
rasterizer/common/simdlib_512_avx512.inl \
rasterizer/common/simdlib_512_avx512_core.inl \
rasterizer/common/simdlib_512_avx512_knights.inl \
rasterizer/common/simdlib_512_avx512_masks.inl \
rasterizer/common/simdlib_512_avx512_masks_core.inl \
rasterizer/common/simdlib_512_avx512_masks_knights.inl \
rasterizer/common/simdlib_512_emu.inl \
rasterizer/common/simdlib_512_emu_masks.inl \
rasterizer/common/simdlib_interface.hpp \

View File

@ -55,6 +55,11 @@ namespace SIMDImpl
{
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_128_avx512.inl"
#if defined(SIMD_ARCH_KNIGHTS)
#include "simdlib_128_avx512_knights.inl"
#else // optimize for core
#include "simdlib_128_avx512_core.inl"
#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
}; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
@ -105,6 +110,11 @@ namespace SIMDImpl
{
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_256_avx512.inl"
#if defined(SIMD_ARCH_KNIGHTS)
#include "simdlib_256_avx512_knights.inl"
#else // optimize for core
#include "simdlib_256_avx512_core.inl"
#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
}; // struct AVX2Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
@ -150,13 +160,20 @@ namespace SIMDImpl
#if SIMD_ARCH >= SIMD_ARCH_AVX512
struct AVX512Impl
struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
{
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_512_avx512.inl"
#include "simdlib_512_avx512_masks.inl"
#if defined(SIMD_ARCH_KNIGHTS)
#include "simdlib_512_avx512_knights.inl"
#include "simdlib_512_avx512_masks_knights.inl"
#else // optimize for core
#include "simdlib_512_avx512_core.inl"
#include "simdlib_512_avx512_masks_core.inl"
#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
}; // struct AVX512Impl
}; // struct AVX512ImplBase
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
struct Traits : SIMDImpl::Traits

View File

@ -78,34 +78,6 @@ public:
}
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
#endif
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
#endif
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
#endif
#define SIMD_DWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
@ -119,11 +91,6 @@ public:
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
#endif
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
@ -132,11 +99,6 @@ public:
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
#endif
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
@ -144,11 +106,6 @@ public:
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
#endif
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT>\
@ -182,12 +139,8 @@ SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
#endif
// SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
// return (a * b) & 0xFFFFFFFF
//
@ -196,12 +149,8 @@ SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uin
SIMD_IWRAPPER_2_32(mullo_epi32);
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
#endif
// SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
// SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
@ -253,14 +202,10 @@ SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
#endif
// SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
// SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
// SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
@ -278,16 +223,12 @@ SIMD_IWRAPPER_1I_32(shuffle_epi32);
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);
#endif
// SIMD_IWRAPPER_2_16(unpackhi_epi16);
// SIMD_IWRAPPER_2_64(unpackhi_epi64);
// SIMD_IWRAPPER_2_8(unpackhi_epi8);
// SIMD_IWRAPPER_2_16(unpacklo_epi16);
// SIMD_IWRAPPER_2_64(unpacklo_epi64);
// SIMD_IWRAPPER_2_8(unpacklo_epi8);
//-----------------------------------------------------------------------
// Load / store operations
@ -338,16 +279,12 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, In
static_cast<int>(ScaleT)));
}
#if !defined(AVX512F_STRICT)
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
__mmask64 m = 0xffffull;
return static_cast<uint32_t>(
_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
}
#endif
// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
// {
// __mmask64 m = 0xffffull;
// return static_cast<uint32_t>(
// _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
// }
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
@ -366,6 +303,11 @@ static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
_mm512_mask_storeu_epi32(p, __mmask16(0xf), __conv(a));
}
// Expand the low 4 bits of `mask` into a vector mask: the intrinsic writes -1
// (all bits set) into each 32-bit lane whose mask bit is set and zeroes the
// others; the result is reinterpreted as Float via castsi_ps.
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    // Bits above bit 3 of `mask` are discarded before use.
    const __mmask16 laneBits = __mmask16(mask & 0xf);
    return castsi_ps(__conv(_mm512_maskz_set1_epi32(laneBits, -1)));
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================

View File

@ -0,0 +1,193 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (512) implementation for Core processors
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are ones that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================
// Float wrapper generators.
// Each macro expands to a static member function named `op` that forwards to
// the zero-masking intrinsic _mm512_maskz_<intrin>, passing `mask` as the
// write mask and converting every argument and the result through __conv.
// NOTE(review): none of the SIMD_WRAPPER_* macros is instantiated in this
// file; confirm whether these definitions are still needed here.

// One Float operand; caller supplies intrinsic suffix and mask.
#define SIMD_WRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
// Convenience form: intrinsic suffix equals `op`, mask has the low 4 bits set.
#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
// One Float operand plus a compile-time immediate (template parameter ImmT).
#define SIMD_WRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
// Convenience form of SIMD_WRAPPER_1I_ with mask __mmask16(0xf).
#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
// Two Float operands.
#define SIMD_WRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
// Convenience form of SIMD_WRAPPER_2_ with mask __mmask16(0xf).
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
// Two Float operands plus an immediate; the mask is the literal 0xf and the
// intrinsic suffix is always `op` (there is no underscore-suffixed variant).
#define SIMD_WRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
}
// Three Float operands.
#define SIMD_WRAPPER_3_(op, intrin, mask) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
}
// Convenience form of SIMD_WRAPPER_3_ with mask __mmask16(0xf).
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
// Double wrapper generators.
// Same pattern as the Float wrappers: each macro expands to a static member
// function `op` that calls _mm512_maskz_<intrin> with the given write mask and
// routes arguments and result through __conv. The convenience forms use
// __mmask8(0x3), i.e. the low two mask bits.
// NOTE(review): none of the SIMD_DWRAPPER_* macros is instantiated in this
// file; confirm whether these definitions are still needed here.

// One Double operand; caller supplies intrinsic suffix and mask.
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
// Convenience form: intrinsic suffix equals `op`, mask is __mmask8(0x3).
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
// One Double operand plus a compile-time immediate (template parameter ImmT).
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
// Convenience form of SIMD_DWRAPPER_1I_ with mask __mmask8(0x3).
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
// Two Double operands.
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
// Convenience form of SIMD_DWRAPPER_2_ with mask __mmask8(0x3).
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
// Two Double operands plus an immediate; the mask is the literal 0x3 and the
// intrinsic suffix is always `op`.
#define SIMD_DWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));\
}
// Integer wrapper generators.
// Each macro expands to a static member function `op` that calls
// _mm512_maskz_<intrin> with the given write mask and routes arguments and
// result through __conv. The _8 / _16 / _64 convenience forms pick the mask
// type and value for 8-, 16- and 64-bit elements respectively:
//   _8  -> __mmask64(0xffffull)  (low 16 mask bits)
//   _16 -> __mmask32(0xff)       (low 8 mask bits)
//   _64 -> __mmask8(0x3)         (low 2 mask bits)
// No _32 convenience forms are defined in this file.

// One Integer operand; caller supplies intrinsic suffix and mask.
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
// One Integer operand plus a compile-time immediate (template parameter ImmT).
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
// Two Integer operands. This is the only family instantiated in this file.
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
// Two Integer operands plus an immediate.
// NOTE(review): the mask is hard-coded to 0xf regardless of element width,
// unlike the width-specific forms above -- confirm before using this with
// anything other than 32-bit element operations.
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
}
// Two-operand integer operations on 8-, 16- and 64-bit elements.
// Each line expands to a static member function of the same name that calls
// the matching _mm512_maskz_<name> intrinsic with the width-specific mask.

// Arithmetic
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
// Pack (narrowing) operations; the mask width follows the result element size
SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
// Unpack operations; each forwards to the _mm512_maskz_unpack{hi,lo}_epi{8,16,64}
// intrinsic of the same name (see the Intel documentation for lane ordering)
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);
// Gather a per-byte bitmask from `a`: each byte is tested against 0x80 and
// the resulting mask is returned truncated to 32 bits.
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
    // Only the low 16 mask bits are enabled, so only 16 bytes are tested.
    const __mmask64 activeBytes = 0xffffull;
    const __mmask64 testedBits =
        _mm512_mask_test_epi8_mask(activeBytes, __conv(a), _mm512_set1_epi8(0x80));
    return static_cast<uint32_t>(testedBits);
}
// Remove every wrapper-generator macro so that none of them remains defined
// after this file has been included.
// Some names below (SIMD_IWRAPPER_1_32, SIMD_IWRAPPER_1I_32,
// SIMD_IWRAPPER_2_32) are not defined in this file; #undef of a name that is
// not currently defined as a macro is ignored.

// Float wrappers
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_1I_
#undef SIMD_WRAPPER_1I
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
// Double wrappers
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
// Integer wrappers
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2I
// Width-specific 2I variants are not defined in this file, so their #undefs
// are left commented out.
//#undef SIMD_IWRAPPER_2I_8
//#undef SIMD_IWRAPPER_2I_16
//#undef SIMD_IWRAPPER_2I_32
//#undef SIMD_IWRAPPER_2I_64

View File

@ -0,0 +1,35 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD128 AVX (512) implementation for Knights Family
//
// Since this implementation inherits from the AVX512Base implementation,
// the only operations below are ones that replace AVX512F / AVX512CD operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================

View File

@ -78,34 +78,6 @@ public:
}
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xff))
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
#endif
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
#endif
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#if !defined(AVX512F_STRICT)
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
#endif
#define SIMD_DWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
@ -119,11 +91,6 @@ public:
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
#endif
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
@ -132,11 +99,6 @@ public:
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
#endif
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
@ -144,11 +106,6 @@ public:
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))
#if !defined(AVX512F_STRICT)
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
#endif
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT>\
@ -182,12 +139,8 @@ SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
#endif
// SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
// return (a * b) & 0xFFFFFFFF
//
@ -196,12 +149,8 @@ SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uin
SIMD_IWRAPPER_2_32(mullo_epi32);
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
#endif
// SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
// SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
@ -253,14 +202,10 @@ SIMD_IWRAPPER_2_32(srlv_epi32); // return a >> b (uint32)
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
#endif
// SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
// SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
// SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
@ -279,16 +224,12 @@ SIMD_IWRAPPER_1I_32(shuffle_epi32);
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);
#endif
// SIMD_IWRAPPER_2_16(unpackhi_epi16);
// SIMD_IWRAPPER_2_64(unpackhi_epi64);
// SIMD_IWRAPPER_2_8(unpackhi_epi8);
// SIMD_IWRAPPER_2_16(unpacklo_epi16);
// SIMD_IWRAPPER_2_64(unpacklo_epi64);
// SIMD_IWRAPPER_2_8(unpacklo_epi8);
//-----------------------------------------------------------------------
// Load / store operations
@ -339,16 +280,12 @@ static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, In
static_cast<int>(ScaleT)));
}
#if !defined(AVX512F_STRICT)
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
__mmask64 m = 0xffffffffull;
return static_cast<uint32_t>(
_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
}
#endif
// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
// {
// __mmask64 m = 0xffffffffull;
// return static_cast<uint32_t>(
// _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
// }
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
@ -367,6 +304,11 @@ static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
_mm512_mask_storeu_epi32(p, __mmask16(0xff), __conv(a));
}
// Expand the low 8 bits of `mask` into a vector mask: the intrinsic writes -1
// (all bits set) into each 32-bit lane whose mask bit is set and zeroes the
// others; the result is reinterpreted as Float via castsi_ps.
static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
{
    // Bits above bit 7 of `mask` are discarded before use.
    const __mmask16 laneBits = __mmask16(mask & 0xff);
    return castsi_ps(__conv(_mm512_maskz_set1_epi32(laneBits, -1)));
}
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
@ -380,30 +322,10 @@ static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_32
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_32
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_32
#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2I
//#undef SIMD_IWRAPPER_2I_8
//#undef SIMD_IWRAPPER_2I_16
//#undef SIMD_IWRAPPER_2I_32
//#undef SIMD_IWRAPPER_2I_64

View File

@ -0,0 +1,127 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD256 AVX (512) implementation for Core processors
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are ones that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================
// Double wrapper generators.
// Each macro expands to a static member function `op` that calls the
// zero-masking intrinsic _mm512_maskz_<intrin> with the given write mask and
// routes arguments and result through __conv. The convenience forms use
// __mmask8(0xf), i.e. the low four mask bits.
// NOTE(review): none of the SIMD_DWRAPPER_* macros is instantiated in this
// file; confirm whether these definitions are still needed here.

// One Double operand; caller supplies intrinsic suffix and mask.
#define SIMD_DWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
// Convenience form: intrinsic suffix equals `op`, mask is __mmask8(0xf).
#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
// One Double operand plus a compile-time immediate (template parameter ImmT).
#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Double SIMDCALL op(Double a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
// Convenience form of SIMD_DWRAPPER_1I_ with mask __mmask8(0xf).
#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
// Two Double operands.
#define SIMD_DWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
// Convenience form of SIMD_DWRAPPER_2_ with mask __mmask8(0xf).
#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
// Integer wrapper generators.
// Each macro expands to a static member function `op` that calls
// _mm512_maskz_<intrin> with the given write mask and routes arguments and
// result through __conv. The _8 / _16 / _64 convenience forms pick the mask
// type and value for 8-, 16- and 64-bit elements respectively:
//   _8  -> __mmask64(0xffffffffull)  (low 32 mask bits)
//   _16 -> __mmask32(0xffff)         (low 16 mask bits)
//   _64 -> __mmask8(0xf)             (low 4 mask bits)
// No _32 convenience forms are defined in this file.

// One Integer operand; caller supplies intrinsic suffix and mask.
#define SIMD_IWRAPPER_1_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
// One Integer operand plus a compile-time immediate (template parameter ImmT).
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
// Two Integer operands. This is the only family instantiated in this file.
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
// Two-operand integer operations on 8-, 16- and 64-bit elements.
// Each line expands to a static member function of the same name that calls
// the matching _mm512_maskz_<name> intrinsic with the width-specific mask.

// Arithmetic
SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
// Pack (narrowing) operations; the mask width follows the result element size
SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
// Unpack operations; each forwards to the _mm512_maskz_unpack{hi,lo}_epi{8,16,64}
// intrinsic of the same name (see the Intel documentation for lane ordering)
SIMD_IWRAPPER_2_16(unpackhi_epi16);
SIMD_IWRAPPER_2_64(unpackhi_epi64);
SIMD_IWRAPPER_2_8(unpackhi_epi8);
SIMD_IWRAPPER_2_16(unpacklo_epi16);
SIMD_IWRAPPER_2_64(unpacklo_epi64);
SIMD_IWRAPPER_2_8(unpacklo_epi8);
// Gather a per-byte bitmask from `a`: each byte is tested against 0x80 and
// the resulting mask is returned truncated to 32 bits.
static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
{
    // Only the low 32 mask bits are enabled, so only 32 bytes are tested.
    const __mmask64 activeBytes = 0xffffffffull;
    const __mmask64 testedBits =
        _mm512_mask_test_epi8_mask(activeBytes, __conv(a), _mm512_set1_epi8(0x80));
    return static_cast<uint32_t>(testedBits);
}
// Remove every wrapper-generator macro so that none of them remains defined
// after this file has been included.
// SIMD_DWRAPPER_2I is not defined in this file; #undef of a name that is not
// currently defined as a macro is ignored.

// Double wrappers
#undef SIMD_DWRAPPER_1_
#undef SIMD_DWRAPPER_1
#undef SIMD_DWRAPPER_1I_
#undef SIMD_DWRAPPER_1I
#undef SIMD_DWRAPPER_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_DWRAPPER_2I
// Integer wrappers
#undef SIMD_IWRAPPER_1_
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I_8
#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2_8
#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_64

View File

@ -0,0 +1,35 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD256 AVX (512) implementation for Knights Family
//
// Since this implementation inherits from the AVX (2) implementation,
// the only operations below are ones that replace AVX (2) operations.
// These use native AVX512 instructions with masking to enable a larger
// register set.
//============================================================================

View File

@ -25,7 +25,7 @@
#endif
#if defined(__GNUC__) && !defined( __clang__) && !defined(__INTEL_COMPILER)
// gcc missing these intrinsics
// gcc as of 7.1 was missing these intrinsics
#ifndef _mm512_cmpneq_ps_mask
#define _mm512_cmpneq_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_NEQ_UQ)
#endif
@ -37,14 +37,13 @@
#ifndef _mm512_cmplt_pd_mask
#define _mm512_cmplt_pd_mask(a,b) _mm512_cmp_pd_mask((a),(b),_CMP_LT_OS)
#endif
#endif
//============================================================================
// SIMD16 AVX512 (F) implementation
// SIMD16 AVX512 (F) implementation (compatible with Knights and Core
// processors)
//
// TODO: Optimize for KNL / KNH or for SKX??
// For now probably optimizing more for KNL as that's where
// immediate customers are.
//============================================================================
static const int TARGET_SIMD_WIDTH = 16;
@ -153,34 +152,11 @@ using SIMD256T = SIMD256Impl::AVX2Impl;
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
#define SIMD_EMU_IWRAPPER_2(op) \
static SIMDINLINE \
Integer SIMDCALL op(Integer a, Integer b)\
{\
return Integer\
{\
SIMD256T::op(a.v8[0], b.v8[0]),\
SIMD256T::op(a.v8[1], b.v8[1]),\
};\
}
private:
static SIMDINLINE Integer vmask(__mmask8 m)
{
return _mm512_maskz_set1_epi64(m, -1LL);
}
static SIMDINLINE Integer vmask(__mmask16 m)
{
return _mm512_maskz_set1_epi32(m, -1);
}
static SIMDINLINE Integer vmask(__mmask32 m)
{
return _mm512_maskz_set1_epi16(m, -1);
}
static SIMDINLINE Integer vmask(__mmask64 m)
{
return _mm512_maskz_set1_epi8(m, -1);
}
public:
//-----------------------------------------------------------------------
@ -236,21 +212,10 @@ SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // return (~a) & b (int)
SIMD_IWRAPPER_2_(or_si, or_si512); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_si512); // return a ^ b (int)
#if defined(AVX512F_STRICT)
SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int)
SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int)
SIMD_WRAPPERI_2_(or_ps, or_epi32); // return a | b (float treated as int)
SIMD_WRAPPERI_2_(xor_ps, xor_epi32); // return a ^ b (float treated as int)
#else
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
#endif
// SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
// SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
// SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
// SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
//-----------------------------------------------------------------------
@ -260,6 +225,17 @@ SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
SIMD_IWRAPPER_2(sllv_epi32);
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
#if 0
SIMD_IWRAPPER_1I_(srli_si, srli_si512); // return a >> (ImmT*8) (uint)
template<int ImmT> // same as srli_si, but with Float cast to int
static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
{
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
#endif
SIMD_IWRAPPER_2(srlv_epi32);
//-----------------------------------------------------------------------
@ -461,17 +437,10 @@ static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
return _mm512_inserti64x4(a, b, imm);
}
#if !defined(AVX512F_STRICT)
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32
#else
SIMD_EMU_IWRAPPER_2(packs_epi16)
SIMD_EMU_IWRAPPER_2(packs_epi32)
SIMD_EMU_IWRAPPER_2(packus_epi16)
SIMD_EMU_IWRAPPER_2(packus_epi32)
#endif
// SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16
// SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32
// SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16
// SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
@ -704,4 +673,4 @@ static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I
#undef SIMD_EMU_IWRAPPER_2

View File

@ -0,0 +1,181 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD16 AVX512 (F) implementation for Core processors
//
//============================================================================
// Function-generating helper macros. Each expands to one static function
// named `op`. A trailing underscore marks the form that takes the intrinsic
// name separately from the function name. Only SIMD_WRAPPER_2 and
// SIMD_IWRAPPER_2 are instantiated further down in this file.

// Float op(Float a): forwards to `intrin`, which must be the complete
// intrinsic name (no _mm512_ prefix is pasted here).
#define SIMD_WRAPPER_1_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return intrin(a);\
}
// Float op(Float a): forwards to _mm512_<op>.
#define SIMD_WRAPPER_1(op) \
SIMD_WRAPPER_1_(op, _mm512_##op)
// Float op(Float a, Float b): forwards to _mm512_<intrin>.
#define SIMD_WRAPPER_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm512_##intrin(a, b);\
}
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
// Float op(Float a, Float b) implemented with an integer intrinsic: both
// operands are cast to __m512i, _mm512_<intrin> is applied, and the result is
// cast back to float.
#define SIMD_WRAPPERI_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm512_castsi512_ps(_mm512_##intrin(\
_mm512_castps_si512(a), _mm512_castps_si512(b)));\
}
// Double op(Double a, Double b): forwards to _mm512_<op>.
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm512_##op(a, b);\
}
// template<int ImmT> Float op(Float a, Float b): forwards to
// _mm512_<intrin>(a, b, ImmT).
#define SIMD_WRAPPER_2I_(op, intrin) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm512_##intrin(a, b, ImmT);\
}
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
// template<int ImmT> Double op(Double a, Double b): forwards to
// _mm512_<intrin>(a, b, ImmT).
#define SIMD_DWRAPPER_2I_(op, intrin) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm512_##intrin(a, b, ImmT);\
}
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
// Float op(Float a, Float b, Float c): forwards to _mm512_<op>.
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return _mm512_##op(a, b, c);\
}
// Integer op(Integer a): forwards to _mm512_<op>.
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return _mm512_##op(a);\
}
// Integer op(SIMD256Impl::Integer a): forwards to _mm512_<op>; the argument
// is the 256-bit integer type.
#define SIMD_IWRAPPER_1_8(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \
{\
return _mm512_##op(a);\
}
// Integer op(SIMD128Impl::Integer a): forwards to _mm512_<op>; the argument
// is the 128-bit integer type.
#define SIMD_IWRAPPER_1_4(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \
{\
return _mm512_##op(a);\
}
// template<int ImmT> Integer op(Integer a): forwards to intrin(a, ImmT);
// `intrin` must be the complete intrinsic name.
#define SIMD_IWRAPPER_1I_(op, intrin) \
template<int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return intrin(a, ImmT);\
}
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
// Integer op(Integer a, Integer b): forwards to _mm512_<intrin>.
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm512_##intrin(a, b);\
}
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
// Integer op(Integer a, Integer b): forwards to the callable `cmp` as given.
#define SIMD_IWRAPPER_2_CMP(op, cmp) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return cmp(a, b);\
}
// Integer op(Integer a, Integer b) implemented with a float intrinsic:
// operands go through castsi_ps, _mm512_<intrin> is applied, and the result
// goes through castps_si (both cast helpers are defined outside this file).
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
}
// template<int ImmT> Integer op(Integer a, Integer b): forwards to
// _mm512_<intrin>(a, b, ImmT).
#define SIMD_IWRAPPER_2I_(op, intrin) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm512_##intrin(a, b, ImmT);\
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
private:
// vmask: turn a k-mask into a vector in which every lane selected by the mask
// is all-ones (-1) and every other lane is zero. The overload is chosen by the
// mask type, which fixes the lane width (64-, 16- or 8-bit lanes).
// No __mmask16 (32-bit lane) overload is defined in this file; presumably it
// comes from the shared simdlib_512_avx512.inl - confirm.
static SIMDINLINE Integer vmask(__mmask8 m)
{
    const Integer lanes64 = _mm512_maskz_set1_epi64(m, -1LL);
    return lanes64;
}
static SIMDINLINE Integer vmask(__mmask32 m)
{
    const Integer lanes16 = _mm512_maskz_set1_epi16(m, -1);
    return lanes16;
}
static SIMDINLINE Integer vmask(__mmask64 m)
{
    const Integer lanes8 = _mm512_maskz_set1_epi8(m, -1);
    return lanes8;
}
public:
// Bitwise float operations; each forwards directly to the matching
// _mm512_*_ps intrinsic.
SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
// Pack operations; each forwards directly to the matching _mm512_pack* intrinsic.
SIMD_IWRAPPER_2(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
SIMD_IWRAPPER_2(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
// Undefine every helper macro defined in this file so that none leaks into
// files included after it. Each macro is listed exactly once.
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_2I
#undef SIMD_DWRAPPER_2I_
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_3_ // not defined in this file; kept from the original list
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_4
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_CMP
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2I

View File

@ -0,0 +1,183 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
//============================================================================
// SIMD16 AVX512 (F) implementation for Knights Family Processors
//
//============================================================================
// NOTE(review): simdlib_512_avx512.inl also declares TARGET_SIMD_WIDTH and
// SIMD256T. If that file and this one are included into the same struct, the
// two declarations below are redeclarations - confirm this compiles with
// SIMD_ARCH_KNIGHTS defined. Neither name is referenced elsewhere in this file.
static const int TARGET_SIMD_WIDTH = 16;
using SIMD256T = SIMD256Impl::AVX2Impl;
// Function-generating helper macros. Each expands to one static function
// named `op`. A trailing underscore marks the form that takes the intrinsic
// name separately from the function name. Only SIMD_WRAPPERI_2_ is
// instantiated further down in this file.

// Float op(Float a): forwards to `intrin`, which must be the complete
// intrinsic name (no _mm512_ prefix is pasted here).
#define SIMD_WRAPPER_1_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a) \
{\
return intrin(a);\
}
// Float op(Float a): forwards to _mm512_<op>.
#define SIMD_WRAPPER_1(op) \
SIMD_WRAPPER_1_(op, _mm512_##op)
// Float op(Float a, Float b): forwards to _mm512_<intrin>.
#define SIMD_WRAPPER_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm512_##intrin(a, b);\
}
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
// Float op(Float a, Float b) implemented with an integer intrinsic: both
// operands are cast to __m512i, _mm512_<intrin> is applied, and the result is
// cast back to float.
#define SIMD_WRAPPERI_2_(op, intrin) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm512_castsi512_ps(_mm512_##intrin(\
_mm512_castps_si512(a), _mm512_castps_si512(b)));\
}
// Double op(Double a, Double b): forwards to _mm512_<op>.
#define SIMD_DWRAPPER_2(op) \
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm512_##op(a, b);\
}
// template<int ImmT> Float op(Float a, Float b): forwards to
// _mm512_<intrin>(a, b, ImmT).
#define SIMD_WRAPPER_2I_(op, intrin) \
template<int ImmT>\
static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
{\
return _mm512_##intrin(a, b, ImmT);\
}
#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
// template<int ImmT> Double op(Double a, Double b): forwards to
// _mm512_<intrin>(a, b, ImmT).
#define SIMD_DWRAPPER_2I_(op, intrin) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
{\
return _mm512_##intrin(a, b, ImmT);\
}
#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
// Float op(Float a, Float b, Float c): forwards to _mm512_<op>.
#define SIMD_WRAPPER_3(op) \
static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
{\
return _mm512_##op(a, b, c);\
}
// Integer op(Integer a): forwards to _mm512_<op>.
#define SIMD_IWRAPPER_1(op) \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return _mm512_##op(a);\
}
// Integer op(SIMD256Impl::Integer a): forwards to _mm512_<op>; the argument
// is the 256-bit integer type.
#define SIMD_IWRAPPER_1_8(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \
{\
return _mm512_##op(a);\
}
// Integer op(SIMD128Impl::Integer a): forwards to _mm512_<op>; the argument
// is the 128-bit integer type.
#define SIMD_IWRAPPER_1_4(op) \
static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \
{\
return _mm512_##op(a);\
}
// template<int ImmT> Integer op(Integer a): forwards to intrin(a, ImmT);
// `intrin` must be the complete intrinsic name.
#define SIMD_IWRAPPER_1I_(op, intrin) \
template<int ImmT> \
static SIMDINLINE Integer SIMDCALL op(Integer a) \
{\
return intrin(a, ImmT);\
}
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
// Integer op(Integer a, Integer b): forwards to _mm512_<intrin>.
#define SIMD_IWRAPPER_2_(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm512_##intrin(a, b);\
}
#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
// Integer op(Integer a, Integer b): forwards to the callable `cmp` as given.
#define SIMD_IWRAPPER_2_CMP(op, cmp) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return cmp(a, b);\
}
// Integer op(Integer a, Integer b) implemented with a float intrinsic:
// operands go through castsi_ps, _mm512_<intrin> is applied, and the result
// goes through castps_si (both cast helpers are defined outside this file).
#define SIMD_IFWRAPPER_2(op, intrin) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
}
// template<int ImmT> Integer op(Integer a, Integer b): forwards to
// _mm512_<intrin>(a, b, ImmT).
#define SIMD_IWRAPPER_2I_(op, intrin) \
template<int ImmT>\
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
{\
return _mm512_##intrin(a, b, ImmT);\
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
private:
// vmask: turn a k-mask into a vector in which every lane selected by the mask
// is all-ones (-1) and every other lane is zero. The overload is chosen by the
// mask type, which fixes the lane width (64-, 32-, 16- or 8-bit lanes).
// NOTE(review): the __mmask32 and __mmask64 overloads call
// _mm512_maskz_set1_epi16 / _mm512_maskz_set1_epi8, which the Intel Intrinsics
// Guide lists under AVX512BW - confirm they are usable (or unused) when
// building for Knights targets.
static SIMDINLINE Integer vmask(__mmask8 m)
{
    const Integer lanes64 = _mm512_maskz_set1_epi64(m, -1LL);
    return lanes64;
}
static SIMDINLINE Integer vmask(__mmask16 m)
{
    const Integer lanes32 = _mm512_maskz_set1_epi32(m, -1);
    return lanes32;
}
static SIMDINLINE Integer vmask(__mmask32 m)
{
    const Integer lanes16 = _mm512_maskz_set1_epi16(m, -1);
    return lanes16;
}
static SIMDINLINE Integer vmask(__mmask64 m)
{
    const Integer lanes8 = _mm512_maskz_set1_epi8(m, -1);
    return lanes8;
}
public:
// Bitwise float operations, implemented with the 32-bit integer intrinsics on
// cast operands rather than the _mm512_*_ps forms.
// NOTE(review): presumably because the float bitwise intrinsics need an
// AVX-512 extension that Knights processors lack - confirm.
SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int)
SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int)
SIMD_WRAPPERI_2_(or_ps, or_epi32); // return a | b (float treated as int)
SIMD_WRAPPERI_2_(xor_ps, xor_epi32); // return a ^ b (float treated as int)
// Undefine every helper macro defined in this file so that none leaks into
// files included after it. Each macro is listed exactly once.
#undef SIMD_WRAPPER_1_
#undef SIMD_WRAPPER_1
#undef SIMD_WRAPPER_2_
#undef SIMD_WRAPPER_2
#undef SIMD_WRAPPERI_2_
#undef SIMD_DWRAPPER_2
#undef SIMD_WRAPPER_2I_
#undef SIMD_WRAPPER_2I
#undef SIMD_DWRAPPER_2I_
#undef SIMD_DWRAPPER_2I
#undef SIMD_WRAPPER_3_ // not defined in this file; kept from the original list
#undef SIMD_WRAPPER_3
#undef SIMD_IWRAPPER_1
#undef SIMD_IWRAPPER_1_8
#undef SIMD_IWRAPPER_1_4
#undef SIMD_IWRAPPER_1I_
#undef SIMD_IWRAPPER_1I
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_CMP
#undef SIMD_IFWRAPPER_2
#undef SIMD_IWRAPPER_2I_
#undef SIMD_IWRAPPER_2I

View File

@ -0,0 +1,27 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// Implement mask-enabled SIMD functions

View File

@ -0,0 +1,27 @@
/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif
// Implement mask-enabled SIMD functions

View File

@ -262,7 +262,7 @@ namespace SIMDImpl
namespace SIMD512Impl
{
#if !defined(__AVX512F__)
#if !(defined(__AVX512F__) || defined(_MM_K0_REG))
// Define AVX512 types if not included via immintrin.h.
// All data members of these types are ONLY to viewed
// in a debugger. Do NOT access them via code!