Abstracted away from x86 intrinsics

This commit is contained in:
tevador 2019-05-14 09:13:38 +02:00
parent 3dd21ea93d
commit 1aa7865619
10 changed files with 267 additions and 249 deletions

View File

@ -36,21 +36,21 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
const uint8_t* inptr = (uint8_t*)input;
const uint8_t* inputEnd = inptr + inputSize;
__m128i state0, state1, state2, state3;
__m128i in0, in1, in2, in3;
rx_vec_i128 state0, state1, state2, state3;
rx_vec_i128 in0, in1, in2, in3;
//intial state
state0 = _mm_set_epi32(0x8d3126fd, 0x1146d167, 0x887af5ab, 0xc4778e00);
state1 = _mm_set_epi32(0x19fe9fa1, 0x58da632b, 0x1b95af89, 0xb834ef4b);
state2 = _mm_set_epi32(0x1bb2cd74, 0xc35ad744, 0xab283a00, 0x7742dd3a);
state3 = _mm_set_epi32(0xbb30a58a, 0x49593c57, 0xdc5d97cc, 0xe18b449a);
state0 = rx_set_int_vec_i128(0x8d3126fd, 0x1146d167, 0x887af5ab, 0xc4778e00);
state1 = rx_set_int_vec_i128(0x19fe9fa1, 0x58da632b, 0x1b95af89, 0xb834ef4b);
state2 = rx_set_int_vec_i128(0x1bb2cd74, 0xc35ad744, 0xab283a00, 0x7742dd3a);
state3 = rx_set_int_vec_i128(0xbb30a58a, 0x49593c57, 0xdc5d97cc, 0xe18b449a);
//process 64 bytes at a time in 4 lanes
while (inptr < inputEnd) {
in0 = _mm_load_si128((__m128i*)inptr + 0);
in1 = _mm_load_si128((__m128i*)inptr + 1);
in2 = _mm_load_si128((__m128i*)inptr + 2);
in3 = _mm_load_si128((__m128i*)inptr + 3);
in0 = rx_load_vec_i128((rx_vec_i128*)inptr + 0);
in1 = rx_load_vec_i128((rx_vec_i128*)inptr + 1);
in2 = rx_load_vec_i128((rx_vec_i128*)inptr + 2);
in3 = rx_load_vec_i128((rx_vec_i128*)inptr + 3);
state0 = aesenc<softAes>(state0, in0);
state1 = aesdec<softAes>(state1, in1);
@ -61,8 +61,8 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
}
//two extra rounds to achieve full diffusion
__m128i xkey0 = _mm_set_epi32(0x83951283, 0xe4c5593d, 0x2a5a929c, 0x11cbf247);
__m128i xkey1 = _mm_set_epi32(0xff215bb2, 0xabbc2523, 0x477bef0b, 0xce816c95);
rx_vec_i128 xkey0 = rx_set_int_vec_i128(0x83951283, 0xe4c5593d, 0x2a5a929c, 0x11cbf247);
rx_vec_i128 xkey1 = rx_set_int_vec_i128(0xff215bb2, 0xabbc2523, 0x477bef0b, 0xce816c95);
state0 = aesenc<softAes>(state0, xkey0);
state1 = aesdec<softAes>(state1, xkey0);
@ -75,10 +75,10 @@ void hashAes1Rx4(const void *input, size_t inputSize, void *hash) {
state3 = aesdec<softAes>(state3, xkey1);
//output hash
_mm_store_si128((__m128i*)hash + 0, state0);
_mm_store_si128((__m128i*)hash + 1, state1);
_mm_store_si128((__m128i*)hash + 2, state2);
_mm_store_si128((__m128i*)hash + 3, state3);
rx_store_vec_i128((rx_vec_i128*)hash + 0, state0);
rx_store_vec_i128((rx_vec_i128*)hash + 1, state1);
rx_store_vec_i128((rx_vec_i128*)hash + 2, state2);
rx_store_vec_i128((rx_vec_i128*)hash + 3, state3);
}
template void hashAes1Rx4<false>(const void *input, size_t inputSize, void *hash);
@ -99,18 +99,18 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer) {
const uint8_t* outptr = (uint8_t*)buffer;
const uint8_t* outputEnd = outptr + outputSize;
__m128i state0, state1, state2, state3;
__m128i key0, key1, key2, key3;
rx_vec_i128 state0, state1, state2, state3;
rx_vec_i128 key0, key1, key2, key3;
key0 = _mm_set_epi32(0xdf20a2e3, 0xca329132, 0x454ff6d5, 0x84eeec2d);
key1 = _mm_set_epi32(0x1deb5971, 0xfed0387f, 0xf10fc578, 0x017b63d0);
key2 = _mm_set_epi32(0xdfc926b3, 0xa517ceb4, 0x2f2c70a1, 0x327d7a52);
key3 = _mm_set_epi32(0x341cf31c, 0xa0ece0a9, 0x3d17da5e, 0x5c8d77d3);
key0 = rx_set_int_vec_i128(0xdf20a2e3, 0xca329132, 0x454ff6d5, 0x84eeec2d);
key1 = rx_set_int_vec_i128(0x1deb5971, 0xfed0387f, 0xf10fc578, 0x017b63d0);
key2 = rx_set_int_vec_i128(0xdfc926b3, 0xa517ceb4, 0x2f2c70a1, 0x327d7a52);
key3 = rx_set_int_vec_i128(0x341cf31c, 0xa0ece0a9, 0x3d17da5e, 0x5c8d77d3);
state0 = _mm_load_si128((__m128i*)state + 0);
state1 = _mm_load_si128((__m128i*)state + 1);
state2 = _mm_load_si128((__m128i*)state + 2);
state3 = _mm_load_si128((__m128i*)state + 3);
state0 = rx_load_vec_i128((rx_vec_i128*)state + 0);
state1 = rx_load_vec_i128((rx_vec_i128*)state + 1);
state2 = rx_load_vec_i128((rx_vec_i128*)state + 2);
state3 = rx_load_vec_i128((rx_vec_i128*)state + 3);
while (outptr < outputEnd) {
state0 = aesdec<softAes>(state0, key0);
@ -118,18 +118,18 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer) {
state2 = aesdec<softAes>(state2, key2);
state3 = aesenc<softAes>(state3, key3);
_mm_store_si128((__m128i*)outptr + 0, state0);
_mm_store_si128((__m128i*)outptr + 1, state1);
_mm_store_si128((__m128i*)outptr + 2, state2);
_mm_store_si128((__m128i*)outptr + 3, state3);
rx_store_vec_i128((rx_vec_i128*)outptr + 0, state0);
rx_store_vec_i128((rx_vec_i128*)outptr + 1, state1);
rx_store_vec_i128((rx_vec_i128*)outptr + 2, state2);
rx_store_vec_i128((rx_vec_i128*)outptr + 3, state3);
outptr += 64;
}
_mm_store_si128((__m128i*)state + 0, state0);
_mm_store_si128((__m128i*)state + 1, state1);
_mm_store_si128((__m128i*)state + 2, state2);
_mm_store_si128((__m128i*)state + 3, state3);
rx_store_vec_i128((rx_vec_i128*)state + 0, state0);
rx_store_vec_i128((rx_vec_i128*)state + 1, state1);
rx_store_vec_i128((rx_vec_i128*)state + 2, state2);
rx_store_vec_i128((rx_vec_i128*)state + 3, state3);
}
template void fillAes1Rx4<true>(void *state, size_t outputSize, void *buffer);

View File

@ -27,7 +27,7 @@ namespace randomx {
template<size_t alignment>
void* AlignedAllocator<alignment>::allocMemory(size_t count) {
void *mem = _mm_malloc(count, alignment);
void *mem = rx_aligned_alloc(count, alignment);
if (mem == nullptr)
throw std::bad_alloc();
return mem;
@ -35,11 +35,10 @@ namespace randomx {
template<size_t alignment>
void AlignedAllocator<alignment>::freeMemory(void* ptr, size_t count) {
_mm_free(ptr);
rx_aligned_free(ptr);
}
template class AlignedAllocator<CacheLineSize>;
template class AlignedAllocator<sizeof(__m128i)>;;
void* LargePageAllocator::allocMemory(size_t count) {
return allocLargePagesMemory(count);

View File

@ -148,7 +148,7 @@ namespace randomx {
rl[7] = rl[0] ^ superscalarAdd7;
for (unsigned i = 0; i < RANDOMX_CACHE_ACCESSES; ++i) {
mixBlock = getMixBlock(registerValue, cache->memory);
PREFETCHNTA(mixBlock);
rx_prefetch_nta(mixBlock);
SuperscalarProgram& prog = cache->programs[i];
executeSuperscalar(rl, prog, &cache->reciprocalCache);

View File

@ -123,32 +123,32 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#define HAVE_SMULH
#endif
void setRoundMode(uint32_t rcflag) {
switch (rcflag & 3) {
case RoundDown:
setRoundMode_(FE_DOWNWARD);
break;
case RoundUp:
setRoundMode_(FE_UPWARD);
break;
case RoundToZero:
setRoundMode_(FE_TOWARDZERO);
break;
case RoundToNearest:
setRoundMode_(FE_TONEAREST);
break;
default:
UNREACHABLE;
#ifdef RANDOMX_DEFAULT_FENV
void rx_reset_float_state() {
setRoundMode_(FE_TONEAREST);
}
void rx_set_rounding_mode(uint32_t mode) {
switch (mode & 3) {
case RoundDown:
setRoundMode_(FE_DOWNWARD);
break;
case RoundUp:
setRoundMode_(FE_UPWARD);
break;
case RoundToZero:
setRoundMode_(FE_TOWARDZERO);
break;
case RoundToNearest:
setRoundMode_(FE_TONEAREST);
break;
default:
UNREACHABLE;
}
}
void initFpu() {
#ifdef __SSE2__
_mm_setcsr(0x9FC0); //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
#else
setRoundMode(FE_TONEAREST);
#endif
}
union double_ser_t {
double f;

View File

@ -20,6 +20,7 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#pragma once
#include <cstdint>
#include "blake2/endian.h"
constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) {
return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x);
@ -33,6 +34,11 @@ constexpr uint64_t signExtend2sCompl(uint32_t x) {
return (-1 == ~0) ? (int64_t)(int32_t)(x) : (x > INT32_MAX ? (x | 0xffffffff00000000ULL) : (uint64_t)x);
}
constexpr int RoundToNearest = 0;
constexpr int RoundDown = 1;
constexpr int RoundUp = 2;
constexpr int RoundToZero = 3;
#if defined(_MSC_VER)
#if defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)
#define __SSE2__ 1
@ -46,185 +52,230 @@ constexpr uint64_t signExtend2sCompl(uint32_t x) {
#include <intrin.h>
#endif
#define PREFETCHNTA(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
typedef __m128i rx_vec_i128;
typedef __m128d rx_vec_f128;
#define rx_aligned_alloc(a, b) _mm_malloc(a,b)
#define rx_aligned_free(a) _mm_free(a)
#define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA)
#define rx_load_vec_f128 _mm_load_pd
#define rx_store_vec_f128 _mm_store_pd
#define rx_shuffle_vec_f128 _mm_shuffle_pd
#define rx_add_vec_f128 _mm_add_pd
#define rx_sub_vec_f128 _mm_sub_pd
#define rx_mul_vec_f128 _mm_mul_pd
#define rx_div_vec_f128 _mm_div_pd
#define rx_sqrt_vec_f128 _mm_sqrt_pd
#define rx_set1_long_vec_i128 _mm_set1_epi64x
#define rx_vec_i128_vec_f128 _mm_castsi128_pd
FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
return _mm_castsi128_pd(_mm_set_epi64x(x1, x0));
}
FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
return _mm_castsi128_pd(_mm_set1_epi64x(x));
}
#define rx_xor_vec_f128 _mm_xor_pd
#define rx_and_vec_f128 _mm_and_pd
#define rx_or_vec_f128 _mm_or_pd
#define rx_aesenc_vec_i128 _mm_aesenc_si128
#define rx_aesdec_vec_i128 _mm_aesdec_si128
FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
return _mm_cvtsi128_si32(a);
}
FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
}
FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xaa));
}
FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xff));
}
#define rx_set_int_vec_i128 _mm_set_epi32
#define rx_xor_vec_i128 _mm_xor_si128
#define rx_load_vec_i128 _mm_load_si128
#define rx_store_vec_i128 _mm_store_si128
FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
__m128i ix = _mm_loadl_epi64((const __m128i*)addr);
return _mm_cvtepi32_pd(ix);
}
constexpr uint32_t rx_mxcsr_default = 0x9FC0; //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled
FORCE_INLINE void rx_reset_float_state() {
_mm_setcsr(rx_mxcsr_default);
}
FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) {
_mm_setcsr(rx_mxcsr_default | (mode << 13));
}
#else
#include <cstdint>
#include <stdexcept>
#include <cstdlib>
#include <cmath>
#include "blake2/endian.h"
#define _mm_malloc(a,b) malloc(a)
#define _mm_free(a) free(a)
#define PREFETCHNTA(x)
typedef union {
uint64_t u64[2];
uint32_t u32[4];
uint16_t u16[8];
uint8_t u8[16];
} __m128i;
} rx_vec_i128;
typedef union {
struct {
double lo;
double hi;
};
__m128i i;
} __m128d;
rx_vec_i128 i;
} rx_vec_f128;
inline __m128d _mm_load_pd(const double* pd) {
__m128d x;
#define rx_aligned_alloc(a, b) malloc(a)
#define rx_aligned_free(a) free(a)
#define rx_prefetch_nta(x)
FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
rx_vec_f128 x;
x.i.u64[0] = load64(pd + 0);
x.i.u64[1] = load64(pd + 1);
return x;
}
inline void _mm_store_pd(double* mem_addr, __m128d a) {
FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) {
store64(mem_addr + 0, a.i.u64[0]);
store64(mem_addr + 1, a.i.u64[1]);
}
inline __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm8) {
__m128d x;
FORCE_INLINE rx_vec_f128 rx_shuffle_vec_f128(rx_vec_f128 a, rx_vec_f128 b, int imm8) {
rx_vec_f128 x;
x.lo = (imm8 & 1) ? a.hi : a.lo;
x.hi = (imm8 & 2) ? b.hi : b.lo;
return x;
}
inline __m128d _mm_add_pd(__m128d a, __m128d b) {
__m128d x;
FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
rx_vec_f128 x;
x.lo = a.lo + b.lo;
x.hi = a.hi + b.hi;
return x;
}
inline __m128d _mm_sub_pd(__m128d a, __m128d b) {
__m128d x;
FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
rx_vec_f128 x;
x.lo = a.lo - b.lo;
x.hi = a.hi - b.hi;
return x;
}
inline __m128d _mm_mul_pd(__m128d a, __m128d b) {
__m128d x;
FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
rx_vec_f128 x;
x.lo = a.lo * b.lo;
x.hi = a.hi * b.hi;
return x;
}
inline __m128d _mm_div_pd(__m128d a, __m128d b) {
__m128d x;
FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
rx_vec_f128 x;
x.lo = a.lo / b.lo;
x.hi = a.hi / b.hi;
return x;
}
inline __m128d _mm_sqrt_pd(__m128d a) {
__m128d x;
FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) {
rx_vec_f128 x;
x.lo = sqrt(a.lo);
x.hi = sqrt(a.hi);
return x;
}
inline __m128i _mm_set1_epi64x(uint64_t a) {
__m128i x;
FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) {
rx_vec_i128 x;
x.u64[0] = a;
x.u64[1] = a;
return x;
}
inline __m128d _mm_castsi128_pd(__m128i a) {
__m128d x;
FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) {
rx_vec_f128 x;
x.i = a;
return x;
}
inline __m128d _mm_abs(__m128d xd) {
xd.lo = std::fabs(xd.lo);
xd.hi = std::fabs(xd.hi);
return xd;
FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
rx_vec_f128 v;
v.i.u64[0] = x0;
v.i.u64[1] = x1;
return v;
}
inline __m128d _mm_xor_pd(__m128d a, __m128d b) {
__m128d x;
FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
rx_vec_f128 v;
v.i.u64[0] = x;
v.i.u64[1] = x;
return v;
}
FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
rx_vec_f128 x;
x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0];
x.i.u64[1] = a.i.u64[1] ^ b.i.u64[1];
return x;
}
inline __m128d _mm_and_pd(__m128d a, __m128d b) {
__m128d x;
FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
rx_vec_f128 x;
x.i.u64[0] = a.i.u64[0] & b.i.u64[0];
x.i.u64[1] = a.i.u64[1] & b.i.u64[1];
return x;
}
inline __m128d _mm_or_pd(__m128d a, __m128d b) {
__m128d x;
FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
rx_vec_f128 x;
x.i.u64[0] = a.i.u64[0] | b.i.u64[0];
x.i.u64[1] = a.i.u64[1] | b.i.u64[1];
return x;
}
inline __m128d _mm_set_pd(double e1, double e0) {
__m128d x;
x.lo = e0;
x.hi = e1;
return x;
}
inline __m128d _mm_max_pd(__m128d a, __m128d b) {
__m128d x;
x.lo = a.lo > b.lo ? a.lo : b.lo;
x.hi = a.hi > b.hi ? a.hi : b.hi;
return x;
}
inline __m128d _mm_cvtepi32_pd(__m128i a) {
__m128d x;
x.lo = (double)unsigned32ToSigned2sCompl(a.u32[0]);
x.hi = (double)unsigned32ToSigned2sCompl(a.u32[1]);
return x;
}
static const char* platformError = "Platform doesn't support hardware AES";
inline __m128i _mm_aeskeygenassist_si128(__m128i key, uint8_t rcon) {
FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
throw std::runtime_error(platformError);
}
inline __m128i _mm_aesenc_si128(__m128i v, __m128i rkey) {
FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
throw std::runtime_error(platformError);
}
inline __m128i _mm_aesdec_si128(__m128i v, __m128i rkey) {
throw std::runtime_error(platformError);
FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
return a.u32[0];
}
inline int _mm_cvtsi128_si32(__m128i v) {
return v.u32[0];
FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
return a.u32[1];
}
inline __m128i _mm_cvtsi32_si128(int si32) {
__m128i v;
v.u32[0] = si32;
v.u32[1] = 0;
v.u32[2] = 0;
v.u32[3] = 0;
return v;
FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
return a.u32[2];
}
inline __m128i _mm_set_epi64x(int64_t _I1, int64_t _I0) {
__m128i v;
v.u64[0] = _I0;
v.u64[1] = _I1;
return v;
FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
return a.u32[3];
}
inline __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0) {
__m128i v;
FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) {
rx_vec_i128 v;
v.u32[0] = _I0;
v.u32[1] = _I1;
v.u32[2] = _I2;
@ -232,8 +283,8 @@ inline __m128i _mm_set_epi32(int _I3, int _I2, int _I1, int _I0) {
return v;
};
inline __m128i _mm_xor_si128(__m128i _A, __m128i _B) {
__m128i c;
FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 _A, rx_vec_i128 _B) {
rx_vec_i128 c;
c.u32[0] = _A.u32[0] ^ _B.u32[0];
c.u32[1] = _A.u32[1] ^ _B.u32[1];
c.u32[2] = _A.u32[2] ^ _B.u32[2];
@ -241,21 +292,12 @@ inline __m128i _mm_xor_si128(__m128i _A, __m128i _B) {
return c;
}
inline __m128i _mm_shuffle_epi32(__m128i _A, int _Imm) {
__m128i c;
c.u32[0] = _A.u32[_Imm & 3];
c.u32[1] = _A.u32[(_Imm >> 2) & 3];
c.u32[2] = _A.u32[(_Imm >> 4) & 3];
c.u32[3] = _A.u32[(_Imm >> 6) & 3];
return c;
}
inline __m128i _mm_load_si128(__m128i const*_P) {
FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const*_P) {
#if defined(NATIVE_LITTLE_ENDIAN)
return *_P;
#else
uint32_t* ptr = (uint32_t*)_P;
__m128i c;
rx_vec_i128 c;
c.u32[0] = load32(ptr + 0);
c.u32[1] = load32(ptr + 1);
c.u32[2] = load32(ptr + 2);
@ -264,7 +306,7 @@ inline __m128i _mm_load_si128(__m128i const*_P) {
#endif
}
inline void _mm_store_si128(__m128i *_P, __m128i _B) {
FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *_P, rx_vec_i128 _B) {
#if defined(NATIVE_LITTLE_ENDIAN)
*_P = _B;
#else
@ -276,46 +318,23 @@ inline void _mm_store_si128(__m128i *_P, __m128i _B) {
#endif
}
inline __m128i _mm_slli_si128(__m128i _A, int _Imm) {
_Imm &= 255;
if (_Imm > 15) {
_A.u64[0] = 0;
_A.u64[1] = 0;
}
else {
for (int i = 15; i >= _Imm; --i) {
_A.u8[i] = _A.u8[i - _Imm];
}
for (int i = 0; i < _Imm; ++i) {
_A.u8[i] = 0;
}
}
return _A;
}
inline __m128i _mm_loadl_epi64(__m128i const* mem_addr) {
__m128i x;
x.u32[0] = load32((uint8_t*)mem_addr + 0);
x.u32[1] = load32((uint8_t*)mem_addr + 4);
FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
rx_vec_f128 x;
x.lo = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
x.hi = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
return x;
}
#define RANDOMX_DEFAULT_FENV
void rx_reset_float_state();
void rx_set_rounding_mode(uint32_t mode);
#endif
constexpr int RoundToNearest = 0;
constexpr int RoundDown = 1;
constexpr int RoundUp = 2;
constexpr int RoundToZero = 3;
inline __m128d load_cvt_i32x2(const void* addr) {
__m128i ix = _mm_loadl_epi64((const __m128i*)addr);
return _mm_cvtepi32_pd(ix);
}
double loadDoublePortable(const void* addr);
uint64_t mulh(uint64_t, uint64_t);
int64_t smulh(int64_t, int64_t);
uint64_t rotl(uint64_t, int);
uint64_t rotr(uint64_t, int);
void initFpu();
void setRoundMode(uint32_t);

View File

@ -318,38 +318,38 @@ alignas(16) const uint32_t lutDec3[256] = {
0x397101a8, 0x08deb30c, 0xd89ce4b4, 0x6490c156, 0x7b6184cb, 0xd570b632, 0x48745c6c, 0xd04257b8,
};
__m128i soft_aesenc(__m128i in, __m128i key) {
rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key) {
uint32_t s0, s1, s2, s3;
s0 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xff));
s1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xaa));
s2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
s3 = _mm_cvtsi128_si32(in);
s0 = rx_vec_i128_w(in);
s1 = rx_vec_i128_z(in);
s2 = rx_vec_i128_y(in);
s3 = rx_vec_i128_x(in);
__m128i out = _mm_set_epi32(
rx_vec_i128 out = rx_set_int_vec_i128(
(lutEnc0[s0 & 0xff] ^ lutEnc1[(s3 >> 8) & 0xff] ^ lutEnc2[(s2 >> 16) & 0xff] ^ lutEnc3[s1 >> 24]),
(lutEnc0[s1 & 0xff] ^ lutEnc1[(s0 >> 8) & 0xff] ^ lutEnc2[(s3 >> 16) & 0xff] ^ lutEnc3[s2 >> 24]),
(lutEnc0[s2 & 0xff] ^ lutEnc1[(s1 >> 8) & 0xff] ^ lutEnc2[(s0 >> 16) & 0xff] ^ lutEnc3[s3 >> 24]),
(lutEnc0[s3 & 0xff] ^ lutEnc1[(s2 >> 8) & 0xff] ^ lutEnc2[(s1 >> 16) & 0xff] ^ lutEnc3[s0 >> 24])
);
return _mm_xor_si128(out, key);
return rx_xor_vec_i128(out, key);
}
__m128i soft_aesdec(__m128i in, __m128i key) {
rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key) {
uint32_t s0, s1, s2, s3;
s0 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xff));
s1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xaa));
s2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55));
s3 = _mm_cvtsi128_si32(in);
s0 = rx_vec_i128_w(in);
s1 = rx_vec_i128_z(in);
s2 = rx_vec_i128_y(in);
s3 = rx_vec_i128_x(in);
__m128i out = _mm_set_epi32(
rx_vec_i128 out = rx_set_int_vec_i128(
(lutDec0[s0 & 0xff] ^ lutDec1[(s1 >> 8) & 0xff] ^ lutDec2[(s2 >> 16) & 0xff] ^ lutDec3[s3 >> 24]),
(lutDec0[s1 & 0xff] ^ lutDec1[(s2 >> 8) & 0xff] ^ lutDec2[(s3 >> 16) & 0xff] ^ lutDec3[s0 >> 24]),
(lutDec0[s2 & 0xff] ^ lutDec1[(s3 >> 8) & 0xff] ^ lutDec2[(s0 >> 16) & 0xff] ^ lutDec3[s1 >> 24]),
(lutDec0[s3 & 0xff] ^ lutDec1[(s0 >> 8) & 0xff] ^ lutDec2[(s1 >> 16) & 0xff] ^ lutDec3[s2 >> 24])
);
return _mm_xor_si128(out, key);
return rx_xor_vec_i128(out, key);
}

View File

@ -22,16 +22,16 @@ along with RandomX. If not, see<http://www.gnu.org/licenses/>.
#include <stdint.h>
#include "intrin_portable.h"
__m128i soft_aesenc(__m128i in, __m128i key);
rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key);
__m128i soft_aesdec(__m128i in, __m128i key);
rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key);
template<bool soft>
inline __m128i aesenc(__m128i in, __m128i key) {
return soft ? soft_aesenc(in, key) : _mm_aesenc_si128(in, key);
inline rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key) {
return soft ? soft_aesenc(in, key) : rx_aesenc_vec_i128(in, key);
}
template<bool soft>
inline __m128i aesdec(__m128i in, __m128i key) {
return soft ? soft_aesdec(in, key) : _mm_aesdec_si128(in, key);
inline rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key) {
return soft ? soft_aesdec(in, key) : rx_aesdec_vec_i128(in, key);
}

View File

@ -32,7 +32,7 @@ randomx_vm::~randomx_vm() {
}
void randomx_vm::resetRoundingMode() {
initFpu();
rx_reset_float_state();
}
namespace randomx {
@ -86,7 +86,7 @@ void randomx_vm::initialize() {
namespace randomx {
alignas(16) volatile static __m128i aesDummy;
alignas(16) volatile static rx_vec_i128 aesDummy;
template<class Allocator, bool softAes>
VmBase<Allocator, softAes>::~VmBase() {
@ -98,9 +98,9 @@ namespace randomx {
if (datasetPtr == nullptr)
throw std::invalid_argument("Cache/Dataset not set");
if (!softAes) { //if hardware AES is not supported, it's better to fail now than to return a ticking bomb
__m128i tmp = _mm_load_si128((const __m128i*)&aesDummy);
tmp = _mm_aesenc_si128(tmp, tmp);
_mm_store_si128((__m128i*)&aesDummy, tmp);
rx_vec_i128 tmp = rx_load_vec_i128((const rx_vec_i128*)&aesDummy);
tmp = rx_aesenc_vec_i128(tmp, tmp);
rx_store_vec_i128((rx_vec_i128*)&aesDummy, tmp);
}
scratchpad = (uint8_t*)Allocator::allocMemory(ScratchpadSize);
}

View File

@ -46,7 +46,7 @@ namespace randomx {
}
template<class Allocator, bool softAes>
void InterpretedVm<Allocator, softAes>::executeBytecode(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) {
void InterpretedVm<Allocator, softAes>::executeBytecode(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]) {
for (int pc = 0; pc < RANDOMX_PROGRAM_SIZE; ++pc) {
executeBytecode(pc, r, f, e, a);
}
@ -59,16 +59,16 @@ namespace randomx {
}
template<class Allocator, bool softAes>
FORCE_INLINE __m128d InterpretedVm<Allocator, softAes>::maskRegisterExponentMantissa(__m128d x) {
const __m128d xmantissaMask = _mm_castsi128_pd(_mm_set_epi64x(dynamicMantissaMask, dynamicMantissaMask));
const __m128d xexponentMask = _mm_load_pd((const double*)&config.eMask);
x = _mm_and_pd(x, xmantissaMask);
x = _mm_or_pd(x, xexponentMask);
FORCE_INLINE rx_vec_f128 InterpretedVm<Allocator, softAes>::maskRegisterExponentMantissa(rx_vec_f128 x) {
const rx_vec_f128 xmantissaMask = rx_set_vec_f128(dynamicMantissaMask, dynamicMantissaMask);
const rx_vec_f128 xexponentMask = rx_load_vec_f128((const double*)&config.eMask);
x = rx_and_vec_f128(x, xmantissaMask);
x = rx_or_vec_f128(x, xexponentMask);
return x;
}
template<class Allocator, bool softAes>
void InterpretedVm<Allocator, softAes>::executeBytecode(int& pc, int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) {
void InterpretedVm<Allocator, softAes>::executeBytecode(int& pc, int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]) {
auto& ibc = byteCode[pc];
switch (ibc.type)
{
@ -139,43 +139,43 @@ namespace randomx {
} break;
case InstructionType::FSWAP_R: {
*ibc.fdst = _mm_shuffle_pd(*ibc.fdst, *ibc.fdst, 1);
*ibc.fdst = rx_shuffle_vec_f128(*ibc.fdst, *ibc.fdst, 1);
} break;
case InstructionType::FADD_R: {
*ibc.fdst = _mm_add_pd(*ibc.fdst, *ibc.fsrc);
*ibc.fdst = rx_add_vec_f128(*ibc.fdst, *ibc.fsrc);
} break;
case InstructionType::FADD_M: {
__m128d fsrc = load_cvt_i32x2(getScratchpadAddress(ibc));
*ibc.fdst = _mm_add_pd(*ibc.fdst, fsrc);
rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc));
*ibc.fdst = rx_add_vec_f128(*ibc.fdst, fsrc);
} break;
case InstructionType::FSUB_R: {
*ibc.fdst = _mm_sub_pd(*ibc.fdst, *ibc.fsrc);
*ibc.fdst = rx_sub_vec_f128(*ibc.fdst, *ibc.fsrc);
} break;
case InstructionType::FSUB_M: {
__m128d fsrc = load_cvt_i32x2(getScratchpadAddress(ibc));
*ibc.fdst = _mm_sub_pd(*ibc.fdst, fsrc);
rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc));
*ibc.fdst = rx_sub_vec_f128(*ibc.fdst, fsrc);
} break;
case InstructionType::FSCAL_R: {
const __m128d mask = _mm_castsi128_pd(_mm_set1_epi64x(0x81F0000000000000));
*ibc.fdst = _mm_xor_pd(*ibc.fdst, mask);
const rx_vec_f128 mask = rx_set1_vec_f128(0x81F0000000000000);
*ibc.fdst = rx_xor_vec_f128(*ibc.fdst, mask);
} break;
case InstructionType::FMUL_R: {
*ibc.fdst = _mm_mul_pd(*ibc.fdst, *ibc.fsrc);
*ibc.fdst = rx_mul_vec_f128(*ibc.fdst, *ibc.fsrc);
} break;
case InstructionType::FDIV_M: {
__m128d fsrc = maskRegisterExponentMantissa(load_cvt_i32x2(getScratchpadAddress(ibc)));
*ibc.fdst = _mm_div_pd(*ibc.fdst, fsrc);
rx_vec_f128 fsrc = maskRegisterExponentMantissa(rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc)));
*ibc.fdst = rx_div_vec_f128(*ibc.fdst, fsrc);
} break;
case InstructionType::FSQRT_R: {
*ibc.fdst = _mm_sqrt_pd(*ibc.fdst);
*ibc.fdst = rx_sqrt_vec_f128(*ibc.fdst);
} break;
case InstructionType::CBRANCH: {
@ -186,7 +186,7 @@ namespace randomx {
} break;
case InstructionType::CFROUND: {
setRoundMode(rotr(*ibc.isrc, ibc.imm) % 4);
rx_set_rounding_mode(rotr(*ibc.isrc, ibc.imm) % 4);
} break;
case InstructionType::ISTORE: {
@ -205,12 +205,12 @@ namespace randomx {
template<class Allocator, bool softAes>
void InterpretedVm<Allocator, softAes>::execute() {
int_reg_t r[RegistersCount] = { 0 };
__m128d f[RegisterCountFlt];
__m128d e[RegisterCountFlt];
__m128d a[RegisterCountFlt];
rx_vec_f128 f[RegisterCountFlt];
rx_vec_f128 e[RegisterCountFlt];
rx_vec_f128 a[RegisterCountFlt];
for(unsigned i = 0; i < RegisterCountFlt; ++i)
a[i] = _mm_load_pd(&reg.a[i].lo);
a[i] = rx_load_vec_f128(&reg.a[i].lo);
precompileProgram(r, f, e, a);
@ -228,10 +228,10 @@ namespace randomx {
r[i] ^= load64(scratchpad + spAddr0 + 8 * i);
for (unsigned i = 0; i < RegisterCountFlt; ++i)
f[i] = load_cvt_i32x2(scratchpad + spAddr1 + 8 * i);
f[i] = rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * i);
for (unsigned i = 0; i < RegisterCountFlt; ++i)
e[i] = maskRegisterExponentMantissa(load_cvt_i32x2(scratchpad + spAddr1 + 8 * (RegisterCountFlt + i)));
e[i] = maskRegisterExponentMantissa(rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * (RegisterCountFlt + i)));
executeBytecode(r, f, e, a);
@ -244,10 +244,10 @@ namespace randomx {
store64(scratchpad + spAddr1 + 8 * i, r[i]);
for (unsigned i = 0; i < RegisterCountFlt; ++i)
f[i] = _mm_xor_pd(f[i], e[i]);
f[i] = rx_xor_vec_f128(f[i], e[i]);
for (unsigned i = 0; i < RegisterCountFlt; ++i)
_mm_store_pd((double*)(scratchpad + spAddr0 + 16 * i), f[i]);
rx_store_vec_f128((double*)(scratchpad + spAddr0 + 16 * i), f[i]);
spAddr0 = 0;
spAddr1 = 0;
@ -257,10 +257,10 @@ namespace randomx {
store64(&reg.r[i], r[i]);
for (unsigned i = 0; i < RegisterCountFlt; ++i)
_mm_store_pd(&reg.f[i].lo, f[i]);
rx_store_vec_f128(&reg.f[i].lo, f[i]);
for (unsigned i = 0; i < RegisterCountFlt; ++i)
_mm_store_pd(&reg.e[i].lo, e[i]);
rx_store_vec_f128(&reg.e[i].lo, e[i]);
}
template<class Allocator, bool softAes>
@ -273,7 +273,7 @@ namespace randomx {
#include "instruction_weights.hpp"
template<class Allocator, bool softAes>
void InterpretedVm<Allocator, softAes>::precompileProgram(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]) {
void InterpretedVm<Allocator, softAes>::precompileProgram(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]) {
RegisterUsage registerUsage[RegistersCount];
for (unsigned i = 0; i < RegistersCount; ++i) {
registerUsage[i].lastUsed = -1;

View File

@ -31,11 +31,11 @@ namespace randomx {
struct InstructionByteCode {
union {
int_reg_t* idst;
__m128d* fdst;
rx_vec_f128* fdst;
};
union {
int_reg_t* isrc;
__m128d* fsrc;
rx_vec_f128* fsrc;
};
union {
uint64_t imm;
@ -74,11 +74,11 @@ namespace randomx {
virtual void datasetRead(uint32_t blockNumber, int_reg_t(&r)[RegistersCount]);
private:
void execute();
void precompileProgram(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]);
void executeBytecode(int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]);
void executeBytecode(int& i, int_reg_t(&r)[RegistersCount], __m128d (&f)[RegisterCountFlt], __m128d (&e)[RegisterCountFlt], __m128d (&a)[RegisterCountFlt]);
void precompileProgram(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]);
void executeBytecode(int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]);
void executeBytecode(int& i, int_reg_t(&r)[RegistersCount], rx_vec_f128(&f)[RegisterCountFlt], rx_vec_f128(&e)[RegisterCountFlt], rx_vec_f128(&a)[RegisterCountFlt]);
void* getScratchpadAddress(InstructionByteCode& ibc);
__m128d maskRegisterExponentMantissa(__m128d);
rx_vec_f128 maskRegisterExponentMantissa(rx_vec_f128);
InstructionByteCode byteCode[RANDOMX_PROGRAM_SIZE];
};