Optimized keccak implementation

All tests were conducted on the same PC (a Ryzen 5 5600X running at a fixed 4.65 GHz).

Before:
test_cn_fast_hash<32> (100000 calls) - OK: 1 us/call
test_cn_fast_hash<16384> (1000 calls) - OK: 164 us/call

After:
test_cn_fast_hash<32> (100000 calls) - OK: 0 us/call
test_cn_fast_hash<16384> (1000 calls) - OK: 31 us/call

More than a 5x speedup for cn_fast_hash (test_cn_fast_hash<16384>: 164 us/call -> 31 us/call).

Also noticed a consistent 1-2% improvement in test_construct_tx results.
This commit is contained in:
SChernykh 2022-04-16 11:48:37 +02:00
parent f49fc9b487
commit 268a0393e9
1 changed files with 61 additions and 32 deletions

View File

@ -31,54 +31,83 @@ const uint64_t keccakf_rndc[24] =
0x8000000000008080, 0x0000000080000001, 0x8000000080008008 0x8000000000008080, 0x0000000080000001, 0x8000000080008008
}; };
/*
 * Left-rotation amounts for the table-driven Rho Pi loop in keccakf():
 * step i passes keccakf_rotc[i] to ROTL64.  The entries equal
 * ((i + 1) * (i + 2) / 2) % 64 for i = 0..23.
 */
const int keccakf_rotc[24] = {
     1,  3,  6, 10, 15, 21,
    28, 36, 45, 55,  2, 14,
    27, 41, 56,  8, 25, 43,
    62, 18, 39, 61, 20, 44
};
/*
 * Lane visiting order for the table-driven Rho Pi loop in keccakf():
 * step i overwrites st[keccakf_piln[i]] with the rotated value carried over
 * from the previous step (the chain starts from st[1]).  Each index 1..24
 * appears exactly once, and the final entry, 1, closes the cycle.
 */
const int keccakf_piln[24] = {
    10,  7, 11, 17, 18,  3,
     5, 16,  8, 21, 24,  4,
    15, 23, 19, 13, 12,  2,
    20, 14, 22,  9,  6,  1
};
// update the state with given number of rounds // update the state with given number of rounds
void keccakf(uint64_t st[25], int rounds) void keccakf(uint64_t st[25], int rounds)
{ {
int i, j, round; int round;
uint64_t t, bc[5]; uint64_t t, bc[5];
for (round = 0; round < rounds; round++) { for (round = 0; round < rounds; ++round) {
// Theta // Theta
for (i = 0; i < 5; i++) bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20]; bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
for (i = 0; i < 5; i++) { #define THETA(i) { \
t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); \
for (j = 0; j < 25; j += 5) st[i ] ^= t; \
st[j + i] ^= t; st[i + 5] ^= t; \
st[i + 10] ^= t; \
st[i + 15] ^= t; \
st[i + 20] ^= t; \
} }
THETA(0);
THETA(1);
THETA(2);
THETA(3);
THETA(4);
// Rho Pi // Rho Pi
t = st[1]; t = st[1];
for (i = 0; i < 24; i++) { st[ 1] = ROTL64(st[ 6], 44);
j = keccakf_piln[i]; st[ 6] = ROTL64(st[ 9], 20);
bc[0] = st[j]; st[ 9] = ROTL64(st[22], 61);
st[j] = ROTL64(t, keccakf_rotc[i]); st[22] = ROTL64(st[14], 39);
t = bc[0]; st[14] = ROTL64(st[20], 18);
} st[20] = ROTL64(st[ 2], 62);
st[ 2] = ROTL64(st[12], 43);
st[12] = ROTL64(st[13], 25);
st[13] = ROTL64(st[19], 8);
st[19] = ROTL64(st[23], 56);
st[23] = ROTL64(st[15], 41);
st[15] = ROTL64(st[ 4], 27);
st[ 4] = ROTL64(st[24], 14);
st[24] = ROTL64(st[21], 2);
st[21] = ROTL64(st[ 8], 55);
st[ 8] = ROTL64(st[16], 45);
st[16] = ROTL64(st[ 5], 36);
st[ 5] = ROTL64(st[ 3], 28);
st[ 3] = ROTL64(st[18], 21);
st[18] = ROTL64(st[17], 15);
st[17] = ROTL64(st[11], 10);
st[11] = ROTL64(st[ 7], 6);
st[ 7] = ROTL64(st[10], 3);
st[10] = ROTL64(t, 1);
// Chi // Chi
for (j = 0; j < 25; j += 5) { #define CHI(j) { \
for (i = 0; i < 5; i++) const uint64_t st0 = st[j ]; \
bc[i] = st[j + i]; const uint64_t st1 = st[j + 1]; \
for (i = 0; i < 5; i++) const uint64_t st2 = st[j + 2]; \
st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5]; const uint64_t st3 = st[j + 3]; \
const uint64_t st4 = st[j + 4]; \
st[j ] ^= ~st1 & st2; \
st[j + 1] ^= ~st2 & st3; \
st[j + 2] ^= ~st3 & st4; \
st[j + 3] ^= ~st4 & st0; \
st[j + 4] ^= ~st0 & st1; \
} }
CHI( 0);
CHI( 5);
CHI(10);
CHI(15);
CHI(20);
// Iota // Iota
st[0] ^= keccakf_rndc[round]; st[0] ^= keccakf_rndc[round];
} }