From 33a2fd021d96291e8058ad691ee483bb417064ab Mon Sep 17 00:00:00 2001 From: tevador Date: Sun, 14 Apr 2019 17:21:26 +0200 Subject: [PATCH] Unique scratchpad addresses --- src/AssemblyGeneratorX86.cpp | 6 +- src/Instruction.cpp | 13 +- src/JitCompilerX86.cpp | 21 +- src/common.hpp | 3 +- src/program.inc | 526 +++++++++++++++++------------------ src/superscalarGenerator.cpp | 6 +- 6 files changed, 287 insertions(+), 288 deletions(-) diff --git a/src/AssemblyGeneratorX86.cpp b/src/AssemblyGeneratorX86.cpp index b3511c1..a916372 100644 --- a/src/AssemblyGeneratorX86.cpp +++ b/src/AssemblyGeneratorX86.cpp @@ -257,12 +257,12 @@ namespace RandomX { } void AssemblyGeneratorX86::genAddressReg(Instruction& instr, const char* reg = "eax") { - asmCode << "\tmov " << reg << ", " << regR32[instr.src] << std::endl; + asmCode << "\tlea " << reg << ", [" << regR32[instr.src] << std::showpos << (int32_t)instr.getImm32() << std::noshowpos << "]" << std::endl; asmCode << "\tand " << reg << ", " << ((instr.mod % 4) ? ScratchpadL1Mask : ScratchpadL2Mask) << std::endl; } void AssemblyGeneratorX86::genAddressRegDst(Instruction& instr, int maskAlign = 8) { - asmCode << "\tmov eax" << ", " << regR32[instr.dst] << std::endl; + asmCode << "\tlea eax, [" << regR32[instr.dst] << std::showpos << (int32_t)instr.getImm32() << std::noshowpos << "]" << std::endl; asmCode << "\tand eax" << ", " << ((instr.mod % 4) ? (ScratchpadL1Mask & (-maskAlign)) : (ScratchpadL2Mask & (-maskAlign))) << std::endl; } @@ -273,7 +273,7 @@ namespace RandomX { //1 uOP void AssemblyGeneratorX86::h_IADD_RS(Instruction& instr, int i) { registerUsage[instr.dst] = i; - if(instr.dst == 5) + if(instr.dst == RegisterNeedsDisplacement) asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << std::showpos << (int32_t)instr.getImm32() << std::noshowpos << "]" << std::endl; else asmCode << "\tlea " << regR[instr.dst] << ", [" << regR[instr.dst] << "+" << regR[instr.src] << "*" << (1 << (instr.mod % 4)) << "]" << std::endl; diff --git a/src/Instruction.cpp b/src/Instruction.cpp index e8ddc64..e4aa772 100644 --- a/src/Instruction.cpp +++ b/src/Instruction.cpp @@ -29,11 +29,11 @@ namespace RandomX { } void Instruction::genAddressReg(std::ostream& os) const { - os << ((mod % 4) ? "L1" : "L2") << "[r" << (int)src << "]"; + os << ((mod % 4) ? "L1" : "L2") << "[r" << (int)src << std::showpos << (int32_t)getImm32() << std::noshowpos << "]"; } void Instruction::genAddressRegDst(std::ostream& os) const { - os << ((mod % 4) ? "L1" : "L2") << "[r" << (int)dst << "]"; + os << ((mod % 4) ? "L1" : "L2") << "[r" << (int)dst << std::showpos << (int32_t)getImm32() << std::noshowpos << "]"; } void Instruction::genAddressImm(std::ostream& os) const { @@ -41,12 +41,11 @@ namespace RandomX { } void Instruction::h_IADD_RS(std::ostream& os) const { - if (src != dst) { - os << "r" << (int)dst << ", r" << (int)src << ", LSH " << (int)(mod % 4) << std::endl; - } - else { - os << "r" << (int)dst << ", " << (int32_t)getImm32() << std::endl; + os << "r" << (int)dst << ", r" << (int)src; + if(dst == RegisterNeedsDisplacement) { + os << ", " << (int32_t)getImm32(); } + os << ", LSH " << (int)(mod % 4) << std::endl; } void Instruction::h_IADD_M(std::ostream& os) const { diff --git a/src/JitCompilerX86.cpp b/src/JitCompilerX86.cpp index ad7c85a..7d17ef2 100644 --- a/src/JitCompilerX86.cpp +++ b/src/JitCompilerX86.cpp @@ -201,6 +201,7 @@ namespace RandomX { static const uint8_t REX_TEST[] = { 0x49, 0xF7 }; static const uint8_t JZ[] = { 0x0f, 0x84 }; static const uint8_t RET = 0xc3; + static const uint8_t LEA_32[] = { 0x67, 0x41, 0x8d }; static const uint8_t NOP1[] = { 0x90 }; static const uint8_t NOP2[] = { 0x66, 0x90 }; @@ -434,8 +435,12 @@ namespace RandomX { template void JitCompilerX86::generateCode(Instruction& instr, int i); void JitCompilerX86::genAddressReg(Instruction& instr, bool rax = true) { - emit(REX_MOV_RR); - emitByte((rax ? 0xc0 : 0xc8) + instr.src); + emit(LEA_32); + emitByte(0x80 + instr.src + (rax ? 0 : 8)); + if (instr.src == RegisterNeedsSib) { + emitByte(0x24); + } + emit32(instr.getImm32()); if (rax) emitByte(AND_EAX_I); else @@ -444,8 +449,12 @@ namespace RandomX { } void JitCompilerX86::genAddressRegDst(Instruction& instr, bool align16 = false) { - emit(REX_MOV_RR); - emitByte(0xc0 + instr.dst); + emit(LEA_32); + emitByte(0x80 + instr.dst); + if (instr.dst == RegisterNeedsSib) { + emitByte(0x24); + } + emit32(instr.getImm32()); emitByte(AND_EAX_I); int32_t maskL1 = align16 ? ScratchpadL1Mask16 : ScratchpadL1Mask; int32_t maskL2 = align16 ? ScratchpadL2Mask16 : ScratchpadL2Mask; @@ -472,12 +481,12 @@ namespace RandomX { return; } emit(REX_LEA); - if (instr.dst == 5) //rbp,r13 cannot be the base register without offset + if (instr.dst == RegisterNeedsDisplacement) emitByte(0xac); else emitByte(0x04 + 8 * instr.dst); genSIB(instr.mod % 4, instr.src, instr.dst); - if (instr.dst == 5) + if (instr.dst == RegisterNeedsDisplacement) emit32(instr.getImm32()); } diff --git a/src/common.hpp b/src/common.hpp index 034c10f..ade8abc 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -95,7 +95,8 @@ namespace RandomX { constexpr int ScratchpadL3Mask = (ScratchpadL3 - 1) * 8; constexpr int ScratchpadL3Mask64 = (ScratchpadL3 / 8 - 1) * 64; constexpr int RegistersCount = 8; - constexpr int LimitedAddressRegister = 5; //x86 r13 register + constexpr int RegisterNeedsDisplacement = 5; //x86 r13 register + constexpr int RegisterNeedsSib = 4; //x86 r12 register struct Cache { uint8_t* memory; diff --git a/src/program.inc b/src/program.inc index 97a8122..f3a36b8 100644 --- a/src/program.inc +++ b/src/program.inc @@ -1,11 +1,9 @@ - mov ebx, 111 ; Start marker bytes - db 064h, 067h, 090h ; Start marker bytes randomx_isn_0: ; IROR_R r3, 30 ror r11, 30 randomx_isn_1: - ; FSUB_M f1, L1[r7] - mov eax, r15d + ; FSUB_M f1, L1[r7+1640164717] + lea eax, [r15d+1640164717] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] subpd xmm1, xmm12 @@ -27,8 +25,8 @@ randomx_isn_6: ; FSQRT_R e3 sqrtpd xmm7, xmm7 randomx_isn_7: - ; ISTORE L1[r0], r3 - mov eax, r8d + ; ISTORE L1[r0-784322734], r3 + lea eax, [r8d-784322734] and eax, 16376 mov qword ptr [rsi+rax], r11 randomx_isn_8: @@ -38,13 +36,13 @@ randomx_isn_9: ; FMUL_R e0, a1 mulpd xmm4, xmm9 randomx_isn_10: - ; IMUL_M r2, L1[r1] - mov eax, r9d + ; IMUL_M r2, L1[r1+222715267] + lea eax, [r9d+222715267] and eax, 16376 imul r10, qword ptr [rsi+rax] randomx_isn_11: - ; ISTORE L1[r3], r1 - mov eax, r11d + ; ISTORE L1[r3-2088207007], r1 + lea eax, [r11d-2088207007] and eax, 16376 mov qword ptr [rsi+rax], r9 randomx_isn_12: @@ -57,11 +55,11 @@ randomx_isn_14: ; FSQRT_R e2 sqrtpd xmm6, xmm6 randomx_isn_15: - ; IADD_R r6, r2 - add r14, r10 + ; IADD_RS r6, r2, LSH 1 + lea r14, [r14+r10*2] randomx_isn_16: - ; FSUB_M f2, L1[r1] - mov eax, r9d + ; FSUB_M f2, L1[r1-1890725713] + lea eax, [r9d-1890725713] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] subpd xmm2, xmm12 @@ -70,41 +68,41 @@ randomx_isn_17: mov ecx, r11d ror r12, cl randomx_isn_18: - ; ISTORE L1[r4], r4 - mov eax, r12d + ; ISTORE L1[r4+1297827817], r4 + lea eax, [r12d+1297827817] and eax, 16376 mov qword ptr [rsi+rax], r12 randomx_isn_19: ; FMUL_R e1, a2 mulpd xmm5, xmm10 randomx_isn_20: - ; COND_R r6, of(r3, 1593588996), 1 + ; COND_R r6, of(r3, 1593588996), LSH 1 add r8, 2 test r8, 254 jz randomx_isn_0 - xor ecx, ecx + xor rcx, rcx cmp r11d, 1593588996 seto cl add r14, rcx randomx_isn_21: - ; IXOR_M r7, L1[r2] - mov eax, r10d + ; IXOR_M r7, L1[r2+1680388681] + lea eax, [r10d+1680388681] and eax, 16376 xor r15, qword ptr [rsi+rax] randomx_isn_22: - ; IXOR_M r1, L1[r0] - mov eax, r8d + ; IXOR_M r1, L1[r0+1714608757] + lea eax, [r8d+1714608757] and eax, 16376 xor r9, qword ptr [rsi+rax] randomx_isn_23: ; FMUL_R e2, a0 mulpd xmm6, xmm8 randomx_isn_24: - ; COND_R r6, no(r0, 149087159), 6 + ; COND_R r6, no(r0, 149087159), LSH 6 add r8, 64 test r8, 8128 jz randomx_isn_21 - xor ecx, ecx + xor rcx, rcx cmp r8d, 149087159 setno cl add r14, rcx @@ -112,22 +110,22 @@ randomx_isn_25: ; FADD_R f3, a0 addpd xmm3, xmm8 randomx_isn_26: - ; IADD_R r7, r0 - add r15, r8 + ; IADD_RS r7, r0, LSH 3 + lea r15, [r15+r8*8] randomx_isn_27: ; IMUL_R r2, r3 imul r10, r11 randomx_isn_28: - ; IADD_R r5, r7 - add r13, r15 + ; IADD_RS r5, r7, 1345488645, LSH 1 + lea r13, [r13+r15*2+1345488645] randomx_isn_29: - ; ISTORE L2[r6], r2 - mov eax, r14d + ; ISTORE L2[r6-950233266], r2 + lea eax, [r14d-950233266] and eax, 262136 mov qword ptr [rsi+rax], r10 randomx_isn_30: - ; ISTORE L1[r7], r5 - mov eax, r15d + ; ISTORE L1[r7-627845430], r5 + lea eax, [r15d-627845430] and eax, 16376 mov qword ptr [rsi+rax], r13 randomx_isn_31: @@ -146,21 +144,19 @@ randomx_isn_35: ; IMUL_R r6, 835132161 imul r14, 835132161 randomx_isn_36: - ; IADD_M r3, L1[r4] - mov eax, r12d - and eax, 16376 - add r11, qword ptr [rsi+rax] + ; IADD_RS r3, r4, LSH 2 + lea r11, [r11+r12*4] randomx_isn_37: - ; IMUL_9C r6, 1885029796 - lea r14, [r14+r14*8+1885029796] + ; ISUB_M r6, L2[r4+1885029796] + lea eax, [r12d+1885029796] + and eax, 262136 + sub r14, qword ptr [rsi+rax] randomx_isn_38: ; FSCAL_R f2 xorps xmm2, xmm15 randomx_isn_39: - ; ISUB_M r5, L1[r0] - mov eax, r8d - and eax, 16376 - sub r13, qword ptr [rsi+rax] + ; ISUB_R r5, r0 + sub r13, r8 randomx_isn_40: ; IMUL_R r7, r2 imul r15, r10 @@ -177,15 +173,13 @@ randomx_isn_44: ; FADD_R f1, a2 addpd xmm1, xmm10 randomx_isn_45: - ; ISTORE L1[r0], r5 - mov eax, r8d + ; ISTORE L1[r0+1805562386], r5 + lea eax, [r8d+1805562386] and eax, 16376 mov qword ptr [rsi+rax], r13 randomx_isn_46: - ; IADD_M r0, L2[r7] - mov eax, r15d - and eax, 262136 - add r8, qword ptr [rsi+rax] + ; IADD_RS r0, r7, LSH 0 + lea r8, [r8+r15*1] randomx_isn_47: ; IXOR_R r5, r2 xor r13, r10 @@ -199,26 +193,26 @@ randomx_isn_50: ; FSUB_R f3, a0 subpd xmm3, xmm8 randomx_isn_51: - ; COND_R r2, be(r3, -1975981803), 7 + ; COND_R r2, be(r3, -1975981803), LSH 7 add r12, 128 test r12, 16256 jz randomx_isn_25 - xor ecx, ecx + xor rcx, rcx cmp r11d, -1975981803 setbe cl add r10, rcx randomx_isn_52: - ; IADD_RC r1, r1, 878232328 - lea r9, [r9+r9+878232328] + ; IADD_RS r1, r1, LSH 2 + lea r9, [r9+r9*4] randomx_isn_53: ; FSUB_R f2, a0 subpd xmm2, xmm8 randomx_isn_54: - ; COND_R r5, ns(r1, 1917049931), 6 + ; COND_R r5, ns(r1, 1917049931), LSH 6 add r8, 64 test r8, 8128 jz randomx_isn_52 - xor ecx, ecx + xor rcx, rcx cmp r9d, 1917049931 setns cl add r13, rcx @@ -232,8 +226,8 @@ randomx_isn_57: ; IMUL_R r5, r1 imul r13, r9 randomx_isn_58: - ; IADD_R r5, r1 - add r13, r9 + ; IADD_RS r5, r1, -999103579, LSH 0 + lea r13, [r13+r9*1-999103579] randomx_isn_59: ; FMUL_R e2, a2 mulpd xmm6, xmm10 @@ -242,17 +236,21 @@ randomx_isn_60: mov ecx, r14d ror r10, cl randomx_isn_61: - ; IADD_RC r0, r3, 553576025 - lea r8, [r8+r11+553576025] + ; IADD_RS r0, r3, LSH 1 + lea r8, [r8+r11*2] randomx_isn_62: ; FSQRT_R e3 sqrtpd xmm7, xmm7 randomx_isn_63: - ; IMUL_9C r6, -1165860156 - lea r14, [r14+r14*8-1165860156] + ; ISUB_M r6, L1[r5-1165860156] + lea eax, [r13d-1165860156] + and eax, 16376 + sub r14, qword ptr [rsi+rax] randomx_isn_64: - ; IMUL_9C r5, -1323706896 - lea r13, [r13+r13*8-1323706896] + ; ISUB_M r5, L2[r7-1323706896] + lea eax, [r15d-1323706896] + and eax, 262136 + sub r13, qword ptr [rsi+rax] randomx_isn_65: ; IMUL_RCP r5, 2362240456 mov rax, 16769707400664451577 @@ -261,22 +259,20 @@ randomx_isn_66: ; ISUB_R r4, 841292629 sub r12, 841292629 randomx_isn_67: - ; IADD_M r4, L1[r6] - mov eax, r14d - and eax, 16376 - add r12, qword ptr [rsi+rax] + ; IADD_RS r4, r6, LSH 2 + lea r12, [r12+r14*4] randomx_isn_68: - ; FSUB_M f3, L1[r4] - mov eax, r12d + ; FSUB_M f3, L1[r4+613549729] + lea eax, [r12d+613549729] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] subpd xmm3, xmm12 randomx_isn_69: - ; IADD_RC r6, r4, -1863144764 - lea r14, [r14+r12-1863144764] + ; IADD_RS r6, r4, LSH 0 + lea r14, [r14+r12*1] randomx_isn_70: - ; FSUB_M f1, L1[r5] - mov eax, r13d + ; FSUB_M f1, L1[r5+629563256] + lea eax, [r13d+629563256] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] subpd xmm1, xmm12 @@ -290,11 +286,11 @@ randomx_isn_73: ; FMUL_R e0, a0 mulpd xmm4, xmm8 randomx_isn_74: - ; COND_R r6, ns(r3, -1200328848), 2 + ; COND_R r6, ns(r3, -1200328848), LSH 2 add r9, 4 test r9, 508 jz randomx_isn_55 - xor ecx, ecx + xor rcx, rcx cmp r11d, -1200328848 setns cl add r14, rcx @@ -302,8 +298,8 @@ randomx_isn_75: ; FMUL_R e0, a3 mulpd xmm4, xmm11 randomx_isn_76: - ; FDIV_M e3, L1[r4] - mov eax, r12d + ; FDIV_M e3, L1[r4+1170730568] + lea eax, [r12d+1170730568] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] andps xmm12, xmm13 @@ -316,11 +312,11 @@ randomx_isn_78: ; FMUL_R e2, a1 mulpd xmm6, xmm9 randomx_isn_79: - ; IADD_RC r3, r1, -919815727 - lea r11, [r11+r9-919815727] + ; IADD_RS r3, r1, LSH 1 + lea r11, [r11+r9*2] randomx_isn_80: - ; ISTORE L1[r2], r4 - mov eax, r10d + ; ISTORE L1[r2+1885666804], r4 + lea eax, [r10d+1885666804] and eax, 16376 mov qword ptr [rsi+rax], r12 randomx_isn_81: @@ -352,24 +348,24 @@ randomx_isn_88: ; IMUL_R r1, r3 imul r9, r11 randomx_isn_89: - ; COND_M r2, no(L1[r0], -122257389), 6 + ; COND_M r2, no(L1[r0-122257389], -122257389), LSH 6 add r8, 64 test r8, 8128 jz randomx_isn_75 - xor ecx, ecx - mov eax, r8d + xor rcx, rcx + lea eax, [r8d-122257389] and eax, 16376 cmp dword ptr [rsi+rax], -122257389 setno cl add r10, rcx randomx_isn_90: - ; ISTORE L1[r5], r7 - mov eax, r13d + ; ISTORE L1[r5+228116180], r7 + lea eax, [r13d+228116180] and eax, 16376 mov qword ptr [rsi+rax], r15 randomx_isn_91: - ; ISTORE L1[r6], r5 - mov eax, r14d + ; ISTORE L1[r6+650356254], r5 + lea eax, [r14d+650356254] and eax, 16376 mov qword ptr [rsi+rax], r13 randomx_isn_92: @@ -382,27 +378,29 @@ randomx_isn_94: ; IXOR_R r6, r1 xor r14, r9 randomx_isn_95: - ; ISUB_M r0, L3[910032] - sub r8, qword ptr [rsi+910032] + ; ISUB_R r0, 307094227 + sub r8, 307094227 randomx_isn_96: ; FSWAP_R e3 shufpd xmm7, xmm7, 1 randomx_isn_97: - ; IMUL_M r4, L1[r2] - mov eax, r10d + ; IMUL_M r4, L1[r2-30542523] + lea eax, [r10d-30542523] and eax, 16376 imul r12, qword ptr [rsi+rax] randomx_isn_98: - ; IMUL_9C r0, 2144355962 - lea r8, [r8+r8*8+2144355962] + ; ISUB_M r0, L1[r2+2144355962] + lea eax, [r10d+2144355962] + and eax, 16376 + sub r8, qword ptr [rsi+rax] randomx_isn_99: ; IMULH_R r1, r5 mov rax, r9 mul r13 mov r9, rdx randomx_isn_100: - ; ISTORE L1[r7], r3 - mov eax, r15d + ; ISTORE L1[r7+818959056], r3 + lea eax, [r15d+818959056] and eax, 16376 mov qword ptr [rsi+rax], r11 randomx_isn_101: @@ -414,8 +412,8 @@ randomx_isn_103: ; ISUB_R r2, -1777504751 sub r10, -1777504751 randomx_isn_104: - ; ISTORE L2[r6], r7 - mov eax, r14d + ; ISTORE L2[r6-2059767784], r7 + lea eax, [r14d-2059767784] and eax, 262136 mov qword ptr [rsi+rax], r15 randomx_isn_105: @@ -430,10 +428,8 @@ randomx_isn_107: imul r13 mov r14, rdx randomx_isn_108: - ; IADD_M r7, L1[r0] - mov eax, r8d - and eax, 16376 - add r15, qword ptr [rsi+rax] + ; IADD_RS r7, r0, LSH 1 + lea r15, [r15+r8*2] randomx_isn_109: ; IMUL_R r6, r5 imul r14, r13 @@ -441,25 +437,23 @@ randomx_isn_110: ; IMUL_R r5, r1 imul r13, r9 randomx_isn_111: - ; FADD_M f2, L1[r0] - mov eax, r8d + ; FADD_M f2, L1[r0-1280829689] + lea eax, [r8d-1280829689] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] addpd xmm2, xmm12 randomx_isn_112: - ; IADD_R r0, r3 - add r8, r11 + ; IADD_RS r0, r3, LSH 0 + lea r8, [r8+r11*1] randomx_isn_113: - ; IADD_RC r3, r4, -1138304368 - lea r11, [r11+r12-1138304368] + ; IADD_RS r3, r4, LSH 1 + lea r11, [r11+r12*2] randomx_isn_114: - ; IADD_M r2, L1[r4] - mov eax, r12d - and eax, 16376 - add r10, qword ptr [rsi+rax] + ; IADD_RS r2, r4, LSH 2 + lea r10, [r10+r12*4] randomx_isn_115: - ; IMUL_M r7, L1[r2] - mov eax, r10d + ; IMUL_M r7, L1[r2-106928748] + lea eax, [r10d-106928748] and eax, 16376 imul r15, qword ptr [rsi+rax] randomx_isn_116: @@ -469,17 +463,17 @@ randomx_isn_117: ; FSUB_R f2, a2 subpd xmm2, xmm10 randomx_isn_118: - ; IADD_R r2, 160326201 - add r10, 160326201 + ; IADD_RS r2, r2, LSH 0 + lea r10, [r10+r10*1] randomx_isn_119: - ; ISUB_M r7, L3[1780152] - sub r15, qword ptr [rsi+1780152] + ; ISUB_R r7, -342152774 + sub r15, -342152774 randomx_isn_120: - ; IADD_R r4, r1 - add r12, r9 + ; IADD_RS r4, r1, LSH 1 + lea r12, [r12+r9*2] randomx_isn_121: - ; IADD_R r4, r7 - add r12, r15 + ; IADD_RS r4, r7, LSH 2 + lea r12, [r12+r15*4] randomx_isn_122: ; FSUB_R f0, a1 subpd xmm0, xmm9 @@ -490,15 +484,15 @@ randomx_isn_124: ; FSUB_R f2, a2 subpd xmm2, xmm10 randomx_isn_125: - ; ISMULH_M r2, L2[r1] - mov ecx, r9d + ; ISMULH_M r2, L2[r1+1421890385] + lea ecx, [r9d+1421890385] and ecx, 262136 mov rax, r10 imul qword ptr [rsi+rcx] mov r10, rdx randomx_isn_126: - ; FSUB_M f2, L2[r2] - mov eax, r10d + ; FSUB_M f2, L2[r2+875507660] + lea eax, [r10d+875507660] and eax, 262136 cvtdq2pd xmm12, qword ptr [rsi+rax] subpd xmm2, xmm12 @@ -509,14 +503,14 @@ randomx_isn_128: ; FSUB_R f3, a1 subpd xmm3, xmm9 randomx_isn_129: - ; IADD_RC r1, r2, 697183462 - lea r9, [r9+r10+697183462] + ; IADD_RS r1, r2, LSH 2 + lea r9, [r9+r10*4] randomx_isn_130: ; FSUB_R f1, a1 subpd xmm1, xmm9 randomx_isn_131: - ; IMUL_M r2, L1[r3] - mov eax, r11d + ; IMUL_M r2, L1[r3+63855818] + lea eax, [r11d+63855818] and eax, 16376 imul r10, qword ptr [rsi+rax] randomx_isn_132: @@ -533,29 +527,25 @@ randomx_isn_135: ; FMUL_R e1, a2 mulpd xmm5, xmm10 randomx_isn_136: - ; ISUB_M r3, L2[r6] - mov eax, r14d - and eax, 262136 - sub r11, qword ptr [rsi+rax] + ; ISUB_R r3, r6 + sub r11, r14 randomx_isn_137: - ; IADD_RC r4, r1, -1660063210 - lea r12, [r12+r9-1660063210] + ; IADD_RS r4, r1, LSH 0 + lea r12, [r12+r9*1] randomx_isn_138: - ; ISTORE L1[r0], r0 - mov eax, r8d + ; ISTORE L1[r0+56684410], r0 + lea eax, [r8d+56684410] and eax, 16376 mov qword ptr [rsi+rax], r8 randomx_isn_139: - ; FADD_M f0, L1[r5] - mov eax, r13d + ; FADD_M f0, L1[r5+195344615] + lea eax, [r13d+195344615] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] addpd xmm0, xmm12 randomx_isn_140: - ; ISUB_M r7, L1[r3] - mov eax, r11d - and eax, 16376 - sub r15, qword ptr [rsi+rax] + ; ISUB_R r7, r3 + sub r15, r11 randomx_isn_141: ; IROR_R r3, r2 mov ecx, r10d @@ -564,17 +554,17 @@ randomx_isn_142: ; FADD_R f1, a0 addpd xmm1, xmm8 randomx_isn_143: - ; COND_R r5, ge(r1, 880467599), 2 + ; COND_R r5, ge(r1, 880467599), LSH 2 add r14, 4 test r14, 508 jz randomx_isn_110 - xor ecx, ecx + xor rcx, rcx cmp r9d, 880467599 setge cl add r13, rcx randomx_isn_144: - ; FSUB_M f1, L1[r5] - mov eax, r13d + ; FSUB_M f1, L1[r5+1283529302] + lea eax, [r13d+1283529302] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] subpd xmm1, xmm12 @@ -582,26 +572,26 @@ randomx_isn_145: ; ISUB_R r5, r3 sub r13, r11 randomx_isn_146: - ; IADD_RC r0, r3, 1228198394 - lea r8, [r8+r11+1228198394] + ; IADD_RS r0, r3, LSH 1 + lea r8, [r8+r11*2] randomx_isn_147: - ; IADD_RC r1, r3, 1747766580 - lea r9, [r9+r11+1747766580] + ; IADD_RS r1, r3, LSH 1 + lea r9, [r9+r11*2] randomx_isn_148: ; FSQRT_R e1 sqrtpd xmm5, xmm5 randomx_isn_149: - ; IADD_R r4, r3 - add r12, r11 + ; IADD_RS r4, r3, LSH 1 + lea r12, [r12+r11*2] randomx_isn_150: - ; FADD_M f1, L1[r0] - mov eax, r8d + ; FADD_M f1, L1[r0-1977073973] + lea eax, [r8d-1977073973] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] addpd xmm1, xmm12 randomx_isn_151: - ; IADD_RC r1, r0, 1063245428 - lea r9, [r9+r8+1063245428] + ; IADD_RS r1, r0, LSH 1 + lea r9, [r9+r8*2] randomx_isn_152: ; FSUB_R f1, a0 subpd xmm1, xmm8 @@ -623,20 +613,18 @@ randomx_isn_156: ; FSUB_R f3, a2 subpd xmm3, xmm10 randomx_isn_157: - ; ISTORE L1[r1], r1 - mov eax, r9d + ; ISTORE L1[r1+883418866], r1 + lea eax, [r9d+883418866] and eax, 16376 mov qword ptr [rsi+rax], r9 randomx_isn_158: - ; ISTORE L1[r6], r4 - mov eax, r14d + ; ISTORE L1[r6-312458782], r4 + lea eax, [r14d-312458782] and eax, 16376 mov qword ptr [rsi+rax], r12 randomx_isn_159: - ; IADD_M r7, L1[r2] - mov eax, r10d - and eax, 16376 - add r15, qword ptr [rsi+rax] + ; IADD_RS r7, r2, LSH 3 + lea r15, [r15+r10*8] randomx_isn_160: ; IMUL_RCP r7, 2040763167 mov rax, 9705702723791900149 @@ -645,8 +633,8 @@ randomx_isn_161: ; FADD_R f3, a3 addpd xmm3, xmm11 randomx_isn_162: - ; IADD_RC r6, r4, -783948693 - lea r14, [r14+r12-783948693] + ; IADD_RS r6, r4, LSH 1 + lea r14, [r14+r12*2] randomx_isn_163: ; ISWAP_R r3, r5 xchg r11, r13 @@ -661,19 +649,23 @@ randomx_isn_166: mov ecx, r11d ror r13, cl randomx_isn_167: - ; IMUL_9C r2, 805006473 - lea r10, [r10+r10*8+805006473] + ; ISUB_M r2, L2[r0+805006473] + lea eax, [r8d+805006473] + and eax, 262136 + sub r10, qword ptr [rsi+rax] randomx_isn_168: - ; FDIV_M e0, L1[r4] - mov eax, r12d + ; FDIV_M e0, L1[r4-2098372994] + lea eax, [r12d-2098372994] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] andps xmm12, xmm13 orps xmm12, xmm14 divpd xmm4, xmm12 randomx_isn_169: - ; IMUL_9C r3, 1773188989 - lea r11, [r11+r11*8+1773188989] + ; ISUB_M r3, L2[r4+1773188989] + lea eax, [r12d+1773188989] + and eax, 262136 + sub r11, qword ptr [rsi+rax] randomx_isn_170: ; FADD_R f0, a3 addpd xmm0, xmm11 @@ -681,13 +673,13 @@ randomx_isn_171: ; FADD_R f1, a0 addpd xmm1, xmm8 randomx_isn_172: - ; ISTORE L1[r7], r6 - mov eax, r15d + ; ISTORE L1[r7-933780249], r6 + lea eax, [r15d-933780249] and eax, 16376 mov qword ptr [rsi+rax], r14 randomx_isn_173: - ; FSUB_M f0, L1[r7] - mov eax, r15d + ; FSUB_M f0, L1[r7-1204687701] + lea eax, [r15d-1204687701] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] subpd xmm0, xmm12 @@ -706,11 +698,13 @@ randomx_isn_177: ; IMUL_M r3, L3[232968] imul r11, qword ptr [rsi+232968] randomx_isn_178: - ; IADD_RC r5, r3, -2108568616 - lea r13, [r13+r11-2108568616] + ; IADD_RS r5, r3, -2108568616, LSH 1 + lea r13, [r13+r11*2-2108568616] randomx_isn_179: - ; IADD_RC r3, r4, 1322108729 - lea r11, [r11+r12+1322108729] + ; IADD_M r3, L1[r4+1322108729] + lea eax, [r12d+1322108729] + and eax, 16376 + add r11, qword ptr [rsi+rax] randomx_isn_180: ; FADD_R f3, a1 addpd xmm3, xmm9 @@ -721,10 +715,8 @@ randomx_isn_182: ; FMUL_R e2, a2 mulpd xmm6, xmm10 randomx_isn_183: - ; IADD_M r6, L2[r2] - mov eax, r10d - and eax, 262136 - add r14, qword ptr [rsi+rax] + ; IADD_RS r6, r2, LSH 0 + lea r14, [r14+r10*1] randomx_isn_184: ; FADD_R f2, a3 addpd xmm2, xmm11 @@ -735,40 +727,38 @@ randomx_isn_186: ; FSCAL_R f3 xorps xmm3, xmm15 randomx_isn_187: - ; IADD_RC r6, r6, -914790425 - lea r14, [r14+r14-914790425] + ; IADD_RS r6, r6, LSH 3 + lea r14, [r14+r14*8] randomx_isn_188: ; FSCAL_R f2 xorps xmm2, xmm15 randomx_isn_189: - ; IMUL_M r4, L1[r5] - mov eax, r13d + ; IMUL_M r4, L1[r5+1014226422] + lea eax, [r13d+1014226422] and eax, 16376 imul r12, qword ptr [rsi+rax] randomx_isn_190: - ; FSUB_M f2, L1[r3] - mov eax, r11d + ; FSUB_M f2, L1[r3-87032284] + lea eax, [r11d-87032284] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] subpd xmm2, xmm12 randomx_isn_191: - ; IMUL_M r4, L2[r3] - mov eax, r11d + ; IMUL_M r4, L2[r3+1279913094] + lea eax, [r11d+1279913094] and eax, 262136 imul r12, qword ptr [rsi+rax] randomx_isn_192: - ; ISUB_M r7, L1[r3] - mov eax, r11d - and eax, 16376 - sub r15, qword ptr [rsi+rax] + ; ISUB_R r7, r3 + sub r15, r11 randomx_isn_193: - ; ISTORE L1[r1], r1 - mov eax, r9d + ; ISTORE L1[r1+901530824], r1 + lea eax, [r9d+901530824] and eax, 16376 mov qword ptr [rsi+rax], r9 randomx_isn_194: - ; ISTORE L1[r3], r4 - mov eax, r11d + ; ISTORE L1[r3+965438117], r4 + lea eax, [r11d+965438117] and eax, 16376 mov qword ptr [rsi+rax], r12 randomx_isn_195: @@ -778,8 +768,8 @@ randomx_isn_196: ; FMUL_R e2, a0 mulpd xmm6, xmm8 randomx_isn_197: - ; FADD_M f2, L2[r0] - mov eax, r8d + ; FADD_M f2, L2[r0-197871122] + lea eax, [r8d-197871122] and eax, 262136 cvtdq2pd xmm12, qword ptr [rsi+rax] addpd xmm2, xmm12 @@ -790,37 +780,41 @@ randomx_isn_199: ; FSUB_R f3, a3 subpd xmm3, xmm11 randomx_isn_200: - ; IADD_RC r2, r5, 248917123 - lea r10, [r10+r13+248917123] + ; IADD_RS r2, r5, LSH 2 + lea r10, [r10+r13*4] randomx_isn_201: - ; IMUL_9C r6, 376384700 - lea r14, [r14+r14*8+376384700] + ; ISUB_M r6, L2[r3+376384700] + lea eax, [r11d+376384700] + and eax, 262136 + sub r14, qword ptr [rsi+rax] randomx_isn_202: ; ISWAP_R r3, r6 xchg r11, r14 randomx_isn_203: - ; ISTORE L1[r1], r3 - mov eax, r9d + ; ISTORE L1[r1+330228321], r3 + lea eax, [r9d+330228321] and eax, 16376 mov qword ptr [rsi+rax], r11 randomx_isn_204: ; IMUL_R r6, r1 imul r14, r9 randomx_isn_205: - ; ISUB_R r7, r5 - sub r15, r13 + ; IADD_M r7, L2[r5-579800039] + lea eax, [r13d-579800039] + and eax, 262136 + add r15, qword ptr [rsi+rax] randomx_isn_206: - ; IADD_R r3, r5 - add r11, r13 + ; IADD_RS r3, r5, LSH 0 + lea r11, [r11+r13*1] randomx_isn_207: ; FSCAL_R f1 xorps xmm1, xmm15 randomx_isn_208: - ; IADD_R r6, r3 - add r14, r11 + ; IADD_RS r6, r3, LSH 1 + lea r14, [r14+r11*2] randomx_isn_209: - ; FSUB_M f0, L1[r4] - mov eax, r12d + ; FSUB_M f0, L1[r4-557177119] + lea eax, [r12d-557177119] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] subpd xmm0, xmm12 @@ -831,13 +825,13 @@ randomx_isn_211: ; FMUL_R e2, a3 mulpd xmm6, xmm11 randomx_isn_212: - ; IMUL_M r0, L1[r1] - mov eax, r9d + ; IMUL_M r0, L1[r1-300353849] + lea eax, [r9d-300353849] and eax, 16376 imul r8, qword ptr [rsi+rax] randomx_isn_213: - ; FSUB_M f2, L1[r5] - mov eax, r13d + ; FSUB_M f2, L1[r5-2016234225] + lea eax, [r13d-2016234225] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] subpd xmm2, xmm12 @@ -848,21 +842,21 @@ randomx_isn_215: ; FADD_R f3, a1 addpd xmm3, xmm9 randomx_isn_216: - ; IXOR_M r4, L2[r1] - mov eax, r9d + ; IXOR_M r4, L2[r1+926150064] + lea eax, [r9d+926150064] and eax, 262136 xor r12, qword ptr [rsi+rax] randomx_isn_217: - ; IMUL_M r6, L1[r5] - mov eax, r13d + ; IMUL_M r6, L1[r5-1692567271] + lea eax, [r13d-1692567271] and eax, 16376 imul r14, qword ptr [rsi+rax] randomx_isn_218: ; FSCAL_R f2 xorps xmm2, xmm15 randomx_isn_219: - ; FADD_M f3, L1[r7] - mov eax, r15d + ; FADD_M f3, L1[r7+162195095] + lea eax, [r15d+162195095] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] addpd xmm3, xmm12 @@ -873,27 +867,25 @@ randomx_isn_221: ; IMUL_R r1, r0 imul r9, r8 randomx_isn_222: - ; IADD_M r1, L1[r0] - mov eax, r8d - and eax, 16376 - add r9, qword ptr [rsi+rax] + ; IADD_RS r1, r0, LSH 2 + lea r9, [r9+r8*4] randomx_isn_223: ; FSCAL_R f2 xorps xmm2, xmm15 randomx_isn_224: - ; IADD_R r5, r4 - add r13, r12 + ; IADD_RS r5, r4, 312567979, LSH 1 + lea r13, [r13+r12*2+312567979] randomx_isn_225: - ; ISTORE L2[r2], r1 - mov eax, r10d + ; ISTORE L2[r2+260885699], r1 + lea eax, [r10d+260885699] and eax, 262136 mov qword ptr [rsi+rax], r9 randomx_isn_226: ; ISUB_R r6, -791575725 sub r14, -791575725 randomx_isn_227: - ; FDIV_M e3, L1[r0] - mov eax, r8d + ; FDIV_M e3, L1[r0-1140408845] + lea eax, [r8d-1140408845] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] andps xmm12, xmm13 @@ -906,10 +898,8 @@ randomx_isn_229: ; ISWAP_R r0, r6 xchg r8, r14 randomx_isn_230: - ; IADD_M r2, L1[r7] - mov eax, r15d - and eax, 16376 - add r10, qword ptr [rsi+rax] + ; IADD_RS r2, r7, LSH 2 + lea r10, [r10+r15*4] randomx_isn_231: ; FMUL_R e1, a0 mulpd xmm5, xmm8 @@ -920,8 +910,10 @@ randomx_isn_233: ; FMUL_R e0, a2 mulpd xmm4, xmm10 randomx_isn_234: - ; IADD_RC r2, r7, 1435646464 - lea r10, [r10+r15+1435646464] + ; IADD_M r2, L1[r7+1435646464] + lea eax, [r15d+1435646464] + and eax, 16376 + add r10, qword ptr [rsi+rax] randomx_isn_235: ; ISWAP_R r7, r6 xchg r15, r14 @@ -932,8 +924,8 @@ randomx_isn_237: ; FSUB_R f1, a3 subpd xmm1, xmm11 randomx_isn_238: - ; IADD_R r4, r2 - add r12, r10 + ; IADD_RS r4, r2, LSH 1 + lea r12, [r12+r10*2] randomx_isn_239: ; IMUL_RCP r7, 3065786637 mov rax, 12921343181238534701 @@ -952,8 +944,8 @@ randomx_isn_243: ; FSUB_R f0, a3 subpd xmm0, xmm11 randomx_isn_244: - ; FADD_M f1, L1[r0] - mov eax, r8d + ; FADD_M f1, L1[r0-389606015] + lea eax, [r8d-389606015] and eax, 16376 cvtdq2pd xmm12, qword ptr [rsi+rax] addpd xmm1, xmm12 @@ -961,47 +953,45 @@ randomx_isn_245: ; INEG_R r3 neg r11 randomx_isn_246: - ; IMUL_9C r7, 1938400676 - lea r15, [r15+r15*8+1938400676] + ; ISUB_M r7, L2[r2+1938400676] + lea eax, [r10d+1938400676] + and eax, 262136 + sub r15, qword ptr [rsi+rax] randomx_isn_247: - ; COND_M r2, be(L1[r5], -8545330), 2 + ; COND_M r2, be(L1[r5-8545330], -8545330), LSH 2 add r9, 4 test r9, 508 jz randomx_isn_223 - xor ecx, ecx - mov eax, r13d + xor rcx, rcx + lea eax, [r13d-8545330] and eax, 16376 cmp dword ptr [rsi+rax], -8545330 setbe cl add r10, rcx randomx_isn_248: - ; ISTORE L1[r0], r5 - mov eax, r8d + ; ISTORE L1[r0+1951752498], r5 + lea eax, [r8d+1951752498] and eax, 16376 mov qword ptr [rsi+rax], r13 randomx_isn_249: - ; IADD_RC r6, r5, 2052724836 - lea r14, [r14+r13+2052724836] + ; IADD_RS r6, r5, LSH 2 + lea r14, [r14+r13*4] randomx_isn_250: ; FADD_R f3, a0 addpd xmm3, xmm8 randomx_isn_251: - ; IADD_R r0, -221201557 - add r8, -221201557 + ; IADD_RS r0, r0, LSH 0 + lea r8, [r8+r8*1] randomx_isn_252: - ; ISUB_M r4, L1[r2] - mov eax, r10d - and eax, 16376 - sub r12, qword ptr [rsi+rax] + ; ISUB_R r4, r2 + sub r12, r10 randomx_isn_253: - ; IADD_RC r5, r4, 256175395 - lea r13, [r13+r12+256175395] + ; IADD_RS r5, r4, 256175395, LSH 0 + lea r13, [r13+r12*1+256175395] randomx_isn_254: - ; IADD_RC r6, r7, 1119815512 - lea r14, [r14+r15+1119815512] + ; IADD_RS r6, r7, LSH 2 + lea r14, [r14+r15*4] randomx_isn_255: ; IROR_R r7, r3 mov ecx, r11d ror r15, cl - mov ebx, 222 ; End marker bytes - db 064h, 067h, 090h ; End marker bytes \ No newline at end of file diff --git a/src/superscalarGenerator.cpp b/src/superscalarGenerator.cpp index e74dc50..e6420d1 100644 --- a/src/superscalarGenerator.cpp +++ b/src/superscalarGenerator.cpp @@ -495,7 +495,7 @@ namespace RandomX { // - this avoids optimizable instruction sequences such as "xor r1, r2; xor r1, r2" or "ror r, C1; ror r, C2" or "add r, C1; add r, C2" // * register r5 cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction) for (unsigned i = 0; i < 8; ++i) { - if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (allowChainedMul || opGroup_ != SuperscalarInstructionType::IMUL_R || registers[i].lastOpGroup != SuperscalarInstructionType::IMUL_R) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != LimitedAddressRegister)) + if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (allowChainedMul || opGroup_ != SuperscalarInstructionType::IMUL_R || registers[i].lastOpGroup != SuperscalarInstructionType::IMUL_R) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != RegisterNeedsDisplacement)) availableRegisters.push_back(i); } return selectRegister(availableRegisters, gen, dst_); @@ -510,8 +510,8 @@ namespace RandomX { } //if there are only 2 available registers for IADD_RS and one of them is r5, select it as the source because it cannot be the destination if (availableRegisters.size() == 2 && info_->getType() == SuperscalarInstructionType::IADD_RS) { - if (availableRegisters[0] == LimitedAddressRegister || availableRegisters[1] == LimitedAddressRegister) { - opGroupPar_ = src_ = LimitedAddressRegister; + if (availableRegisters[0] == RegisterNeedsDisplacement || availableRegisters[1] == RegisterNeedsDisplacement) { + opGroupPar_ = src_ = RegisterNeedsDisplacement; return true; } }