Updated RandomX to v1.0.4

SChernykh · Jun 23, 2019 · 6071f38 · 6071f38
1 parent 2fc19ec
commit 6071f38
Show file tree

Hide file tree

Showing 6 changed files with 83 additions and 50 deletions.
diff --git a/README.md b/README.md
@@ -1,14 +1,14 @@
 # RandomX OpenCL implementation
 
-This repository contains full RandomX implementation for AMD Vega GPUs. The latest version of RandomX (1.0.3 as of June 1st, 2019) is supported.
+This repository contains full RandomX implementation for AMD Vega GPUs. The latest version of RandomX (1.0.4 as of June 23rd, 2019) is supported.
 
 Note: it's only a benchmark/testing tool, not an actual miner. RandomX hashrate is expected to improve somewhat in the future thanks to further optimizations.
 
 GPUs tested so far:
 
 Model|CryptonightR H/S|RandomX H/S|Relative speed
 -----|---------------|-----------|--------------
-AMD Vega 64 (1700/1100 MHz)|2200|1175|53.4%
+AMD Vega 64 (1700/1100 MHz)|2200|1171|53.2%
 
 ## Building on Windows
 

diff --git a/RandomX b/RandomX
diff --git a/RandomX_OpenCL/CL/fillAes1Rx4.cl b/RandomX_OpenCL/CL/fillAes1Rx4.cl
@@ -37,6 +37,25 @@ __kernel void fillAes_name(__global void* state, __global void* out, uint batch_
 
 #if num_rounds != 4
 	const uint k[4] = { AES_KEY_FILL[sub * 4], AES_KEY_FILL[sub * 4 + 1], AES_KEY_FILL[sub * 4 + 2], AES_KEY_FILL[sub * 4 + 3] };
+#else
+	const bool b = (sub < 2);
+	uint k[16];
+	k[ 0] = b ? 0x6421aaddu : 0xb5826f73u;
+	k[ 1] = b ? 0xd1833ddbu : 0xe3d6a7a6u;
+	k[ 2] = b ? 0x2f546d2bu : 0x3d518b6du;
+	k[ 3] = b ? 0x99e5d23fu : 0x229effb4u;
+	k[ 4] = b ? 0xb20e3450u : 0xc7566bf3u;
+	k[ 5] = b ? 0xb6913f55u : 0x9c10b3d9u;
+	k[ 6] = b ? 0x06f79d53u : 0xe9024d4eu;
+	k[ 7] = b ? 0xa5dfcde5u : 0xb272b7d2u;
+	k[ 8] = b ? 0x5c3ed904u : 0xf273c9e7u;
+	k[ 9] = b ? 0x515e7bafu : 0xf765a38bu;
+	k[10] = b ? 0x0aa4679fu : 0x2ba9660au;
+	k[11] = b ? 0x171c02bfu : 0xf63befa7u;
+	k[12] = b ? 0x85623763u : 0x7a7cd609u;
+	k[13] = b ? 0xe78f5d08u : 0x915839deu;
+	k[14] = b ? 0xcd673785u : 0x0c06d1fdu;
+	k[15] = b ? 0xd8ded291u : 0xc0b0762du;
 #endif
 
 	__global uint* s = ((__global uint*) state) + idx * (64 / sizeof(uint)) + sub * (16 / sizeof(uint));
@@ -70,25 +89,25 @@ __kernel void fillAes_name(__global void* state, __global void* out, uint batch_
 		x[2] = y[2];
 		x[3] = y[3];
 #else
-		y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ 0xf890465du;
-		y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ 0x7ffbe4a6u;
-		y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ 0x141f82b7u;
-		y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ 0xcf359e95u;
-
-		x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ 0x6a55c450u;
-		x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ 0xfee8278au;
-		x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ 0xbd5c5ac3u;
-		x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ 0x6741ffdcu;
-
-		y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ 0x114c47a4u;
-		y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ 0xd524fde4u;
-		y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ 0xa7279ad2u;
-		y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ 0x3d324aacu;
-
-		x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ 0x810c3a2au;
-		x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ 0x99a9aeffu;
-		x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ 0x42d3dbd9u;
-		x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ 0x76f6db08u;
+		y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ k[ 0];
+		y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ k[ 1];
+		y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ k[ 2];
+		y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ k[ 3];
+
+		x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ k[ 4];
+		x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ k[ 5];
+		x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ k[ 6];
+		x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ k[ 7];
+
+		y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ k[ 8];
+		y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ k[ 9];
+		y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ k[10];
+		y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ k[11];
+
+		x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ k[12];
+		x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ k[13];
+		x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ k[14];
+		x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ k[15];
 
 		*p = *(uint4*)(x);
 #endif

diff --git a/RandomX_OpenCL/CL/randomx_init.cl b/RandomX_OpenCL/CL/randomx_init.cl
@@ -86,32 +86,35 @@ along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>.
 // 43.5*5 = 217.5 bytes on average
 #define RANDOMX_FREQ_IXOR_M         5
 
-// 15.5*10 = 155 bytes on average
-#define RANDOMX_FREQ_IROR_R        10
+// 15.5*8 = 124 bytes on average
+#define RANDOMX_FREQ_IROR_R         8
+
+// 15.5*2 = 31 bytes on average
+#define RANDOMX_FREQ_IROL_R         2
 
 // 10.5*4 = 42 bytes on average
 #define RANDOMX_FREQ_ISWAP_R        4
 
-// 20*8 = 160 bytes
-#define RANDOMX_FREQ_FSWAP_R        8
+// 20*4 = 80 bytes
+#define RANDOMX_FREQ_FSWAP_R        4
 
-// 8*20 = 160 bytes
-#define RANDOMX_FREQ_FADD_R        20
+// 8*16 = 128 bytes
+#define RANDOMX_FREQ_FADD_R        16
 
 // 40*5 = 200 bytes
 #define RANDOMX_FREQ_FADD_M         5
 
-// 8*20 = 160 bytes
-#define RANDOMX_FREQ_FSUB_R        20
+// 8*16 = 128 bytes
+#define RANDOMX_FREQ_FSUB_R        16
 
 // 40*5 = 200 bytes
 #define RANDOMX_FREQ_FSUB_M         5
 
 // 4*6 = 24 bytes
 #define RANDOMX_FREQ_FSCAL_R        6
 
-// 8*20 = 160 bytes
-#define RANDOMX_FREQ_FMUL_R        20
+// 8*32 = 256 bytes
+#define RANDOMX_FREQ_FMUL_R        32
 
 // 36*4 = 144 bytes
 #define RANDOMX_FREQ_FDIV_M         4
@@ -128,8 +131,8 @@ along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>.
 // 28*16 = 448 bytes
 #define RANDOMX_FREQ_ISTORE        16
 
-// Total: 4804.25 + 4(s_setpc_b64) = 4808.25 bytes on average
-// Real average program size: 4791 bytes
+// Total: 4756.25 + 4(s_setpc_b64) = 4760.25 bytes on average
+// Real average program size: 4743 bytes
 
 ulong getSmallPositiveFloatBits(const ulong entropy)
 {
@@ -666,22 +669,36 @@ __global uint* jit_emit_instruction(__global uint* p, __global uint* last_branch
 	}
 	opcode -= RANDOMX_FREQ_IXOR_M;
 
-	if (opcode < RANDOMX_FREQ_IROR_R)
+	if (opcode < RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R)
 	{
 		if (src != dst) // p = 7/8
 		{
-			// s_lshr_b64 s[32:33], s[16 + dst * 2:17 + dst * 2], s(16 + src * 2)
-			*(p++) = 0x8fa01010u | (dst << 1) | (src << 9);
+			if (opcode < RANDOMX_FREQ_IROR_R)
+			{
+				// s_lshr_b64 s[32:33], s[16 + dst * 2:17 + dst * 2], s(16 + src * 2)
+				*(p++) = 0x8fa01010u | (dst << 1) | (src << 9);
+
+				// s_sub_u32  s15, 64, s(16 + src * 2)
+				*(p++) = 0x808f10c0u | (src << 9);
+
+				// s_lshl_b64 s[34:35], s[16 + dst * 2:17 + dst * 2], s15
+				*(p++) = 0x8ea20f10u | (dst << 1);
+			}
+			else
+			{
+				// s_lshl_b64 s[32:33], s[16 + dst * 2:17 + dst * 2], s(16 + src * 2)
+				*(p++) = 0x8ea01010u | (dst << 1) | (src << 9);
 
-			// s_sub_u32  s15, 64, s(16 + src * 2)
-			*(p++) = 0x808f10c0u | (src << 9);
+				// s_sub_u32  s15, 64, s(16 + src * 2)
+				*(p++) = 0x808f10c0u | (src << 9);
 
-			// s_lshl_b64 s[34:35], s[16 + dst * 2:17 + dst * 2], s15
-			*(p++) = 0x8ea20f10u | (dst << 1);
+				// s_lshr_b64 s[34:35], s[16 + dst * 2:17 + dst * 2], s15
+				*(p++) = 0x8fa20f10u | (dst << 1);
+			}
 		}
 		else // p = 1/8
 		{
-			const uint shift = inst.y & 63;
+			const uint shift = ((opcode < RANDOMX_FREQ_IROR_R) ? inst.y : -inst.y) & 63;
 
 			// s_lshr_b64 s[32:33], s[16 + dst * 2:17 + dst * 2], shift
 			*(p++) = 0x8fa08010u | (dst << 1) | (shift << 8);
@@ -696,7 +713,7 @@ __global uint* jit_emit_instruction(__global uint* p, __global uint* last_branch
 		// 12*7/8 + 8/8 + 4 = 15.5 bytes on average
 		return p;
 	}
-	opcode -= RANDOMX_FREQ_IROR_R;
+	opcode -= RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R;
 
 	if (opcode < RANDOMX_FREQ_ISWAP_R)
 	{
@@ -1137,13 +1154,13 @@ __global uint* generate_jit_code(__global uint2* e, __global uint2* p0, __global
 			}
 			opcode -= RANDOMX_FREQ_IXOR_M;
 
-			if (opcode < RANDOMX_FREQ_IROR_R)
+			if (opcode < RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R)
 			{
 				registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
 				registerWasChanged |= 1u << dst;
 				continue;
 			}
-			opcode -= RANDOMX_FREQ_IROR_R;
+			opcode -= RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R;
 
 			if (opcode < RANDOMX_FREQ_ISWAP_R)
 			{

diff --git a/RandomX_OpenCL/RandomX_OpenCL.vcxproj b/RandomX_OpenCL/RandomX_OpenCL.vcxproj
@@ -58,8 +58,6 @@
       <ConformanceMode>true</ConformanceMode>
       <AdditionalIncludeDirectories>$(CUDA_PATH)\include</AdditionalIncludeDirectories>
       <TreatWarningAsError>true</TreatWarningAsError>
-      <BasicRuntimeChecks>Default</BasicRuntimeChecks>
-      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>
@@ -78,7 +76,6 @@
       <ConformanceMode>true</ConformanceMode>
       <AdditionalIncludeDirectories>$(CUDA_PATH)\include</AdditionalIncludeDirectories>
       <TreatWarningAsError>true</TreatWarningAsError>
-      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
     </ClCompile>
     <Link>
       <SubSystem>Console</SubSystem>

diff --git a/RandomX_OpenCL/tests.cpp b/RandomX_OpenCL/tests.cpp
@@ -189,12 +189,12 @@ bool tests(uint32_t platform_id, uint32_t device_id, size_t intensity)
 
 		if (memcmp(entropy.data() + i * ENTROPY_SIZE, entropy.data() + ENTROPY_SIZE * intensity, ENTROPY_SIZE) != 0)
 		{
-			std::cerr << "fillAes1Rx4_entropy test (entropy) failed!" << std::endl;
+			std::cerr << "fillAes4Rx4_entropy test (entropy) failed!" << std::endl;
 			return false;
 		}
 	}
 
-	std::cout << "fillAes1Rx4_entropy test passed" << std::endl;
+	std::cout << "fillAes4Rx4_entropy test passed" << std::endl;
 
 	kernel = ctx.kernels[CL_HASHAES1RX4];
 	if (!clSetKernelArgs(kernel, scratchpads_gpu, registers_gpu, static_cast<uint32_t>(intensity)))
+11 −0		CMakeLists.txt
+13 −13		README.md
+269 −0		doc/configuration.md
+123 −7		doc/design.md
+56 −26		doc/specs.md
+45 −13		makefile
+21 −0		randomx.sln
+21 −13		src/aes_hash.cpp
+6 −7		src/asm/configuration.asm
+12 −2		src/assembly_generator_x86.cpp
+482 −0		src/bytecode_machine.cpp
+322 −0		src/bytecode_machine.hpp
+25 −7		src/common.hpp
+18 −16		src/configuration.h
+1 −1		src/dataset.cpp
+32 −32		src/instruction.hpp
+0 −40		src/instruction_weights.hpp
+0 −1		src/instructions_portable.cpp
+218 −0		src/intrin_portable.h
+1 −1		src/jit_compiler_x86.cpp
+9 −5		src/jit_compiler_x86_static.S
+17 −13		src/randomx.h
+8 −0		src/reciprocal.c
+6 −0		src/reciprocal.h
+31 −31		src/superscalar.cpp
+18 −19		src/superscalar.hpp
+2 −2		src/superscalar_program.hpp
+117 −0		src/tests/affinity.cpp
+39 −0		src/tests/affinity.hpp
+60 −10		src/tests/benchmark.cpp
+93 −0		src/tests/rng-tests.cpp
+1,031 −0		src/tests/tests.cpp
+39 −0		src/tests/utility.hpp
+2 −0		src/virtual_memory.cpp
+20 −560		src/vm_interpreted.cpp
+4 −28		src/vm_interpreted.hpp
+1 −0		src/vm_interpreted_light.hpp
+1 −0		vcxproj/benchmark.vcxproj
+3 −0		vcxproj/benchmark.vcxproj.filters
+211 −0		vcxproj/randomx-dll.vcxproj
+173 −0		vcxproj/randomx-dll.vcxproj.filters
+4 −1		vcxproj/randomx.vcxproj
+6 −0		vcxproj/randomx.vcxproj.filters
+133 −0		vcxproj/tests.vcxproj
+27 −0		vcxproj/tests.vcxproj.filters