Skip to content

Commit

Permalink
Updated RandomX to v1.0.4
Browse files Browse the repository at this point in the history
  • Loading branch information
SChernykh committed Jun 23, 2019
1 parent 2fc19ec commit 6071f38
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 50 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# RandomX OpenCL implementation

This repository contains full RandomX implementation for AMD Vega GPUs. The latest version of RandomX (1.0.3 as of June 1st, 2019) is supported.
This repository contains full RandomX implementation for AMD Vega GPUs. The latest version of RandomX (1.0.4 as of June 23rd, 2019) is supported.

Note: it's only a benchmark/testing tool, not an actual miner. RandomX hashrate is expected to improve somewhat in the future thanks to further optimizations.

GPUs tested so far:

Model|CryptonightR H/S|RandomX H/S|Relative speed
-----|---------------|-----------|--------------
AMD Vega 64 (1700/1100 MHz)|2200|1175|53.4%
AMD Vega 64 (1700/1100 MHz)|2200|1171|53.2%

## Building on Windows

Expand Down
2 changes: 1 addition & 1 deletion RandomX
57 changes: 38 additions & 19 deletions RandomX_OpenCL/CL/fillAes1Rx4.cl
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,25 @@ __kernel void fillAes_name(__global void* state, __global void* out, uint batch_

#if num_rounds != 4
const uint k[4] = { AES_KEY_FILL[sub * 4], AES_KEY_FILL[sub * 4 + 1], AES_KEY_FILL[sub * 4 + 2], AES_KEY_FILL[sub * 4 + 3] };
#else
const bool b = (sub < 2);
uint k[16];
k[ 0] = b ? 0x6421aaddu : 0xb5826f73u;
k[ 1] = b ? 0xd1833ddbu : 0xe3d6a7a6u;
k[ 2] = b ? 0x2f546d2bu : 0x3d518b6du;
k[ 3] = b ? 0x99e5d23fu : 0x229effb4u;
k[ 4] = b ? 0xb20e3450u : 0xc7566bf3u;
k[ 5] = b ? 0xb6913f55u : 0x9c10b3d9u;
k[ 6] = b ? 0x06f79d53u : 0xe9024d4eu;
k[ 7] = b ? 0xa5dfcde5u : 0xb272b7d2u;
k[ 8] = b ? 0x5c3ed904u : 0xf273c9e7u;
k[ 9] = b ? 0x515e7bafu : 0xf765a38bu;
k[10] = b ? 0x0aa4679fu : 0x2ba9660au;
k[11] = b ? 0x171c02bfu : 0xf63befa7u;
k[12] = b ? 0x85623763u : 0x7a7cd609u;
k[13] = b ? 0xe78f5d08u : 0x915839deu;
k[14] = b ? 0xcd673785u : 0x0c06d1fdu;
k[15] = b ? 0xd8ded291u : 0xc0b0762du;
#endif

__global uint* s = ((__global uint*) state) + idx * (64 / sizeof(uint)) + sub * (16 / sizeof(uint));
Expand Down Expand Up @@ -70,25 +89,25 @@ __kernel void fillAes_name(__global void* state, __global void* out, uint batch_
x[2] = y[2];
x[3] = y[3];
#else
y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ 0xf890465du;
y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ 0x7ffbe4a6u;
y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ 0x141f82b7u;
y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ 0xcf359e95u;

x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ 0x6a55c450u;
x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ 0xfee8278au;
x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ 0xbd5c5ac3u;
x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ 0x6741ffdcu;

y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ 0x114c47a4u;
y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ 0xd524fde4u;
y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ 0xa7279ad2u;
y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ 0x3d324aacu;

x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ 0x810c3a2au;
x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ 0x99a9aeffu;
x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ 0x42d3dbd9u;
x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ 0x76f6db08u;
y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ k[ 0];
y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ k[ 1];
y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ k[ 2];
y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ k[ 3];

x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ k[ 4];
x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ k[ 5];
x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ k[ 6];
x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ k[ 7];

y[0] = t0[get_byte(x[0], 0)] ^ t1[get_byte(x[1], s1)] ^ t2[get_byte(x[2], 16)] ^ t3[get_byte(x[3], s3)] ^ k[ 8];
y[1] = t0[get_byte(x[1], 0)] ^ t1[get_byte(x[2], s1)] ^ t2[get_byte(x[3], 16)] ^ t3[get_byte(x[0], s3)] ^ k[ 9];
y[2] = t0[get_byte(x[2], 0)] ^ t1[get_byte(x[3], s1)] ^ t2[get_byte(x[0], 16)] ^ t3[get_byte(x[1], s3)] ^ k[10];
y[3] = t0[get_byte(x[3], 0)] ^ t1[get_byte(x[0], s1)] ^ t2[get_byte(x[1], 16)] ^ t3[get_byte(x[2], s3)] ^ k[11];

x[0] = t0[get_byte(y[0], 0)] ^ t1[get_byte(y[1], s1)] ^ t2[get_byte(y[2], 16)] ^ t3[get_byte(y[3], s3)] ^ k[12];
x[1] = t0[get_byte(y[1], 0)] ^ t1[get_byte(y[2], s1)] ^ t2[get_byte(y[3], 16)] ^ t3[get_byte(y[0], s3)] ^ k[13];
x[2] = t0[get_byte(y[2], 0)] ^ t1[get_byte(y[3], s1)] ^ t2[get_byte(y[0], 16)] ^ t3[get_byte(y[1], s3)] ^ k[14];
x[3] = t0[get_byte(y[3], 0)] ^ t1[get_byte(y[0], s1)] ^ t2[get_byte(y[1], 16)] ^ t3[get_byte(y[2], s3)] ^ k[15];

*p = *(uint4*)(x);
#endif
Expand Down
63 changes: 40 additions & 23 deletions RandomX_OpenCL/CL/randomx_init.cl
Original file line number Diff line number Diff line change
Expand Up @@ -86,32 +86,35 @@ along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>.
// 43.5*5 = 217.5 bytes on average
#define RANDOMX_FREQ_IXOR_M 5

// 15.5*10 = 155 bytes on average
#define RANDOMX_FREQ_IROR_R 10
// 15.5*8 = 124 bytes on average
#define RANDOMX_FREQ_IROR_R 8

// 15.5*2 = 31 bytes on average
#define RANDOMX_FREQ_IROL_R 2

// 10.5*4 = 42 bytes on average
#define RANDOMX_FREQ_ISWAP_R 4

// 20*8 = 160 bytes
#define RANDOMX_FREQ_FSWAP_R 8
// 20*4 = 80 bytes
#define RANDOMX_FREQ_FSWAP_R 4

// 8*20 = 160 bytes
#define RANDOMX_FREQ_FADD_R 20
// 8*16 = 128 bytes
#define RANDOMX_FREQ_FADD_R 16

// 40*5 = 200 bytes
#define RANDOMX_FREQ_FADD_M 5

// 8*20 = 160 bytes
#define RANDOMX_FREQ_FSUB_R 20
// 8*16 = 128 bytes
#define RANDOMX_FREQ_FSUB_R 16

// 40*5 = 200 bytes
#define RANDOMX_FREQ_FSUB_M 5

// 4*6 = 24 bytes
#define RANDOMX_FREQ_FSCAL_R 6

// 8*20 = 160 bytes
#define RANDOMX_FREQ_FMUL_R 20
// 8*32 = 256 bytes
#define RANDOMX_FREQ_FMUL_R 32

// 36*4 = 144 bytes
#define RANDOMX_FREQ_FDIV_M 4
Expand All @@ -128,8 +131,8 @@ along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>.
// 28*16 = 448 bytes
#define RANDOMX_FREQ_ISTORE 16

// Total: 4804.25 + 4(s_setpc_b64) = 4808.25 bytes on average
// Real average program size: 4791 bytes
// Total: 4756.25 + 4(s_setpc_b64) = 4760.25 bytes on average
// Real average program size: 4743 bytes

ulong getSmallPositiveFloatBits(const ulong entropy)
{
Expand Down Expand Up @@ -666,22 +669,36 @@ __global uint* jit_emit_instruction(__global uint* p, __global uint* last_branch
}
opcode -= RANDOMX_FREQ_IXOR_M;

if (opcode < RANDOMX_FREQ_IROR_R)
if (opcode < RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R)
{
if (src != dst) // p = 7/8
{
// s_lshr_b64 s[32:33], s[16 + dst * 2:17 + dst * 2], s(16 + src * 2)
*(p++) = 0x8fa01010u | (dst << 1) | (src << 9);
if (opcode < RANDOMX_FREQ_IROR_R)
{
// s_lshr_b64 s[32:33], s[16 + dst * 2:17 + dst * 2], s(16 + src * 2)
*(p++) = 0x8fa01010u | (dst << 1) | (src << 9);

// s_sub_u32 s15, 64, s(16 + src * 2)
*(p++) = 0x808f10c0u | (src << 9);

// s_lshl_b64 s[34:35], s[16 + dst * 2:17 + dst * 2], s15
*(p++) = 0x8ea20f10u | (dst << 1);
}
else
{
// s_lshl_b64 s[32:33], s[16 + dst * 2:17 + dst * 2], s(16 + src * 2)
*(p++) = 0x8ea01010u | (dst << 1) | (src << 9);

// s_sub_u32 s15, 64, s(16 + src * 2)
*(p++) = 0x808f10c0u | (src << 9);
// s_sub_u32 s15, 64, s(16 + src * 2)
*(p++) = 0x808f10c0u | (src << 9);

// s_lshl_b64 s[34:35], s[16 + dst * 2:17 + dst * 2], s15
*(p++) = 0x8ea20f10u | (dst << 1);
// s_lshr_b64 s[34:35], s[16 + dst * 2:17 + dst * 2], s15
*(p++) = 0x8fa20f10u | (dst << 1);
}
}
else // p = 1/8
{
const uint shift = inst.y & 63;
const uint shift = ((opcode < RANDOMX_FREQ_IROR_R) ? inst.y : -inst.y) & 63;

// s_lshr_b64 s[32:33], s[16 + dst * 2:17 + dst * 2], shift
*(p++) = 0x8fa08010u | (dst << 1) | (shift << 8);
Expand All @@ -696,7 +713,7 @@ __global uint* jit_emit_instruction(__global uint* p, __global uint* last_branch
// 12*7/8 + 8/8 + 4 = 15.5 bytes on average
return p;
}
opcode -= RANDOMX_FREQ_IROR_R;
opcode -= RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R;

if (opcode < RANDOMX_FREQ_ISWAP_R)
{
Expand Down Expand Up @@ -1137,13 +1154,13 @@ __global uint* generate_jit_code(__global uint2* e, __global uint2* p0, __global
}
opcode -= RANDOMX_FREQ_IXOR_M;

if (opcode < RANDOMX_FREQ_IROR_R)
if (opcode < RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R)
{
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
registerWasChanged |= 1u << dst;
continue;
}
opcode -= RANDOMX_FREQ_IROR_R;
opcode -= RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R;

if (opcode < RANDOMX_FREQ_ISWAP_R)
{
Expand Down
3 changes: 0 additions & 3 deletions RandomX_OpenCL/RandomX_OpenCL.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,6 @@
<ConformanceMode>true</ConformanceMode>
<AdditionalIncludeDirectories>$(CUDA_PATH)\include</AdditionalIncludeDirectories>
<TreatWarningAsError>true</TreatWarningAsError>
<BasicRuntimeChecks>Default</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
Expand All @@ -78,7 +76,6 @@
<ConformanceMode>true</ConformanceMode>
<AdditionalIncludeDirectories>$(CUDA_PATH)\include</AdditionalIncludeDirectories>
<TreatWarningAsError>true</TreatWarningAsError>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
Expand Down
4 changes: 2 additions & 2 deletions RandomX_OpenCL/tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,12 +189,12 @@ bool tests(uint32_t platform_id, uint32_t device_id, size_t intensity)

if (memcmp(entropy.data() + i * ENTROPY_SIZE, entropy.data() + ENTROPY_SIZE * intensity, ENTROPY_SIZE) != 0)
{
std::cerr << "fillAes1Rx4_entropy test (entropy) failed!" << std::endl;
std::cerr << "fillAes4Rx4_entropy test (entropy) failed!" << std::endl;
return false;
}
}

std::cout << "fillAes1Rx4_entropy test passed" << std::endl;
std::cout << "fillAes4Rx4_entropy test passed" << std::endl;

kernel = ctx.kernels[CL_HASHAES1RX4];
if (!clSetKernelArgs(kernel, scratchpads_gpu, registers_gpu, static_cast<uint32_t>(intensity)))
Expand Down

0 comments on commit 6071f38

Please sign in to comment.