diff --git a/src/amd/opencl/cryptonight.cl b/src/amd/opencl/cryptonight.cl index 3114e8bc..c358936e 100644 --- a/src/amd/opencl/cryptonight.cl +++ b/src/amd/opencl/cryptonight.cl @@ -1392,89 +1392,6 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, uint varia )===" R"===( -__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void cn1_xfh(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads) -{ - ulong a[2], b[2]; - __local uint AES0[256], AES1[256]; - - const ulong gIdx = getIdx(); - - for (int i = get_local_id(0); i < 256; i += WORKSIZE) { - const uint tmp = AES0_C[i]; - AES0[i] = tmp; - AES1[i] = rotate(tmp, 8U); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - uint4 b_x; -# if (COMP_MODE == 1) - // do not use early return here - if (gIdx < Threads) -# endif - { - states += 25 * gIdx; -# if (STRIDED_INDEX == 0) - Scratchpad += gIdx * (MEMORY >> 4); -# elif (STRIDED_INDEX == 1) - Scratchpad += gIdx; -# elif(STRIDED_INDEX == 2) - Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); -# endif - - a[0] = states[0] ^ states[4]; - b[0] = states[2] ^ states[6]; - a[1] = states[1] ^ states[5]; - b[1] = states[3] ^ states[7]; - - b_x = ((uint4 *)b)[0]; - } - - mem_fence(CLK_LOCAL_MEM_FENCE); - -# if (COMP_MODE == 1) - // do not use early return here - if (gIdx < Threads) -# endif - { - uint idx0 = a[0]; - - #pragma unroll UNROLL_FACTOR - for (int i = 0; i < 0x20000; ++i) { - ulong c[2]; - - ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)]; - ((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]); - - Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x ^ ((uint4 *)c)[0]; - - uint4 tmp; - tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)]; - - a[1] += c[0] * as_ulong2(tmp).s0; - a[0] += mul_hi(c[0], as_ulong2(tmp).s0); - - Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0]; - - ((uint4 *)a)[0] ^= tmp; - idx0 = a[0]; - - b_x = ((uint4 *)c)[0]; - - const long2 n = *((__global long2*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))); - long q = fast_div_heavy(n.s0, as_int4(n).s2 | 0x5); - *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n.s0 ^ q; - - idx0 = (~as_int4(n).s2) ^ q; - } - } - mem_fence(CLK_GLOBAL_MEM_FENCE); -} - -)===" -R"===( - __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void cn1_xao(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads) { diff --git a/src/amd/opencl/cryptonight2.cl b/src/amd/opencl/cryptonight2.cl index 8569037b..dff58c2e 100644 --- a/src/amd/opencl/cryptonight2.cl +++ b/src/amd/opencl/cryptonight2.cl @@ -1,4 +1,87 @@ R"===( + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void cn1_xfh(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads) +{ + ulong a[2], b[2]; + __local uint AES0[256], AES1[256]; + + const ulong gIdx = getIdx(); + + for (int i = get_local_id(0); i < 256; i += WORKSIZE) { + const uint tmp = AES0_C[i]; + AES0[i] = tmp; + AES1[i] = rotate(tmp, 8U); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + uint4 b_x; +# if (COMP_MODE == 1) + // do not use early return here + if (gIdx < Threads) +# endif + { + states += 25 * gIdx; +# if (STRIDED_INDEX == 0) + Scratchpad += gIdx * (MEMORY >> 4); +# elif (STRIDED_INDEX == 1) + Scratchpad += gIdx; +# elif(STRIDED_INDEX == 2) + Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); +# endif + + a[0] = states[0] ^ states[4]; + b[0] = states[2] ^ states[6]; + a[1] = states[1] ^ states[5]; + b[1] = states[3] ^ states[7]; + + b_x = ((uint4 *)b)[0]; + } + + mem_fence(CLK_LOCAL_MEM_FENCE); + +# if (COMP_MODE == 1) + // do not use early return here + if (gIdx < Threads) +# endif + { + uint idx0 = a[0]; + + #pragma unroll UNROLL_FACTOR + for (int i = 0; i < 0x20000; ++i) { + ulong c[2]; + + ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)]; + ((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]); + + Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x ^ ((uint4 *)c)[0]; + + uint4 tmp; + tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)]; + + a[1] += c[0] * as_ulong2(tmp).s0; + a[0] += mul_hi(c[0], as_ulong2(tmp).s0); + + Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0]; + + ((uint4 *)a)[0] ^= tmp; + idx0 = a[0]; + + b_x = ((uint4 *)c)[0]; + + const long2 n = *((__global long2*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))); + long q = fast_div_heavy(n.s0, as_int4(n).s2 | 0x5); + *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n.s0 ^ q; + + idx0 = (~as_int4(n).s2) ^ q; + } + } + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +)===" +R"===( __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void cn1_double(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads) {