Skip to content
This repository has been archived by the owner on Dec 27, 2021. It is now read-only.

Commit

Permalink
Moved more kernels to 2nd file
Browse files Browse the repository at this point in the history
  • Loading branch information
Bendr0id committed Mar 7, 2019
1 parent 414cb8c commit 05986a5
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 83 deletions.
83 changes: 0 additions & 83 deletions src/amd/opencl/cryptonight.cl
Original file line number Diff line number Diff line change
Expand Up @@ -1392,89 +1392,6 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, uint varia
)==="
R"===(

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_xfh(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
{
ulong a[2], b[2];
__local uint AES0[256], AES1[256];

const ulong gIdx = getIdx();

for (int i = get_local_id(0); i < 256; i += WORKSIZE) {
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
}

barrier(CLK_LOCAL_MEM_FENCE);

uint4 b_x;
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
states += 25 * gIdx;
# if (STRIDED_INDEX == 0)
Scratchpad += gIdx * (MEMORY >> 4);
# elif (STRIDED_INDEX == 1)
Scratchpad += gIdx;
# elif(STRIDED_INDEX == 2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
# endif

a[0] = states[0] ^ states[4];
b[0] = states[2] ^ states[6];
a[1] = states[1] ^ states[5];
b[1] = states[3] ^ states[7];

b_x = ((uint4 *)b)[0];
}

mem_fence(CLK_LOCAL_MEM_FENCE);

# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
uint idx0 = a[0];

#pragma unroll UNROLL_FACTOR
for (int i = 0; i < 0x20000; ++i) {
ulong c[2];

((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);

Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x ^ ((uint4 *)c)[0];

uint4 tmp;
tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)];

a[1] += c[0] * as_ulong2(tmp).s0;
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);

Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];

((uint4 *)a)[0] ^= tmp;
idx0 = a[0];

b_x = ((uint4 *)c)[0];

const long2 n = *((__global long2*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
long q = fast_div_heavy(n.s0, as_int4(n).s2 | 0x5);
*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n.s0 ^ q;

idx0 = (~as_int4(n).s2) ^ q;
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}

)==="
R"===(

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_xao(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
{
Expand Down
83 changes: 83 additions & 0 deletions src/amd/opencl/cryptonight2.cl
Original file line number Diff line number Diff line change
@@ -1,4 +1,87 @@
R"===(

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_xfh(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
{
ulong a[2], b[2];
__local uint AES0[256], AES1[256];

const ulong gIdx = getIdx();

for (int i = get_local_id(0); i < 256; i += WORKSIZE) {
const uint tmp = AES0_C[i];
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);
}

barrier(CLK_LOCAL_MEM_FENCE);

uint4 b_x;
# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
states += 25 * gIdx;
# if (STRIDED_INDEX == 0)
Scratchpad += gIdx * (MEMORY >> 4);
# elif (STRIDED_INDEX == 1)
Scratchpad += gIdx;
# elif(STRIDED_INDEX == 2)
Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
# endif

a[0] = states[0] ^ states[4];
b[0] = states[2] ^ states[6];
a[1] = states[1] ^ states[5];
b[1] = states[3] ^ states[7];

b_x = ((uint4 *)b)[0];
}

mem_fence(CLK_LOCAL_MEM_FENCE);

# if (COMP_MODE == 1)
// do not use early return here
if (gIdx < Threads)
# endif
{
uint idx0 = a[0];

#pragma unroll UNROLL_FACTOR
for (int i = 0; i < 0x20000; ++i) {
ulong c[2];

((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);

Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x ^ ((uint4 *)c)[0];

uint4 tmp;
tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)];

a[1] += c[0] * as_ulong2(tmp).s0;
a[0] += mul_hi(c[0], as_ulong2(tmp).s0);

Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];

((uint4 *)a)[0] ^= tmp;
idx0 = a[0];

b_x = ((uint4 *)c)[0];

const long2 n = *((__global long2*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
long q = fast_div_heavy(n.s0, as_int4(n).s2 | 0x5);
*((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n.s0 ^ q;

idx0 = (~as_int4(n).s2) ^ q;
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}

)==="
R"===(
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1_double(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
{
Expand Down

0 comments on commit 05986a5

Please sign in to comment.