diff --git a/src/amd/opencl/cryptonight.cl b/src/amd/opencl/cryptonight.cl
index 3114e8bc..c358936e 100644
--- a/src/amd/opencl/cryptonight.cl
+++ b/src/amd/opencl/cryptonight.cl
@@ -1392,89 +1392,6 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, uint varia
 )==="
 R"===(
 
-__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void cn1_xfh(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
-{
-    ulong a[2], b[2];
-    __local uint AES0[256], AES1[256];
-
-    const ulong gIdx = getIdx();
-
-    for (int i = get_local_id(0); i < 256; i += WORKSIZE) {
-        const uint tmp = AES0_C[i];
-        AES0[i] = tmp;
-        AES1[i] = rotate(tmp, 8U);
-    }
-
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    uint4 b_x;
-#   if (COMP_MODE == 1)
-    // do not use early return here
-    if (gIdx < Threads)
-#   endif
-    {
-        states += 25 * gIdx;
-#       if (STRIDED_INDEX == 0)
-        Scratchpad += gIdx * (MEMORY >> 4);
-#       elif (STRIDED_INDEX == 1)
-        Scratchpad += gIdx;
-#       elif(STRIDED_INDEX == 2)
-        Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
-#       endif
-
-        a[0] = states[0] ^ states[4];
-        b[0] = states[2] ^ states[6];
-        a[1] = states[1] ^ states[5];
-        b[1] = states[3] ^ states[7];
-
-        b_x = ((uint4 *)b)[0];
-    }
-
-    mem_fence(CLK_LOCAL_MEM_FENCE);
-
-#   if (COMP_MODE == 1)
-    // do not use early return here
-    if (gIdx < Threads)
-#   endif
-    {
-        uint idx0 = a[0];
-
-        #pragma unroll UNROLL_FACTOR
-        for (int i = 0; i < 0x20000; ++i) {
-            ulong c[2];
-
-            ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
-            ((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
-
-            Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x ^ ((uint4 *)c)[0];
-
-            uint4 tmp;
-            tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)];
-
-            a[1] += c[0] * as_ulong2(tmp).s0;
-            a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
-
-            Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];
-
-            ((uint4 *)a)[0] ^= tmp;
-            idx0 = a[0];
-
-            b_x = ((uint4 *)c)[0];
-
-            const long2 n = *((__global long2*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
-            long q = fast_div_heavy(n.s0, as_int4(n).s2 | 0x5);
-            *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n.s0 ^ q;
-
-            idx0 = (~as_int4(n).s2) ^ q;
-        }
-    }
-    mem_fence(CLK_GLOBAL_MEM_FENCE);
-}
-
-)==="
-R"===(
-
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void cn1_xao(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
 {
diff --git a/src/amd/opencl/cryptonight2.cl b/src/amd/opencl/cryptonight2.cl
index 8569037b..dff58c2e 100644
--- a/src/amd/opencl/cryptonight2.cl
+++ b/src/amd/opencl/cryptonight2.cl
@@ -1,4 +1,87 @@
 R"===(
+
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void cn1_xfh(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
+{
+    ulong a[2], b[2];
+    __local uint AES0[256], AES1[256];
+
+    const ulong gIdx = getIdx();
+
+    for (int i = get_local_id(0); i < 256; i += WORKSIZE) {
+        const uint tmp = AES0_C[i];
+        AES0[i] = tmp;
+        AES1[i] = rotate(tmp, 8U);
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    uint4 b_x;
+#   if (COMP_MODE == 1)
+    // do not use early return here
+    if (gIdx < Threads)
+#   endif
+    {
+        states += 25 * gIdx;
+#       if (STRIDED_INDEX == 0)
+        Scratchpad += gIdx * (MEMORY >> 4);
+#       elif (STRIDED_INDEX == 1)
+        Scratchpad += gIdx;
+#       elif(STRIDED_INDEX == 2)
+        Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0);
+#       endif
+
+        a[0] = states[0] ^ states[4];
+        b[0] = states[2] ^ states[6];
+        a[1] = states[1] ^ states[5];
+        b[1] = states[3] ^ states[7];
+
+        b_x = ((uint4 *)b)[0];
+    }
+
+    mem_fence(CLK_LOCAL_MEM_FENCE);
+
+#   if (COMP_MODE == 1)
+    // do not use early return here
+    if (gIdx < Threads)
+#   endif
+    {
+        uint idx0 = a[0];
+
+        #pragma unroll UNROLL_FACTOR
+        for (int i = 0; i < 0x20000; ++i) {
+            ulong c[2];
+
+            ((uint4 *)c)[0] = Scratchpad[IDX((idx0 & MASK) >> 4)];
+            ((uint4 *)c)[0] = AES_Round_Two_Tables(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]);
+
+            Scratchpad[IDX((idx0 & MASK) >> 4)] = b_x ^ ((uint4 *)c)[0];
+
+            uint4 tmp;
+            tmp = Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)];
+
+            a[1] += c[0] * as_ulong2(tmp).s0;
+            a[0] += mul_hi(c[0], as_ulong2(tmp).s0);
+
+            Scratchpad[IDX((as_uint2(c[0]).s0 & MASK) >> 4)] = ((uint4 *)a)[0];
+
+            ((uint4 *)a)[0] ^= tmp;
+            idx0 = a[0];
+
+            b_x = ((uint4 *)c)[0];
+
+            const long2 n = *((__global long2*)(Scratchpad + (IDX((idx0 & MASK) >> 4))));
+            long q = fast_div_heavy(n.s0, as_int4(n).s2 | 0x5);
+            *((__global long*)(Scratchpad + (IDX((idx0 & MASK) >> 4)))) = n.s0 ^ q;
+
+            idx0 = (~as_int4(n).s2) ^ q;
+        }
+    }
+    mem_fence(CLK_GLOBAL_MEM_FENCE);
+}
+
+)==="
+R"===(
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __kernel void cn1_double(__global uint4 *Scratchpad, __global ulong *states, uint variant, __global ulong *input, uint Threads)
 {