Skip to content

Commit 3b68fbe

Browse files
committed
Blake2 followup changes
- see #398 for original PR
1 parent 7f4ed1c commit 3b68fbe

File tree

8 files changed

+555
-472
lines changed

8 files changed

+555
-472
lines changed

crypto/Contributors.html

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,9 @@ <h3>Code Contributors:</h3>
301301
<li>
302302
<p>zer0x64 (https://github.com/zer0x64) - Performance optimizations for Argon2.</p>
303303
</li>
304+
<li>
305+
<p>Timothy Makkison (https://github.com/TimothyMakkison) - X86 intrinsics implementation for Blake2.</p>
306+
</li>
304307
</ul>
305308
</body>
306309
</html>

crypto/src/crypto/digests/Blake2bDigest.cs

Lines changed: 189 additions & 191 deletions
Large diffs are not rendered by default.

crypto/src/crypto/digests/Blake2b_X86.cs

Lines changed: 40 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -35,31 +35,34 @@ namespace Org.BouncyCastle.Crypto.Digests
3535

3636
internal static class Blake2b_X86
3737
{
38-
public static bool IsSupported => Avx2.IsSupported && BitConverter.IsLittleEndian;
38+
internal static bool IsSupported =>
39+
Org.BouncyCastle.Runtime.Intrinsics.X86.Avx2.IsEnabled &&
40+
Org.BouncyCastle.Runtime.Intrinsics.Vector.IsPackedLittleEndian;
3941

4042
[MethodImpl(MethodImplOptions.AggressiveInlining)]
41-
public static void Compress(bool isFinal, Span<ulong> hashBuffer, ReadOnlySpan<byte> message, ulong totalSegmentsLow, ulong totalSegmentsHigh, ReadOnlySpan<ulong> blakeIV)
43+
internal static void Compress(Span<ulong> hashBuffer, ReadOnlySpan<ulong> blakeIV, ulong t0, ulong t1, ulong f0,
44+
ReadOnlySpan<byte> message)
4245
{
4346
if (!IsSupported)
4447
throw new PlatformNotSupportedException(nameof(Blake2b_X86));
4548

46-
Debug.Assert(message.Length >= Unsafe.SizeOf<ulong>() * 8);
4749
Debug.Assert(hashBuffer.Length >= 8);
50+
Debug.Assert(blakeIV.Length >= 8);
51+
Debug.Assert(message.Length >= 128);
4852

4953
var hashBytes = MemoryMarshal.AsBytes(hashBuffer);
5054
var ivBytes = MemoryMarshal.AsBytes(blakeIV);
5155

52-
var r_14 = isFinal ? ulong.MaxValue : 0;
53-
var t_0 = Vector256.Create(totalSegmentsLow, totalSegmentsHigh, r_14, 0);
56+
var t_0 = Vector256.Create(t0, t1, f0, 0);
5457

55-
Vector256<ulong> row1 = LoadVector256<ulong>(hashBytes);
56-
Vector256<ulong> row2 = LoadVector256<ulong>(hashBytes[Vector256<byte>.Count..]);
57-
Vector256<ulong> row3 = LoadVector256<ulong>(ivBytes);
58-
Vector256<ulong> row4 = LoadVector256<ulong>(ivBytes[Vector256<byte>.Count..]);
58+
var row1 = MemoryMarshal.Read<Vector256<ulong>>(hashBytes);
59+
var row2 = MemoryMarshal.Read<Vector256<ulong>>(hashBytes[32..]);
60+
var row3 = MemoryMarshal.Read<Vector256<ulong>>(ivBytes);
61+
var row4 = MemoryMarshal.Read<Vector256<ulong>>(ivBytes[32..]);
5962
row4 = Avx2.Xor(row4, t_0);
6063

61-
Vector256<ulong> orig_1 = row1;
62-
Vector256<ulong> orig_2 = row2;
64+
var orig_1 = row1;
65+
var orig_2 = row2;
6366

6467
Perform12Rounds(message, ref row1, ref row2, ref row3, ref row4);
6568

@@ -68,21 +71,19 @@ public static void Compress(bool isFinal, Span<ulong> hashBuffer, ReadOnlySpan<b
6871
row1 = Avx2.Xor(row1, orig_1);
6972
row2 = Avx2.Xor(row2, orig_2);
7073

71-
Store(row1, hashBytes);
72-
Store(row2, hashBytes[Vector256<byte>.Count..]);
74+
MemoryMarshal.Write(hashBytes, ref row1);
75+
MemoryMarshal.Write(hashBytes[32..], ref row2);
7376
}
7477

7578
[MethodImpl(MethodImplOptions.AggressiveInlining)]
7679
private static void Perform12Rounds(ReadOnlySpan<byte> m, ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3, ref Vector256<ulong> row4)
7780
{
78-
Debug.Assert(m.Length >= 128);
79-
80-
#region Rounds
81+
#region Rounds
8182
//ROUND 1
82-
var m0 = BroadcastVector128ToVector256<ulong>(m);
83-
var m1 = BroadcastVector128ToVector256<ulong>(m[Unsafe.SizeOf<Vector128<ulong>>()..]);
84-
var m2 = BroadcastVector128ToVector256<ulong>(m[(Unsafe.SizeOf<Vector128<ulong>>() * 2)..]);
85-
var m3 = BroadcastVector128ToVector256<ulong>(m[(Unsafe.SizeOf<Vector128<ulong>>() * 3)..]);
83+
var m0 = Broadcast128ToVector256<ulong>(m);
84+
var m1 = Broadcast128ToVector256<ulong>(m[16..]);
85+
var m2 = Broadcast128ToVector256<ulong>(m[32..]);
86+
var m3 = Broadcast128ToVector256<ulong>(m[48..]);
8687

8788
var t0 = Avx2.UnpackLow(m0, m1);
8889
var t1 = Avx2.UnpackLow(m2, m3);
@@ -92,10 +93,10 @@ private static void Perform12Rounds(ReadOnlySpan<byte> m, ref Vector256<ulong> r
9293
t1 = Avx2.UnpackHigh(m2, m3);
9394
var b2 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
9495

95-
var m4 = BroadcastVector128ToVector256<ulong>(m[(Unsafe.SizeOf<Vector128<ulong>>() * 4)..]);
96-
var m5 = BroadcastVector128ToVector256<ulong>(m[(Unsafe.SizeOf<Vector128<ulong>>() * 5)..]);
97-
var m6 = BroadcastVector128ToVector256<ulong>(m[(Unsafe.SizeOf<Vector128<ulong>>() * 6)..]);
98-
var m7 = BroadcastVector128ToVector256<ulong>(m[(Unsafe.SizeOf<Vector128<ulong>>() * 7)..]);
96+
var m4 = Broadcast128ToVector256<ulong>(m[64..]);
97+
var m5 = Broadcast128ToVector256<ulong>(m[80..]);
98+
var m6 = Broadcast128ToVector256<ulong>(m[96..]);
99+
var m7 = Broadcast128ToVector256<ulong>(m[112..]);
99100

100101
t0 = Avx2.UnpackLow(m7, m4);
101102
t1 = Avx2.UnpackLow(m5, m6);
@@ -315,14 +316,18 @@ private static void Perform12Rounds(ReadOnlySpan<byte> m, ref Vector256<ulong> r
315316
b4 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();
316317

317318
Round(ref row1, ref row2, ref row3, ref row4, b1, b2, b3, b4);
318-
#endregion
319+
#endregion
319320
}
320321

321322
[MethodImpl(MethodImplOptions.AggressiveInlining)]
322-
private static void Round(ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3, ref Vector256<ulong> row4, Vector256<ulong> b1, Vector256<ulong> b2, Vector256<ulong> b3, Vector256<ulong> b4)
323+
private static void Round(ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3,
324+
ref Vector256<ulong> row4, Vector256<ulong> b1, Vector256<ulong> b2, Vector256<ulong> b3,
325+
Vector256<ulong> b4)
323326
{
324-
Vector256<byte> r24 = Vector256.Create((byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
325-
Vector256<byte> r16 = Vector256.Create((byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
327+
Vector256<byte> r24 = Vector256.Create(
328+
(byte)3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
329+
Vector256<byte> r16 = Vector256.Create(
330+
(byte)2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
326331

327332
G1(r24, ref row1, ref row2, ref row3, ref row4, b1);
328333
G2(r16, ref row1, ref row2, ref row3, ref row4, b2);
@@ -352,7 +357,8 @@ private static void Diagonalize(ref Vector256<ulong> row1, ref Vector256<ulong>
352357
}
353358

354359
[MethodImpl(MethodImplOptions.AggressiveInlining)]
355-
private static void G1(Vector256<byte> r24, ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3, ref Vector256<ulong> row4, Vector256<ulong> b0)
360+
private static void G1(Vector256<byte> r24, ref Vector256<ulong> row1, ref Vector256<ulong> row2,
361+
ref Vector256<ulong> row3, ref Vector256<ulong> row4, Vector256<ulong> b0)
356362
{
357363
row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
358364
row4 = Avx2.Xor(row4, row1);
@@ -364,7 +370,8 @@ private static void G1(Vector256<byte> r24, ref Vector256<ulong> row1, ref Vecto
364370
}
365371

366372
[MethodImpl(MethodImplOptions.AggressiveInlining)]
367-
private static void G2(Vector256<byte> r16, ref Vector256<ulong> row1, ref Vector256<ulong> row2, ref Vector256<ulong> row3, ref Vector256<ulong> row4, Vector256<ulong> b0)
373+
private static void G2(Vector256<byte> r16, ref Vector256<ulong> row1, ref Vector256<ulong> row2,
374+
ref Vector256<ulong> row3, ref Vector256<ulong> row4, Vector256<ulong> b0)
368375
{
369376
row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
370377
row4 = Avx2.Xor(row4, row1);
@@ -376,7 +383,8 @@ private static void G2(Vector256<byte> r16, ref Vector256<ulong> row1, ref Vecto
376383
}
377384

378385
[MethodImpl(MethodImplOptions.AggressiveInlining)]
379-
private static void Undiagonalize(ref Vector256<ulong> row1, ref Vector256<ulong> row3, ref Vector256<ulong> row4)
386+
private static void Undiagonalize(ref Vector256<ulong> row1, ref Vector256<ulong> row3,
387+
ref Vector256<ulong> row4)
380388
{
381389
// +-------------------+ +-------------------+
382390
// | 3 | 0 | 1 | 2 | | 0 | 1 | 2 | 3 |
@@ -392,28 +400,12 @@ private static void Undiagonalize(ref Vector256<ulong> row1, ref Vector256<ulong
392400
}
393401

394402
[MethodImpl(MethodImplOptions.AggressiveInlining)]
395-
private static Vector256<T> BroadcastVector128ToVector256<T>(ReadOnlySpan<byte> source) where T : struct
403+
private static Vector256<T> Broadcast128ToVector256<T>(ReadOnlySpan<byte> source) where T : struct
396404
{
397-
Debug.Assert(source.Length >= Unsafe.SizeOf<Vector128<byte>>());
398-
399405
var vector = MemoryMarshal.Read<Vector128<T>>(source);
400406
Vector256<T> result = vector.ToVector256Unsafe();
401407
return result.WithUpper(vector);
402408
}
403-
404-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
405-
private static Vector256<T> LoadVector256<T>(ReadOnlySpan<byte> source) where T : struct
406-
{
407-
Debug.Assert(source.Length >= Unsafe.SizeOf<Vector256<byte>>());
408-
return MemoryMarshal.Read<Vector256<T>>(source);
409-
}
410-
411-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
412-
private static void Store<T>(Vector256<T> vector, Span<byte> destination) where T : struct
413-
{
414-
Debug.Assert(destination.Length >= Unsafe.SizeOf<Vector256<byte>>());
415-
MemoryMarshal.Write(destination, ref vector);
416-
}
417409
}
418410
}
419411
#endif

0 commit comments

Comments
 (0)