diff --git a/src/crypto/randomx/aes_hash.cpp b/src/crypto/randomx/aes_hash.cpp index ae7c67eead..89025e355d 100644 --- a/src/crypto/randomx/aes_hash.cpp +++ b/src/crypto/randomx/aes_hash.cpp @@ -239,3 +239,84 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) { template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); + +template +void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) { + uint8_t* scratchpadPtr = (uint8_t*)scratchpad; + const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize; + + // initial state + rx_vec_i128 hash_state0 = rx_set_int_vec_i128(AES_HASH_1R_STATE0); + rx_vec_i128 hash_state1 = rx_set_int_vec_i128(AES_HASH_1R_STATE1); + rx_vec_i128 hash_state2 = rx_set_int_vec_i128(AES_HASH_1R_STATE2); + rx_vec_i128 hash_state3 = rx_set_int_vec_i128(AES_HASH_1R_STATE3); + + const rx_vec_i128 key0 = rx_set_int_vec_i128(AES_GEN_1R_KEY0); + const rx_vec_i128 key1 = rx_set_int_vec_i128(AES_GEN_1R_KEY1); + const rx_vec_i128 key2 = rx_set_int_vec_i128(AES_GEN_1R_KEY2); + const rx_vec_i128 key3 = rx_set_int_vec_i128(AES_GEN_1R_KEY3); + + rx_vec_i128 fill_state0 = rx_load_vec_i128((rx_vec_i128*)fill_state + 0); + rx_vec_i128 fill_state1 = rx_load_vec_i128((rx_vec_i128*)fill_state + 1); + rx_vec_i128 fill_state2 = rx_load_vec_i128((rx_vec_i128*)fill_state + 2); + rx_vec_i128 fill_state3 = rx_load_vec_i128((rx_vec_i128*)fill_state + 3); + + constexpr int PREFETCH_DISTANCE = 4096; + const char* prefetchPtr = ((const char*)scratchpad) + PREFETCH_DISTANCE; + scratchpadEnd -= PREFETCH_DISTANCE; + + for (int i = 0; i < 2; ++i) { + //process 64 bytes at a time in 4 lanes + while (scratchpadPtr < scratchpadEnd) { + hash_state0 = aesenc(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 0)); + hash_state1 = aesdec(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 1)); + hash_state2 = aesenc(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 2)); + hash_state3 = aesdec(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 3)); + + fill_state0 = aesdec(fill_state0, key0); + fill_state1 = aesenc(fill_state1, key1); + fill_state2 = aesdec(fill_state2, key2); + fill_state3 = aesenc(fill_state3, key3); + + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 0, fill_state0); + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 1, fill_state1); + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 2, fill_state2); + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 3, fill_state3); + + rx_prefetch_t0(prefetchPtr); + + scratchpadPtr += 64; + prefetchPtr += 64; + } + prefetchPtr = (const char*) scratchpad; + scratchpadEnd += PREFETCH_DISTANCE; + } + + rx_store_vec_i128((rx_vec_i128*)fill_state + 0, fill_state0); + rx_store_vec_i128((rx_vec_i128*)fill_state + 1, fill_state1); + rx_store_vec_i128((rx_vec_i128*)fill_state + 2, fill_state2); + rx_store_vec_i128((rx_vec_i128*)fill_state + 3, fill_state3); + + //two extra rounds to achieve full diffusion + rx_vec_i128 xkey0 = rx_set_int_vec_i128(AES_HASH_1R_XKEY0); + rx_vec_i128 xkey1 = rx_set_int_vec_i128(AES_HASH_1R_XKEY1); + + hash_state0 = aesenc(hash_state0, xkey0); + hash_state1 = aesdec(hash_state1, xkey0); + hash_state2 = aesenc(hash_state2, xkey0); + hash_state3 = aesdec(hash_state3, xkey0); + + hash_state0 = aesenc(hash_state0, xkey1); + hash_state1 = aesdec(hash_state1, xkey1); + hash_state2 = aesenc(hash_state2, xkey1); + hash_state3 = aesdec(hash_state3, xkey1); + + //output hash + rx_store_vec_i128((rx_vec_i128*)hash + 0, hash_state0); + rx_store_vec_i128((rx_vec_i128*)hash + 1, hash_state1); + rx_store_vec_i128((rx_vec_i128*)hash + 2, hash_state2); + rx_store_vec_i128((rx_vec_i128*)hash + 3, hash_state3); +} + +template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); +template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); \ No newline at end of file diff --git a/src/crypto/randomx/aes_hash.hpp b/src/crypto/randomx/aes_hash.hpp index b4d0e94053..0dc87d25ef 100644 --- a/src/crypto/randomx/aes_hash.hpp +++ b/src/crypto/randomx/aes_hash.hpp @@ -38,3 +38,6 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer); template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); + +template +void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); \ No newline at end of file diff --git a/src/crypto/randomx/allocator.cpp b/src/crypto/randomx/allocator.cpp index 2e1f46060b..3d0e03596b 100644 --- a/src/crypto/randomx/allocator.cpp +++ b/src/crypto/randomx/allocator.cpp @@ -47,7 +47,7 @@ namespace randomx { rx_aligned_free(ptr); } - template class AlignedAllocator; + template struct AlignedAllocator; void* LargePageAllocator::allocMemory(size_t count) { return allocLargePagesMemory(count); diff --git a/src/crypto/randomx/argon2_core.c b/src/crypto/randomx/argon2_core.c index c885a0a063..9e50b7979e 100644 --- a/src/crypto/randomx/argon2_core.c +++ b/src/crypto/randomx/argon2_core.c @@ -307,13 +307,13 @@ void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instanc store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 0); store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4, l); - rxa2_blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash, + blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash, ARGON2_PREHASH_SEED_LENGTH); load_block(&instance->memory[l * instance->lane_length + 0], blockhash_bytes); store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 1); - rxa2_blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash, + blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash, ARGON2_PREHASH_SEED_LENGTH); load_block(&instance->memory[l * instance->lane_length + 1], blockhash_bytes); diff --git a/src/crypto/randomx/blake2/blake2.h b/src/crypto/randomx/blake2/blake2.h index 7ac301ce43..2db10519d7 100644 --- a/src/crypto/randomx/blake2/blake2.h +++ b/src/crypto/randomx/blake2/blake2.h @@ -84,6 +84,15 @@ extern "C" { 1 / !!(sizeof(blake2b_param) == sizeof(uint64_t) * CHAR_BIT) }; + //randomx namespace + #define blake2b_init randomx_blake2b_init + #define blake2b_init_key randomx_blake2b_init_key + #define blake2b_init_param randomx_blake2b_init_param + #define blake2b_update randomx_blake2b_update + #define blake2b_final randomx_blake2b_final + #define blake2b randomx_blake2b + #define blake2b_long randomx_blake2b_long + /* Streaming API */ int blake2b_init(blake2b_state *S, size_t outlen); int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, diff --git a/src/crypto/randomx/blake2/blake2b.c b/src/crypto/randomx/blake2/blake2b.c index 73a741ba51..924d27e51b 100644 --- a/src/crypto/randomx/blake2/blake2b.c +++ b/src/crypto/randomx/blake2/blake2b.c @@ -347,7 +347,7 @@ int blake2b(void *out, size_t outlen, const void *in, size_t inlen, } /* Argon2 Team - Begin Code */ -int rxa2_blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) { +int blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) { uint8_t *out = (uint8_t *)pout; blake2b_state blake_state; uint8_t outlen_bytes[sizeof(uint32_t)] = { 0 }; diff --git a/src/crypto/randomx/intrin_portable.h b/src/crypto/randomx/intrin_portable.h index e072fd7b34..ec5dbfcf63 100644 --- a/src/crypto/randomx/intrin_portable.h +++ b/src/crypto/randomx/intrin_portable.h @@ -102,6 +102,7 @@ typedef __m128d rx_vec_f128; #define rx_aligned_alloc(a, b) _mm_malloc(a,b) #define rx_aligned_free(a) _mm_free(a) #define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) +#define rx_prefetch_t0(x) _mm_prefetch((const char *)(x), _MM_HINT_T0) #define rx_load_vec_f128 _mm_load_pd #define rx_store_vec_f128 _mm_store_pd @@ -201,6 +202,7 @@ typedef union{ #define rx_aligned_alloc(a, b) malloc(a) #define rx_aligned_free(a) free(a) #define rx_prefetch_nta(x) +#define rx_prefetch_t0(x) /* Splat 64-bit long long to 2 64-bit long longs */ FORCE_INLINE __m128i vec_splat2sd (int64_t scalar) @@ -399,6 +401,10 @@ inline void rx_prefetch_nta(void* ptr) { asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr)); } +inline void rx_prefetch_t0(const void* ptr) { + asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr)); +} + FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { return vld1q_f64((const float64_t*)pd); } @@ -532,6 +538,7 @@ typedef union { #define rx_aligned_alloc(a, b) malloc(a) #define rx_aligned_free(a) free(a) #define rx_prefetch_nta(x) +#define rx_prefetch_t0(x) FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { rx_vec_f128 x; diff --git a/src/crypto/randomx/jit_compiler_a64.hpp b/src/crypto/randomx/jit_compiler_a64.hpp index d25403835a..8e51feba8c 100644 --- a/src/crypto/randomx/jit_compiler_a64.hpp +++ b/src/crypto/randomx/jit_compiler_a64.hpp @@ -38,7 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace randomx { class Program; - class ProgramConfiguration; + struct ProgramConfiguration; class SuperscalarProgram; class Instruction; diff --git a/src/crypto/randomx/jit_compiler_a64_static.S b/src/crypto/randomx/jit_compiler_a64_static.S index 660721cc32..d16cfda62c 100644 --- a/src/crypto/randomx/jit_compiler_a64_static.S +++ b/src/crypto/randomx/jit_compiler_a64_static.S @@ -100,6 +100,7 @@ # v30 -> E 'or' mask = 0x3*00000000******3*00000000****** # v31 -> scale mask = 0x81f000000000000081f0000000000000 + .balign 4 randomx_program_aarch64: # Save callee-saved registers sub sp, sp, 192 diff --git a/src/crypto/randomx/jit_compiler_fallback.hpp b/src/crypto/randomx/jit_compiler_fallback.hpp index 063d2e3286..9036ca2a44 100644 --- a/src/crypto/randomx/jit_compiler_fallback.hpp +++ b/src/crypto/randomx/jit_compiler_fallback.hpp @@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace randomx { class Program; - class ProgramConfiguration; + struct ProgramConfiguration; class SuperscalarProgram; class JitCompilerFallback { diff --git a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp index d8c2326935..44503519c8 100644 --- a/src/crypto/randomx/jit_compiler_x86.cpp +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -295,7 +295,7 @@ namespace randomx { void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) { instructionOffsets.clear(); - for (unsigned i = 0; i < 8; ++i) { + for (unsigned i = 0; i < RegistersCount; ++i) { registerUsage[i] = -1; } diff --git a/src/crypto/randomx/jit_compiler_x86.hpp b/src/crypto/randomx/jit_compiler_x86.hpp index 0cedd2901e..764fe48dcf 100644 --- a/src/crypto/randomx/jit_compiler_x86.hpp +++ b/src/crypto/randomx/jit_compiler_x86.hpp @@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace randomx { class Program; - class ProgramConfiguration; + struct ProgramConfiguration; class SuperscalarProgram; class JitCompilerX86; class Instruction; diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index 7ec3d64c3f..bede273671 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -507,4 +507,21 @@ extern "C" { machine->getFinalResult(output, RANDOMX_HASH_SIZE); } + void randomx_calculate_hash_first(randomx_vm* machine, const void* input, size_t inputSize) { + blake2b(machine->tempHash, sizeof(machine->tempHash), input, inputSize, nullptr, 0); + machine->initScratchpad(machine->tempHash); + } + + void randomx_calculate_hash_next(randomx_vm* machine, const void* nextInput, size_t nextInputSize, void* output) { + machine->resetRoundingMode(); + for (uint32_t chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) { + machine->run(machine->tempHash); + blake2b(machine->tempHash, sizeof(machine->tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0); + } + machine->run(machine->tempHash); + + // Finish current hash and fill the scratchpad for the next hash at the same time + blake2b(machine->tempHash, sizeof(machine->tempHash), nextInput, nextInputSize, nullptr, 0); + machine->hashAndFill(output, RANDOMX_HASH_SIZE, machine->tempHash); + } } diff --git a/src/crypto/randomx/randomx.h b/src/crypto/randomx/randomx.h index 11427c3e5d..ce266ec4dd 100644 --- a/src/crypto/randomx/randomx.h +++ b/src/crypto/randomx/randomx.h @@ -30,7 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define RANDOMX_H #include - +#include #define RANDOMX_HASH_SIZE 32 #define RANDOMX_DATASET_ITEM_SIZE 64 @@ -238,6 +238,23 @@ RANDOMX_EXPORT void randomx_destroy_vm(randomx_vm *machine); */ RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output); +/** + * Paired functions used to calculate multiple RandomX hashes more efficiently. + * randomx_calculate_hash_first is called for the first input value. + * randomx_calculate_hash_next will output the hash value of the previous input. + * + * @param machine is a pointer to a randomx_vm structure. Must not be NULL. + * @param tempHash an array of 8 64-bit values used to store intermediate data between calls to randomx_calculate_hash_first and randomx_calculate_hash_next. + * @param input is a pointer to memory to be hashed. Must not be NULL. + * @param inputSize is the number of bytes to be hashed. + * @param nextInput is a pointer to memory to be hashed for the next hash. Must not be NULL. + * @param nextInputSize is the number of bytes to be hashed for the next hash. + * @param output is a pointer to memory where the hash will be stored. Must not + * be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing. +*/ +RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, const void* input, size_t inputSize); +RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, const void* nextInput, size_t nextInputSize, void* output); + #if defined(__cplusplus) } #endif diff --git a/src/crypto/randomx/virtual_machine.cpp b/src/crypto/randomx/virtual_machine.cpp index a2e156a5f2..bee39e20c7 100644 --- a/src/crypto/randomx/virtual_machine.cpp +++ b/src/crypto/randomx/virtual_machine.cpp @@ -120,7 +120,13 @@ namespace randomx { blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0); } - template + template + void VmBase::hashAndFill(void* out, size_t outSize, uint64_t *fill_state) { + hashAndFillAes1Rx4((void*) getScratchpad(), ScratchpadSize, ®.a, fill_state); + blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0); + } + + template void VmBase::initScratchpad(void* seed) { fillAes1Rx4(seed, ScratchpadSize, scratchpad); } diff --git a/src/crypto/randomx/virtual_machine.hpp b/src/crypto/randomx/virtual_machine.hpp index 9881cd7c7e..986216a56c 100644 --- a/src/crypto/randomx/virtual_machine.hpp +++ b/src/crypto/randomx/virtual_machine.hpp @@ -38,6 +38,7 @@ class randomx_vm { virtual ~randomx_vm() = 0; virtual void allocate() = 0; virtual void getFinalResult(void* out, size_t outSize) = 0; + virtual void hashAndFill(void* out, size_t outSize, uint64_t *fill_state) = 0; virtual void setDataset(randomx_dataset* dataset) { } virtual void setCache(randomx_cache* cache) { } virtual void initScratchpad(void* seed) = 0; @@ -67,6 +68,7 @@ class randomx_vm { uint64_t datasetOffset; public: std::string cacheKey; + alignas(16) uint64_t tempHash[8]; //8 64-bit values used to store intermediate data }; namespace randomx { @@ -78,6 +80,7 @@ namespace randomx { void allocate() override; void initScratchpad(void* seed) override; void getFinalResult(void* out, size_t outSize) override; + void hashAndFill(void* out, size_t outSize, uint64_t *fill_state) override; protected: void generateProgram(void* seed); };