mirror of https://github.com/monero-project/monero.git
synced 2025-01-01 06:56:17 -05:00
e745c1e38d
The basic approach is to delegate all sensitive data (master key, secret ephemeral key, key derivation, ...) and the related operations to the device. Since the device has little memory, it does not keep the values itself (except for the view/spend keys): once computed, they are encrypted (with AES or equivalent) and returned to monero-wallet-cli. When they need to be manipulated by the device, they are decrypted on receipt. Moreover, using the client to store the values in encrypted form limits the modifications to the client code: those values are transferred from one C structure to another as before. The code modification has been done with the wish to remain open to any other hardware wallet. To achieve that, a C++ class hw::Device has been introduced. Two initial implementations are provided: the "default", which remaps all calls to the initial Monero code, and the "Ledger", which delegates all calls to a Ledger device.
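For illustration only, a minimal C sketch of that delegation pattern (the real hw::Device abstraction is a C++ class elsewhere in the Monero tree; the names and signatures below are hypothetical, not the actual API):

/* Hypothetical sketch of the delegation idea described above: a table of
 * function pointers, with one implementation per backend. */
#include <stddef.h>
#include <stdint.h>

typedef struct hw_device {
    /* derive an ephemeral secret key; on a real device the output would be
     * an encrypted blob rather than the plain key. Returns 0 on success. */
    int (*derive_secret_key)(struct hw_device *self,
                             const uint8_t derivation[32],
                             size_t output_index,
                             uint8_t key_out[32]);
    /* ... one entry per sensitive operation (key derivation, signing, ...) */
} hw_device;

/* "default" backend: remaps each call to the existing in-process code */
extern hw_device device_default;
/* "Ledger" backend: forwards each call to the connected hardware wallet */
extern hw_device device_ledger;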
1291 lines
39 KiB
C
// Copyright (c) 2014-2018, The Monero Project
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without modification, are
// permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this list of
//    conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice, this list
//    of conditions and the following disclaimer in the documentation and/or other
//    materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its contributors may be
//    used to endorse or promote products derived from this software without specific
//    prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
// THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Parts of this file are originally copyright (c) 2012-2013 The Cryptonote developers

#include <assert.h>
#include <stdbool.h>  /* for the bool parameter of cn_slow_hash_pre */
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>   /* getenv, malloc, free */
#include <string.h>

#include "common/int-util.h"
#include "hash-ops.h"
#include "oaes_lib.h"

#define MEMORY (1 << 21) // 2MB scratchpad
#define ITER (1 << 20)
#define AES_BLOCK_SIZE 16
#define AES_KEY_SIZE 32
#define INIT_SIZE_BLK 8
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)

extern int aesb_single_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);

#if !defined NO_AES && (defined(__x86_64__) || (defined(_MSC_VER) && defined(_WIN64)))
// Optimised code below, uses x86-specific intrinsics, SSE2, AES-NI.
// The fallback to more portable code is down at the bottom.

#include <emmintrin.h>

#if defined(_MSC_VER)
#include <intrin.h>
#include <windows.h>
#define STATIC
#define INLINE __inline
#if !defined(RDATA_ALIGN16)
#define RDATA_ALIGN16 __declspec(align(16))
#endif
#elif defined(__MINGW32__)
#include <intrin.h>
#include <windows.h>
#define STATIC static
#define INLINE inline
#if !defined(RDATA_ALIGN16)
#define RDATA_ALIGN16 __attribute__ ((aligned(16)))
#endif
#else
#include <wmmintrin.h>
#include <sys/mman.h>
#define STATIC static
#define INLINE inline
#if !defined(RDATA_ALIGN16)
#define RDATA_ALIGN16 __attribute__ ((aligned(16)))
#endif
#endif

#if defined(__INTEL_COMPILER)
#define ASM __asm__
#elif !defined(_MSC_VER)
#define ASM __asm__
#else
#define ASM __asm
#endif

#define TOTALBLOCKS (MEMORY / AES_BLOCK_SIZE)

#define U64(x) ((uint64_t *) (x))
#define R128(x) ((__m128i *) (x))

#define state_index(x) (((*((uint64_t *)x) >> 4) & (TOTALBLOCKS - 1)) << 4)
#if defined(_MSC_VER)
#if !defined(_WIN64)
#define __mul() lo = mul128(c[0], b[0], &hi);
#else
#define __mul() lo = _umul128(c[0], b[0], &hi);
#endif
#else
#if defined(__x86_64__)
#define __mul() ASM("mulq %3\n\t" : "=d"(hi), "=a"(lo) : "%a" (c[0]), "rm" (b[0]) : "cc");
#else
#define __mul() lo = mul128(c[0], b[0], &hi);
#endif
#endif

#define pre_aes() \
  j = state_index(a); \
  _c = _mm_load_si128(R128(&hp_state[j])); \
  _a = _mm_load_si128(R128(a)); \

/*
 * An SSE-optimized implementation of the second half of CryptoNight step 3.
 * After using AES to mix a scratchpad value into _c (done by the caller),
 * this macro xors it with _b and stores the result back to the same index (j) that it
 * loaded the scratchpad value from. It then performs a second random memory
 * read/write from the scratchpad, but this time mixes the values using a 64
 * bit multiply.
 * This code is based upon an optimized implementation by dga.
 */
#define post_aes() \
  _mm_store_si128(R128(c), _c); \
  _b = _mm_xor_si128(_b, _c); \
  _mm_store_si128(R128(&hp_state[j]), _b); \
  j = state_index(c); \
  p = U64(&hp_state[j]); \
  b[0] = p[0]; b[1] = p[1]; \
  __mul(); \
  a[0] += hi; a[1] += lo; \
  p = U64(&hp_state[j]); \
  p[0] = a[0]; p[1] = a[1]; \
  a[0] ^= b[0]; a[1] ^= b[1]; \
  _b = _c; \

#if defined(_MSC_VER)
#define THREADV __declspec(thread)
#else
#define THREADV __thread
#endif

#pragma pack(push, 1)
union cn_slow_hash_state
{
  union hash_state hs;
  struct
  {
    uint8_t k[64];
    uint8_t init[INIT_SIZE_BYTE];
  };
};
#pragma pack(pop)

THREADV uint8_t *hp_state = NULL;
THREADV int hp_allocated = 0;

#if defined(_MSC_VER)
#define cpuid(info,x) __cpuidex(info,x,0)
#else
void cpuid(int CPUInfo[4], int InfoType)
{
  ASM __volatile__
  (
    "cpuid":
    "=a" (CPUInfo[0]),
    "=b" (CPUInfo[1]),
    "=c" (CPUInfo[2]),
    "=d" (CPUInfo[3]) :
    "a" (InfoType), "c" (0)
  );
}
#endif

/**
 * @brief a = (a xor b), where a and b point to 128 bit values
 */

STATIC INLINE void xor_blocks(uint8_t *a, const uint8_t *b)
{
  U64(a)[0] ^= U64(b)[0];
  U64(a)[1] ^= U64(b)[1];
}

/**
 * @brief checks whether software AES has been requested via the
 *        MONERO_USE_SOFTWARE_AES environment variable
 * @return true if software AES is forced, false otherwise
 */

STATIC INLINE int force_software_aes(void)
{
  static int use = -1;

  if (use != -1)
    return use;

  const char *env = getenv("MONERO_USE_SOFTWARE_AES");
  if (!env) {
    use = 0;
  }
  else if (!strcmp(env, "0") || !strcmp(env, "no")) {
    use = 0;
  }
  else {
    use = 1;
  }
  return use;
}

/**
 * @brief uses cpuid to determine if the CPU supports the AES instructions
 * @return true if the CPU supports AES, false otherwise
 */

STATIC INLINE int check_aes_hw(void)
{
  int cpuid_results[4];
  static int supported = -1;

  if(supported >= 0)
    return supported;

  cpuid(cpuid_results,1);
  // ECX bit 25 of CPUID leaf 1 indicates AES-NI support
  return supported = cpuid_results[2] & (1 << 25);
}

STATIC INLINE void aes_256_assist1(__m128i* t1, __m128i * t2)
{
  __m128i t4;
  *t2 = _mm_shuffle_epi32(*t2, 0xff);
  t4 = _mm_slli_si128(*t1, 0x04);
  *t1 = _mm_xor_si128(*t1, t4);
  t4 = _mm_slli_si128(t4, 0x04);
  *t1 = _mm_xor_si128(*t1, t4);
  t4 = _mm_slli_si128(t4, 0x04);
  *t1 = _mm_xor_si128(*t1, t4);
  *t1 = _mm_xor_si128(*t1, *t2);
}

STATIC INLINE void aes_256_assist2(__m128i* t1, __m128i * t3)
{
  __m128i t2, t4;
  t4 = _mm_aeskeygenassist_si128(*t1, 0x00);
  t2 = _mm_shuffle_epi32(t4, 0xaa);
  t4 = _mm_slli_si128(*t3, 0x04);
  *t3 = _mm_xor_si128(*t3, t4);
  t4 = _mm_slli_si128(t4, 0x04);
  *t3 = _mm_xor_si128(*t3, t4);
  t4 = _mm_slli_si128(t4, 0x04);
  *t3 = _mm_xor_si128(*t3, t4);
  *t3 = _mm_xor_si128(*t3, t2);
}

/**
 * @brief expands 'key' into a form it can be used for AES encryption.
 *
 * This is an SSE-optimized implementation of AES key schedule generation. It
 * expands the key into multiple round keys, each of which is used in one round
 * of the AES encryption used to fill (and later, extract randomness from)
 * the large 2MB buffer. Note that CryptoNight does not use a completely
 * standard AES encryption for its buffer expansion, so do not copy this
 * function outside of Monero without caution! This version uses the hardware
 * AESKEYGENASSIST instruction to speed key generation, and thus requires
 * CPU AES support.
 * For more information about these functions, see page 19 of Intel's AES instructions
 * white paper:
 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/aes-instructions-set-white-paper.pdf
 *
 * @param key the input 256 bit key
 * @param expandedKey An output buffer to hold the generated key schedule
 */

STATIC INLINE void aes_expand_key(const uint8_t *key, uint8_t *expandedKey)
{
  __m128i *ek = R128(expandedKey);
  __m128i t1, t2, t3;

  t1 = _mm_loadu_si128(R128(key));
  t3 = _mm_loadu_si128(R128(key + 16));

  ek[0] = t1;
  ek[1] = t3;

  t2 = _mm_aeskeygenassist_si128(t3, 0x01);
  aes_256_assist1(&t1, &t2);
  ek[2] = t1;
  aes_256_assist2(&t1, &t3);
  ek[3] = t3;

  t2 = _mm_aeskeygenassist_si128(t3, 0x02);
  aes_256_assist1(&t1, &t2);
  ek[4] = t1;
  aes_256_assist2(&t1, &t3);
  ek[5] = t3;

  t2 = _mm_aeskeygenassist_si128(t3, 0x04);
  aes_256_assist1(&t1, &t2);
  ek[6] = t1;
  aes_256_assist2(&t1, &t3);
  ek[7] = t3;

  t2 = _mm_aeskeygenassist_si128(t3, 0x08);
  aes_256_assist1(&t1, &t2);
  ek[8] = t1;
  aes_256_assist2(&t1, &t3);
  ek[9] = t3;

  t2 = _mm_aeskeygenassist_si128(t3, 0x10);
  aes_256_assist1(&t1, &t2);
  ek[10] = t1;
}

/**
 * @brief a "pseudo" round of AES (similar to but slightly different from normal AES encryption)
 *
 * To fill its 2MB scratch buffer, CryptoNight uses a nonstandard implementation
 * of AES encryption: It applies 10 rounds of the basic AES encryption operation
 * to an input 128 bit chunk of data <in>. Unlike normal AES, however, this is
 * all it does; it does not perform the initial AddRoundKey step (this is done
 * in subsequent steps by aesenc_si128), and it does not use the simpler final round.
 * Hence, this is a "pseudo" round - though the function actually implements 10 rounds together.
 *
 * Note that unlike aesb_pseudo_round, this function works on multiple data chunks.
 *
 * @param in a pointer to nblocks * 128 bits of data to be encrypted
 * @param out a pointer to an nblocks * 128 bit buffer where the output will be stored
 * @param expandedKey the expanded AES key
 * @param nblocks the number of 128 bit blocks of data to be encrypted
 */

STATIC INLINE void aes_pseudo_round(const uint8_t *in, uint8_t *out,
                                    const uint8_t *expandedKey, int nblocks)
{
  __m128i *k = R128(expandedKey);
  __m128i d;
  int i;

  for(i = 0; i < nblocks; i++)
  {
    d = _mm_loadu_si128(R128(in + i * AES_BLOCK_SIZE));
    d = _mm_aesenc_si128(d, *R128(&k[0]));
    d = _mm_aesenc_si128(d, *R128(&k[1]));
    d = _mm_aesenc_si128(d, *R128(&k[2]));
    d = _mm_aesenc_si128(d, *R128(&k[3]));
    d = _mm_aesenc_si128(d, *R128(&k[4]));
    d = _mm_aesenc_si128(d, *R128(&k[5]));
    d = _mm_aesenc_si128(d, *R128(&k[6]));
    d = _mm_aesenc_si128(d, *R128(&k[7]));
    d = _mm_aesenc_si128(d, *R128(&k[8]));
    d = _mm_aesenc_si128(d, *R128(&k[9]));
    _mm_storeu_si128((R128(out + i * AES_BLOCK_SIZE)), d);
  }
}

/**
 * @brief aes_pseudo_round that loads data from *in and xors it with *xor first
 *
 * This function performs the same operations as aes_pseudo_round, but before
 * performing the encryption of each 128 bit block from <in>, it xors
 * it with the corresponding block from <xor>.
 *
 * @param in a pointer to nblocks * 128 bits of data to be encrypted
 * @param out a pointer to an nblocks * 128 bit buffer where the output will be stored
 * @param expandedKey the expanded AES key
 * @param xor a pointer to an nblocks * 128 bit buffer that is xored into in before encryption (in is left unmodified)
 * @param nblocks the number of 128 bit blocks of data to be encrypted
 */

STATIC INLINE void aes_pseudo_round_xor(const uint8_t *in, uint8_t *out,
                                        const uint8_t *expandedKey, const uint8_t *xor, int nblocks)
{
  __m128i *k = R128(expandedKey);
  __m128i *x = R128(xor);
  __m128i d;
  int i;

  for(i = 0; i < nblocks; i++)
  {
    d = _mm_loadu_si128(R128(in + i * AES_BLOCK_SIZE));
    d = _mm_xor_si128(d, *R128(x++));
    d = _mm_aesenc_si128(d, *R128(&k[0]));
    d = _mm_aesenc_si128(d, *R128(&k[1]));
    d = _mm_aesenc_si128(d, *R128(&k[2]));
    d = _mm_aesenc_si128(d, *R128(&k[3]));
    d = _mm_aesenc_si128(d, *R128(&k[4]));
    d = _mm_aesenc_si128(d, *R128(&k[5]));
    d = _mm_aesenc_si128(d, *R128(&k[6]));
    d = _mm_aesenc_si128(d, *R128(&k[7]));
    d = _mm_aesenc_si128(d, *R128(&k[8]));
    d = _mm_aesenc_si128(d, *R128(&k[9]));
    _mm_storeu_si128((R128(out + i * AES_BLOCK_SIZE)), d);
  }
}

#if defined(_MSC_VER) || defined(__MINGW32__)
BOOL SetLockPagesPrivilege(HANDLE hProcess, BOOL bEnable)
{
  struct
  {
    DWORD count;
    LUID_AND_ATTRIBUTES privilege[1];
  } info;

  HANDLE token;
  if(!OpenProcessToken(hProcess, TOKEN_ADJUST_PRIVILEGES, &token))
    return FALSE;

  info.count = 1;
  info.privilege[0].Attributes = bEnable ? SE_PRIVILEGE_ENABLED : 0;

  if(!LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &(info.privilege[0].Luid)))
    return FALSE;

  if(!AdjustTokenPrivileges(token, FALSE, (PTOKEN_PRIVILEGES) &info, 0, NULL, NULL))
    return FALSE;

  if (GetLastError() != ERROR_SUCCESS)
    return FALSE;

  CloseHandle(token);

  return TRUE;
}
#endif

/**
 * @brief allocate the 2MB scratch buffer using OS support for huge pages, if available
 *
 * This function tries to allocate the 2MB scratch buffer using a single
 * 2MB "huge page" (instead of the usual 4KB page sizes) to reduce TLB misses
 * during the random accesses to the scratch buffer. This is one of the
 * important speed optimizations needed to make CryptoNight faster.
 *
 * No parameters. Updates a thread-local pointer, hp_state, to point to
 * the allocated buffer.
 */

void slow_hash_allocate_state(void)
{
  if(hp_state != NULL)
    return;

#if defined(_MSC_VER) || defined(__MINGW32__)
  SetLockPagesPrivilege(GetCurrentProcess(), TRUE);
  hp_state = (uint8_t *) VirtualAlloc(hp_state, MEMORY, MEM_LARGE_PAGES |
                                      MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
#else
#if defined(__APPLE__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \
    defined(__DragonFly__)
  hp_state = mmap(0, MEMORY, PROT_READ | PROT_WRITE,
                  MAP_PRIVATE | MAP_ANON, 0, 0);
#else
  hp_state = mmap(0, MEMORY, PROT_READ | PROT_WRITE,
                  MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, 0, 0);
#endif
  if(hp_state == MAP_FAILED)
    hp_state = NULL;
#endif
  hp_allocated = 1;
  if(hp_state == NULL)
  {
    hp_allocated = 0;
    hp_state = (uint8_t *) malloc(MEMORY);
  }
}

/**
 * @brief frees the state allocated by slow_hash_allocate_state
 */

void slow_hash_free_state(void)
{
  if(hp_state == NULL)
    return;

  if(!hp_allocated)
    free(hp_state);
  else
  {
#if defined(_MSC_VER) || defined(__MINGW32__)
    VirtualFree(hp_state, MEMORY, MEM_RELEASE);
#else
    munmap(hp_state, MEMORY);
#endif
  }

  hp_state = NULL;
  hp_allocated = 0;
}

/**
 * @brief the hash function implementing CryptoNight, used for the Monero proof-of-work
 *
 * Computes the hash of <data> (which consists of <length> bytes), returning the
 * hash in <hash>. The CryptoNight hash operates by first using Keccak 1600,
 * the 1600 bit variant of the Keccak hash used in SHA-3, to create a 200 byte
 * buffer of pseudorandom data by hashing the supplied data. It then uses this
 * random data to fill a large 2MB buffer with pseudorandom data by iteratively
 * encrypting it using 10 rounds of AES per entry. After this initialization,
 * it executes 524,288 rounds of mixing through the random 2MB buffer using
 * AES (typically provided in hardware on modern CPUs) and a 64 bit multiply.
 * Finally, it re-mixes this large buffer back into
 * the 200 byte "text" buffer, and then hashes this buffer using one of four
 * pseudorandomly selected hash functions (Blake, Groestl, JH, or Skein)
 * to populate the output.
 *
 * The 2MB buffer and choice of functions for mixing are designed to make the
 * algorithm "CPU-friendly" (and thus, reduce the advantage of GPU, FPGA,
 * or ASIC-based implementations): the functions used are fast on modern
 * CPUs, and the 2MB size matches the typical amount of L3 cache available per
 * core on 2013-era CPUs. When available, this implementation will use hardware
 * AES support on x86 CPUs.
 *
 * A diagram of the inner loop of this function can be found at
 * http://www.cs.cmu.edu/~dga/crypto/xmr/cryptonight.png
 *
 * @param data the data to hash
 * @param length the length in bytes of the data
 * @param hash a pointer to a buffer in which the final 256 bit hash will be stored
 */
void cn_slow_hash(const void *data, size_t length, char *hash) {
  cn_slow_hash_pre(data,length,hash,false);
}

void cn_slow_hash_pre(const void *data, size_t length, char *hash, bool prehashed)
{
  RDATA_ALIGN16 uint8_t expandedKey[240];  /* These buffers are aligned to use later with SSE functions */

  uint8_t text[INIT_SIZE_BYTE];
  RDATA_ALIGN16 uint64_t a[2];
  RDATA_ALIGN16 uint64_t b[2];
  RDATA_ALIGN16 uint64_t c[2];
  union cn_slow_hash_state state;
  __m128i _a, _b, _c;
  uint64_t hi, lo;

  size_t i, j;
  uint64_t *p = NULL;
  oaes_ctx *aes_ctx = NULL;
  int useAes = !force_software_aes() && check_aes_hw();

  static void (*const extra_hashes[4])(const void *, size_t, char *) =
  {
    hash_extra_blake, hash_extra_groestl, hash_extra_jh, hash_extra_skein
  };

  // this isn't supposed to happen, but guard against it for now.
  if(hp_state == NULL)
    slow_hash_allocate_state();

  /* CryptoNight Step 1: Use Keccak1600 to initialize the 'state' (and 'text') buffers from the data. */
  if (prehashed) {
    memcpy(&state.hs, data, length);
  } else {
    hash_process(&state.hs, data, length);
  }
  memcpy(text, state.init, INIT_SIZE_BYTE);

  /* CryptoNight Step 2: Iteratively encrypt the results from Keccak to fill
   * the 2MB large random access buffer.
   */

  if(useAes)
  {
    aes_expand_key(state.hs.b, expandedKey);
    for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
    {
      aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK);
      memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
    }
  }
  else
  {
    aes_ctx = (oaes_ctx *) oaes_alloc();
    oaes_key_import_data(aes_ctx, state.hs.b, AES_KEY_SIZE);
    for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
    {
      for(j = 0; j < INIT_SIZE_BLK; j++)
        aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);

      memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
    }
  }

  U64(a)[0] = U64(&state.k[0])[0] ^ U64(&state.k[32])[0];
  U64(a)[1] = U64(&state.k[0])[1] ^ U64(&state.k[32])[1];
  U64(b)[0] = U64(&state.k[16])[0] ^ U64(&state.k[48])[0];
  U64(b)[1] = U64(&state.k[16])[1] ^ U64(&state.k[48])[1];

  /* CryptoNight Step 3: Bounce randomly 1,048,576 times (1<<20) through the mixing buffer,
   * using 524,288 iterations of the following mixing function. Each execution
   * performs two reads and writes from the mixing buffer.
   */

  _b = _mm_load_si128(R128(b));
  // Two independent versions, one with AES, one without, to ensure that
  // the useAes test is only performed once, not every iteration.
  if(useAes)
  {
    for(i = 0; i < ITER / 2; i++)
    {
      pre_aes();
      _c = _mm_aesenc_si128(_c, _a);
      post_aes();
    }
  }
  else
  {
    for(i = 0; i < ITER / 2; i++)
    {
      pre_aes();
      aesb_single_round((uint8_t *) &_c, (uint8_t *) &_c, (uint8_t *) &_a);
      post_aes();
    }
  }

  /* CryptoNight Step 4: Sequentially pass through the mixing buffer and use 10 rounds
   * of AES encryption to mix the random data back into the 'text' buffer. 'text'
   * was originally created with the output of Keccak1600. */

  memcpy(text, state.init, INIT_SIZE_BYTE);
  if(useAes)
  {
    aes_expand_key(&state.hs.b[32], expandedKey);
    for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
    {
      // add the xor to the pseudo round
      aes_pseudo_round_xor(text, text, expandedKey, &hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK);
    }
  }
  else
  {
    oaes_key_import_data(aes_ctx, &state.hs.b[32], AES_KEY_SIZE);
    for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
    {
      for(j = 0; j < INIT_SIZE_BLK; j++)
      {
        xor_blocks(&text[j * AES_BLOCK_SIZE], &hp_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
        aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
      }
    }
    oaes_free((OAES_CTX **) &aes_ctx);
  }

  /* CryptoNight Step 5: Apply Keccak to the state again, and then
   * use the resulting data to select which of four finalizer
   * hash functions to apply to the data (Blake, Groestl, JH, or Skein).
   * Use this hash to squeeze the state array down
   * to the final 256 bit hash output.
   */

  memcpy(state.init, text, INIT_SIZE_BYTE);
  hash_permutation(&state.hs);
  extra_hashes[state.hs.b[0] & 3](&state, 200, hash);
}

#elif !defined NO_AES && (defined(__arm__) || defined(__aarch64__))
void slow_hash_allocate_state(void)
{
  // Do nothing, this is just to maintain compatibility with the upgraded slow-hash.c
  return;
}

void slow_hash_free_state(void)
{
  // As above
  return;
}

#if defined(__GNUC__)
#define RDATA_ALIGN16 __attribute__ ((aligned(16)))
#define STATIC static
#define INLINE inline
#else
#define RDATA_ALIGN16
#define STATIC static
#define INLINE
#endif

#define U64(x) ((uint64_t *) (x))

#pragma pack(push, 1)
union cn_slow_hash_state
{
  union hash_state hs;
  struct
  {
    uint8_t k[64];
    uint8_t init[INIT_SIZE_BYTE];
  };
};
#pragma pack(pop)

#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)

/* ARMv8-A optimized with NEON and AES instructions.
 * Copied from the x86-64 AES-NI implementation. It has much the same
 * characteristics as x86-64: there's no 64x64=128 multiplier for vectors,
 * and moving between vector and regular registers stalls the pipeline.
 */
#include <arm_neon.h>

#define TOTALBLOCKS (MEMORY / AES_BLOCK_SIZE)

#define state_index(x) (((*((uint64_t *)x) >> 4) & (TOTALBLOCKS - 1)) << 4)
#define __mul() __asm__("mul %0, %1, %2\n\t" : "=r"(lo) : "r"(c[0]), "r"(b[0]) ); \
  __asm__("umulh %0, %1, %2\n\t" : "=r"(hi) : "r"(c[0]), "r"(b[0]) );

#define pre_aes() \
  j = state_index(a); \
  _c = vld1q_u8(&hp_state[j]); \
  _a = vld1q_u8((const uint8_t *)a); \

#define post_aes() \
  vst1q_u8((uint8_t *)c, _c); \
  _b = veorq_u8(_b, _c); \
  vst1q_u8(&hp_state[j], _b); \
  j = state_index(c); \
  p = U64(&hp_state[j]); \
  b[0] = p[0]; b[1] = p[1]; \
  __mul(); \
  a[0] += hi; a[1] += lo; \
  p = U64(&hp_state[j]); \
  p[0] = a[0]; p[1] = a[1]; \
  a[0] ^= b[0]; a[1] ^= b[1]; \
  _b = _c; \

/* Note: this was based on a standard 256bit key schedule but
 * it's been shortened since Cryptonight doesn't use the full
 * key schedule. Don't try to use this for vanilla AES.
 */
static void aes_expand_key(const uint8_t *key, uint8_t *expandedKey) {
  static const int rcon[] = {
    0x01,0x01,0x01,0x01,
    0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d, // rotate-n-splat
    0x1b,0x1b,0x1b,0x1b };
  __asm__(
    "  eor  v0.16b,v0.16b,v0.16b\n"
    "  ld1  {v3.16b},[%0],#16\n"
    "  ld1  {v1.4s,v2.4s},[%2],#32\n"
    "  ld1  {v4.16b},[%0]\n"
    "  mov  w2,#5\n"
    "  st1  {v3.4s},[%1],#16\n"
    "\n"
    "1:\n"
    "  tbl  v6.16b,{v4.16b},v2.16b\n"
    "  ext  v5.16b,v0.16b,v3.16b,#12\n"
    "  st1  {v4.4s},[%1],#16\n"
    "  aese v6.16b,v0.16b\n"
    "  subs w2,w2,#1\n"
    "\n"
    "  eor  v3.16b,v3.16b,v5.16b\n"
    "  ext  v5.16b,v0.16b,v5.16b,#12\n"
    "  eor  v3.16b,v3.16b,v5.16b\n"
    "  ext  v5.16b,v0.16b,v5.16b,#12\n"
    "  eor  v6.16b,v6.16b,v1.16b\n"
    "  eor  v3.16b,v3.16b,v5.16b\n"
    "  shl  v1.16b,v1.16b,#1\n"
    "  eor  v3.16b,v3.16b,v6.16b\n"
    "  st1  {v3.4s},[%1],#16\n"
    "  b.eq 2f\n"
    "\n"
    "  dup  v6.4s,v3.s[3] // just splat\n"
    "  ext  v5.16b,v0.16b,v4.16b,#12\n"
    "  aese v6.16b,v0.16b\n"
    "\n"
    "  eor  v4.16b,v4.16b,v5.16b\n"
    "  ext  v5.16b,v0.16b,v5.16b,#12\n"
    "  eor  v4.16b,v4.16b,v5.16b\n"
    "  ext  v5.16b,v0.16b,v5.16b,#12\n"
    "  eor  v4.16b,v4.16b,v5.16b\n"
    "\n"
    "  eor  v4.16b,v4.16b,v6.16b\n"
    "  b    1b\n"
    "\n"
    "2:\n" : : "r"(key), "r"(expandedKey), "r"(rcon));
}

/* An ordinary AES round is a sequence of SubBytes, ShiftRows, MixColumns, AddRoundKey. There
 * is also an InitialRound which consists solely of AddRoundKey. The ARM instructions slice
 * this sequence differently; the aese instruction performs AddRoundKey, SubBytes, ShiftRows.
 * The aesmc instruction does the MixColumns. Since the aese instruction moves the AddRoundKey
 * up front, and Cryptonight's hash skips the InitialRound step, we have to kludge it here by
 * feeding in a vector of zeros for our first step. Also we have to do our own Xor explicitly
 * at the last step, to provide the AddRoundKey that the ARM instructions omit.
 */
STATIC INLINE void aes_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey, int nblocks)
{
  const uint8x16_t *k = (const uint8x16_t *)expandedKey, zero = {0};
  int i;

  for (i=0; i<nblocks; i++)
  {
    uint8x16_t tmp = vld1q_u8(in + i * AES_BLOCK_SIZE);
    tmp = vaeseq_u8(tmp, zero);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[0]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[1]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[2]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[3]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[4]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[5]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[6]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[7]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[8]);
    tmp = vaesmcq_u8(tmp);
    tmp = veorq_u8(tmp, k[9]);
    vst1q_u8(out + i * AES_BLOCK_SIZE, tmp);
  }
}

STATIC INLINE void aes_pseudo_round_xor(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey, const uint8_t *xor, int nblocks)
{
  const uint8x16_t *k = (const uint8x16_t *)expandedKey;
  const uint8x16_t *x = (const uint8x16_t *)xor;
  int i;

  for (i=0; i<nblocks; i++)
  {
    uint8x16_t tmp = vld1q_u8(in + i * AES_BLOCK_SIZE);
    tmp = vaeseq_u8(tmp, x[i]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[0]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[1]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[2]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[3]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[4]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[5]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[6]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[7]);
    tmp = vaesmcq_u8(tmp);
    tmp = vaeseq_u8(tmp, k[8]);
    tmp = vaesmcq_u8(tmp);
    tmp = veorq_u8(tmp, k[9]);
    vst1q_u8(out + i * AES_BLOCK_SIZE, tmp);
  }
}

void cn_slow_hash(const void *data, size_t length, char *hash)
{
  RDATA_ALIGN16 uint8_t expandedKey[240];
  RDATA_ALIGN16 uint8_t hp_state[MEMORY];

  uint8_t text[INIT_SIZE_BYTE];
  RDATA_ALIGN16 uint64_t a[2];
  RDATA_ALIGN16 uint64_t b[2];
  RDATA_ALIGN16 uint64_t c[2];
  union cn_slow_hash_state state;
  uint8x16_t _a, _b, _c, zero = {0};
  uint64_t hi, lo;

  size_t i, j;
  uint64_t *p = NULL;

  static void (*const extra_hashes[4])(const void *, size_t, char *) =
  {
    hash_extra_blake, hash_extra_groestl, hash_extra_jh, hash_extra_skein
  };

  /* CryptoNight Step 1: Use Keccak1600 to initialize the 'state' (and 'text') buffers from the data. */

  hash_process(&state.hs, data, length);
  memcpy(text, state.init, INIT_SIZE_BYTE);

  /* CryptoNight Step 2: Iteratively encrypt the results from Keccak to fill
   * the 2MB large random access buffer.
   */

  aes_expand_key(state.hs.b, expandedKey);
  for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
  {
    aes_pseudo_round(text, text, expandedKey, INIT_SIZE_BLK);
    memcpy(&hp_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
  }

  U64(a)[0] = U64(&state.k[0])[0] ^ U64(&state.k[32])[0];
  U64(a)[1] = U64(&state.k[0])[1] ^ U64(&state.k[32])[1];
  U64(b)[0] = U64(&state.k[16])[0] ^ U64(&state.k[48])[0];
  U64(b)[1] = U64(&state.k[16])[1] ^ U64(&state.k[48])[1];

  /* CryptoNight Step 3: Bounce randomly 1,048,576 times (1<<20) through the mixing buffer,
   * using 524,288 iterations of the following mixing function. Each execution
   * performs two reads and writes from the mixing buffer.
   */

  _b = vld1q_u8((const uint8_t *)b);

  for(i = 0; i < ITER / 2; i++)
  {
    pre_aes();
    _c = vaeseq_u8(_c, zero);
    _c = vaesmcq_u8(_c);
    _c = veorq_u8(_c, _a);
    post_aes();
  }

  /* CryptoNight Step 4: Sequentially pass through the mixing buffer and use 10 rounds
   * of AES encryption to mix the random data back into the 'text' buffer. 'text'
   * was originally created with the output of Keccak1600. */

  memcpy(text, state.init, INIT_SIZE_BYTE);

  aes_expand_key(&state.hs.b[32], expandedKey);
  for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
  {
    // add the xor to the pseudo round
    aes_pseudo_round_xor(text, text, expandedKey, &hp_state[i * INIT_SIZE_BYTE], INIT_SIZE_BLK);
  }

  /* CryptoNight Step 5: Apply Keccak to the state again, and then
   * use the resulting data to select which of four finalizer
   * hash functions to apply to the data (Blake, Groestl, JH, or Skein).
   * Use this hash to squeeze the state array down
   * to the final 256 bit hash output.
   */

  memcpy(state.init, text, INIT_SIZE_BYTE);
  hash_permutation(&state.hs);
  extra_hashes[state.hs.b[0] & 3](&state, 200, hash);
}
#else /* aarch64 && crypto */

// ND: Some minor optimizations for ARMv7 (Raspberry Pi 2), effect seems to be ~40-50% faster.
// Needs more work.

#ifdef NO_OPTIMIZED_MULTIPLY_ON_ARM
/* The asm corresponds to this C code */
#define SHORT uint32_t
#define LONG uint64_t

void mul(const uint8_t *ca, const uint8_t *cb, uint8_t *cres) {
  const SHORT *aa = (SHORT *)ca;
  const SHORT *bb = (SHORT *)cb;
  SHORT *res = (SHORT *)cres;
  union {
    SHORT tmp[8];
    LONG ltmp[4];
  } t;
  LONG A = aa[1];
  LONG a = aa[0];
  LONG B = bb[1];
  LONG b = bb[0];

  // Aa * Bb = ab + aB_ + Ab_ + AB__
  t.ltmp[0] = a * b;
  t.ltmp[1] = a * B;
  t.ltmp[2] = A * b;
  t.ltmp[3] = A * B;

  res[2] = t.tmp[0];
  t.ltmp[1] += t.tmp[1];
  t.ltmp[1] += t.tmp[4];
  t.ltmp[3] += t.tmp[3];
  t.ltmp[3] += t.tmp[5];
  res[3] = t.tmp[2];
  res[0] = t.tmp[6];
  res[1] = t.tmp[7];
}
#else // !NO_OPTIMIZED_MULTIPLY_ON_ARM

#ifdef __aarch64__ /* ARM64, no crypto */
#define mul(a, b, c) cn_mul128((const uint64_t *)a, (const uint64_t *)b, (uint64_t *)c)
STATIC void cn_mul128(const uint64_t *a, const uint64_t *b, uint64_t *r)
{
  uint64_t lo, hi;
  __asm__("mul %0, %1, %2\n\t" : "=r"(lo) : "r"(a[0]), "r"(b[0]) );
  __asm__("umulh %0, %1, %2\n\t" : "=r"(hi) : "r"(a[0]), "r"(b[0]) );
  r[0] = hi;
  r[1] = lo;
}
#else /* ARM32 */
#define mul(a, b, c) cn_mul128((const uint32_t *)a, (const uint32_t *)b, (uint32_t *)c)
STATIC void cn_mul128(const uint32_t *aa, const uint32_t *bb, uint32_t *r)
{
  uint32_t t0, t1, t2=0, t3=0;
  __asm__ __volatile__(
    "umull %[t0], %[t1], %[a], %[b]\n\t"
    "str   %[t0], %[ll]\n\t"

    // accumulating with 0 can never overflow/carry
    "eor   %[t0], %[t0]\n\t"
    "umlal %[t1], %[t0], %[a], %[B]\n\t"

    "umlal %[t1], %[t2], %[A], %[b]\n\t"
    "str   %[t1], %[lh]\n\t"

    "umlal %[t0], %[t3], %[A], %[B]\n\t"

    // final add may have a carry
    "adds  %[t0], %[t0], %[t2]\n\t"
    "adc   %[t1], %[t3], #0\n\t"

    "str   %[t0], %[hl]\n\t"
    "str   %[t1], %[hh]\n\t"
    : [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"+r"(t2), [t3]"+r"(t3), [hl]"=m"(r[0]), [hh]"=m"(r[1]), [ll]"=m"(r[2]), [lh]"=m"(r[3])
    : [A]"r"(aa[1]), [a]"r"(aa[0]), [B]"r"(bb[1]), [b]"r"(bb[0])
    : "cc");
}
#endif /* !aarch64 */
#endif // NO_OPTIMIZED_MULTIPLY_ON_ARM

STATIC INLINE void sum_half_blocks(uint8_t* a, const uint8_t* b)
{
  uint64_t a0, a1, b0, b1;
  a0 = U64(a)[0];
  a1 = U64(a)[1];
  b0 = U64(b)[0];
  b1 = U64(b)[1];
  a0 += b0;
  a1 += b1;
  U64(a)[0] = a0;
  U64(a)[1] = a1;
}

STATIC INLINE void swap_blocks(uint8_t *a, uint8_t *b)
{
  uint64_t t[2];
  U64(t)[0] = U64(a)[0];
  U64(t)[1] = U64(a)[1];
  U64(a)[0] = U64(b)[0];
  U64(a)[1] = U64(b)[1];
  U64(b)[0] = U64(t)[0];
  U64(b)[1] = U64(t)[1];
}

STATIC INLINE void xor_blocks(uint8_t* a, const uint8_t* b)
{
  U64(a)[0] ^= U64(b)[0];
  U64(a)[1] ^= U64(b)[1];
}

void cn_slow_hash(const void *data, size_t length, char *hash)
{
  uint8_t text[INIT_SIZE_BYTE];
  uint8_t a[AES_BLOCK_SIZE];
  uint8_t b[AES_BLOCK_SIZE];
  uint8_t d[AES_BLOCK_SIZE];
  uint8_t aes_key[AES_KEY_SIZE];
  RDATA_ALIGN16 uint8_t expandedKey[256];

  union cn_slow_hash_state state;

  size_t i, j;
  uint8_t *p = NULL;
  oaes_ctx *aes_ctx;
  static void (*const extra_hashes[4])(const void *, size_t, char *) =
  {
    hash_extra_blake, hash_extra_groestl, hash_extra_jh, hash_extra_skein
  };

#ifndef FORCE_USE_HEAP
  uint8_t long_state[MEMORY];
#else
  uint8_t *long_state = NULL;
  long_state = (uint8_t *)malloc(MEMORY);
#endif

  hash_process(&state.hs, data, length);
  memcpy(text, state.init, INIT_SIZE_BYTE);

  aes_ctx = (oaes_ctx *) oaes_alloc();
  oaes_key_import_data(aes_ctx, state.hs.b, AES_KEY_SIZE);

  // use aligned data
  memcpy(expandedKey, aes_ctx->key->exp_data, aes_ctx->key->exp_data_len);
  for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
  {
    for(j = 0; j < INIT_SIZE_BLK; j++)
      aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], expandedKey);
    memcpy(&long_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
  }

  U64(a)[0] = U64(&state.k[0])[0] ^ U64(&state.k[32])[0];
  U64(a)[1] = U64(&state.k[0])[1] ^ U64(&state.k[32])[1];
  U64(b)[0] = U64(&state.k[16])[0] ^ U64(&state.k[48])[0];
  U64(b)[1] = U64(&state.k[16])[1] ^ U64(&state.k[48])[1];

  for(i = 0; i < ITER / 2; i++)
  {
    #define MASK ((uint32_t)(((MEMORY / AES_BLOCK_SIZE) - 1) << 4))
    #define state_index(x) ((*(uint32_t *) x) & MASK)

    // Iteration 1
    p = &long_state[state_index(a)];
    aesb_single_round(p, p, a);

    xor_blocks(b, p);
    swap_blocks(b, p);
    swap_blocks(a, b);

    // Iteration 2
    p = &long_state[state_index(a)];

    mul(a, p, d);
    sum_half_blocks(b, d);
    swap_blocks(b, p);
    xor_blocks(b, p);
    swap_blocks(a, b);
  }

  memcpy(text, state.init, INIT_SIZE_BYTE);
  oaes_key_import_data(aes_ctx, &state.hs.b[32], AES_KEY_SIZE);
  memcpy(expandedKey, aes_ctx->key->exp_data, aes_ctx->key->exp_data_len);
  for(i = 0; i < MEMORY / INIT_SIZE_BYTE; i++)
  {
    for(j = 0; j < INIT_SIZE_BLK; j++)
    {
      xor_blocks(&text[j * AES_BLOCK_SIZE], &long_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
      aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], expandedKey);
    }
  }

  oaes_free((OAES_CTX **) &aes_ctx);
  memcpy(state.init, text, INIT_SIZE_BYTE);
  hash_permutation(&state.hs);
  extra_hashes[state.hs.b[0] & 3](&state, 200, hash);
#ifdef FORCE_USE_HEAP
  free(long_state);
#endif
}
#endif /* !aarch64 || !crypto */

#else
// Portable implementation as a fallback

void slow_hash_allocate_state(void)
{
  // Do nothing, this is just to maintain compatibility with the upgraded slow-hash.c
  return;
}

void slow_hash_free_state(void)
{
  // As above
  return;
}

static void (*const extra_hashes[4])(const void *, size_t, char *) = {
  hash_extra_blake, hash_extra_groestl, hash_extra_jh, hash_extra_skein
};

extern int aesb_single_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);
extern int aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *expandedKey);

static size_t e2i(const uint8_t* a, size_t count) { return (*((uint64_t*)a) / AES_BLOCK_SIZE) & (count - 1); }

static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) {
  uint64_t a0, b0;
  uint64_t hi, lo;

  a0 = SWAP64LE(((uint64_t*)a)[0]);
  b0 = SWAP64LE(((uint64_t*)b)[0]);
  lo = mul128(a0, b0, &hi);
  ((uint64_t*)res)[0] = SWAP64LE(hi);
  ((uint64_t*)res)[1] = SWAP64LE(lo);
}

static void sum_half_blocks(uint8_t* a, const uint8_t* b) {
  uint64_t a0, a1, b0, b1;

  a0 = SWAP64LE(((uint64_t*)a)[0]);
  a1 = SWAP64LE(((uint64_t*)a)[1]);
  b0 = SWAP64LE(((uint64_t*)b)[0]);
  b1 = SWAP64LE(((uint64_t*)b)[1]);
  a0 += b0;
  a1 += b1;
  ((uint64_t*)a)[0] = SWAP64LE(a0);
  ((uint64_t*)a)[1] = SWAP64LE(a1);
}

#define U64(x) ((uint64_t *) (x))

static void copy_block(uint8_t* dst, const uint8_t* src) {
  memcpy(dst, src, AES_BLOCK_SIZE);
}

static void swap_blocks(uint8_t *a, uint8_t *b){
  uint64_t t[2];
  U64(t)[0] = U64(a)[0];
  U64(t)[1] = U64(a)[1];
  U64(a)[0] = U64(b)[0];
  U64(a)[1] = U64(b)[1];
  U64(b)[0] = U64(t)[0];
  U64(b)[1] = U64(t)[1];
}

static void xor_blocks(uint8_t* a, const uint8_t* b) {
  size_t i;
  for (i = 0; i < AES_BLOCK_SIZE; i++) {
    a[i] ^= b[i];
  }
}

#pragma pack(push, 1)
union cn_slow_hash_state {
  union hash_state hs;
  struct {
    uint8_t k[64];
    uint8_t init[INIT_SIZE_BYTE];
  };
};
#pragma pack(pop)

void cn_slow_hash(const void *data, size_t length, char *hash) {
  uint8_t long_state[MEMORY];
  union cn_slow_hash_state state;
  uint8_t text[INIT_SIZE_BYTE];
  uint8_t a[AES_BLOCK_SIZE];
  uint8_t b[AES_BLOCK_SIZE];
  uint8_t c[AES_BLOCK_SIZE];
  uint8_t d[AES_BLOCK_SIZE];
  size_t i, j;
  uint8_t aes_key[AES_KEY_SIZE];
  oaes_ctx *aes_ctx;

  hash_process(&state.hs, data, length);
  memcpy(text, state.init, INIT_SIZE_BYTE);
  memcpy(aes_key, state.hs.b, AES_KEY_SIZE);
  aes_ctx = (oaes_ctx *) oaes_alloc();

  oaes_key_import_data(aes_ctx, aes_key, AES_KEY_SIZE);
  for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) {
    for (j = 0; j < INIT_SIZE_BLK; j++) {
      aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
    }
    memcpy(&long_state[i * INIT_SIZE_BYTE], text, INIT_SIZE_BYTE);
  }

  for (i = 0; i < 16; i++) {
    a[i] = state.k[     i] ^ state.k[32 + i];
    b[i] = state.k[16 + i] ^ state.k[48 + i];
  }

  for (i = 0; i < ITER / 2; i++) {
    /* Dependency chain: address -> read value ------+
     * written value <-+ hard function (AES or MUL) <+
     * next address  <-+
     */
    /* Iteration 1 */
    j = e2i(a, MEMORY / AES_BLOCK_SIZE);
    copy_block(c, &long_state[j * AES_BLOCK_SIZE]);
    aesb_single_round(c, c, a);
    xor_blocks(b, c);
    swap_blocks(b, c);
    copy_block(&long_state[j * AES_BLOCK_SIZE], c);
    assert(j == e2i(a, MEMORY / AES_BLOCK_SIZE));
    swap_blocks(a, b);
    /* Iteration 2 */
    j = e2i(a, MEMORY / AES_BLOCK_SIZE);
    copy_block(c, &long_state[j * AES_BLOCK_SIZE]);
    mul(a, c, d);
    sum_half_blocks(b, d);
    swap_blocks(b, c);
    xor_blocks(b, c);
    copy_block(&long_state[j * AES_BLOCK_SIZE], c);
    assert(j == e2i(a, MEMORY / AES_BLOCK_SIZE));
    swap_blocks(a, b);
  }

  memcpy(text, state.init, INIT_SIZE_BYTE);
  oaes_key_import_data(aes_ctx, &state.hs.b[32], AES_KEY_SIZE);
  for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) {
    for (j = 0; j < INIT_SIZE_BLK; j++) {
      xor_blocks(&text[j * AES_BLOCK_SIZE], &long_state[i * INIT_SIZE_BYTE + j * AES_BLOCK_SIZE]);
      aesb_pseudo_round(&text[AES_BLOCK_SIZE * j], &text[AES_BLOCK_SIZE * j], aes_ctx->key->exp_data);
    }
  }
  memcpy(state.init, text, INIT_SIZE_BYTE);
  hash_permutation(&state.hs);
  /*memcpy(hash, &state, 32);*/
  extra_hashes[state.hs.b[0] & 3](&state, 200, hash);
  oaes_free((OAES_CTX **) &aes_ctx);
}

#endif
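For reference, a minimal usage sketch of the API implemented above, assuming hash-ops.h declares cn_slow_hash and the scratchpad helpers (as it does in the Monero tree); illustrative only, not part of the file:

/* Minimal caller sketch: allocate the per-thread 2MB scratchpad, hash a
 * buffer, print the digest, free the scratchpad. */
#include <stdio.h>
#include "hash-ops.h"

int main(void)
{
    const char data[] = "test input";
    char hash[32];                 /* cn_slow_hash emits a 256 bit (32 byte) hash */

    slow_hash_allocate_state();    /* huge-page scratchpad, one per thread */
    cn_slow_hash(data, sizeof(data) - 1, hash);
    slow_hash_free_state();

    for (int i = 0; i < 32; i++)
        printf("%02x", (unsigned char)hash[i]);
    printf("\n");
    return 0;
}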