From f51397b3064da60891b7e982d040b7715bbbf59a Mon Sep 17 00:00:00 2001 From: SChernykh Date: Mon, 4 Feb 2019 17:49:19 +0100 Subject: [PATCH] Cryptonight variant 4 aka CryptonightR It introduces random integer math into the main loop. --- src/crypto/chacha.h | 8 +- src/crypto/hash-ops.h | 2 +- src/crypto/hash.h | 8 +- src/crypto/slow-hash.c | 64 ++- src/crypto/variant4_random_math.h | 441 ++++++++++++++++++ .../cryptonote_format_utils.cpp | 2 +- tests/hash/CMakeLists.txt | 2 +- tests/hash/main.cpp | 29 +- tests/hash/tests-slow-4.txt | 10 + 9 files changed, 543 insertions(+), 23 deletions(-) create mode 100644 src/crypto/variant4_random_math.h create mode 100644 tests/hash/tests-slow-4.txt diff --git a/src/crypto/chacha.h b/src/crypto/chacha.h index 6e85ad0e9..0610f7051 100644 --- a/src/crypto/chacha.h +++ b/src/crypto/chacha.h @@ -73,18 +73,18 @@ namespace crypto { inline void generate_chacha_key(const void *data, size_t size, chacha_key& key, uint64_t kdf_rounds) { static_assert(sizeof(chacha_key) <= sizeof(hash), "Size of hash must be at least that of chacha_key"); epee::mlocked> pwd_hash; - crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 0/*prehashed*/); + crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 0/*prehashed*/, 0/*height*/); for (uint64_t n = 1; n < kdf_rounds; ++n) - crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/); + crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/, 0/*height*/); memcpy(&unwrap(unwrap(key)), pwd_hash.data(), sizeof(key)); } inline void generate_chacha_key_prehashed(const void *data, size_t size, chacha_key& key, uint64_t kdf_rounds) { static_assert(sizeof(chacha_key) <= sizeof(hash), "Size of hash must be at least that of chacha_key"); epee::mlocked> pwd_hash; - crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 1/*prehashed*/); + crypto::cn_slow_hash(data, size, pwd_hash.data(), 0/*variant*/, 1/*prehashed*/, 0/*height*/); for (uint64_t n = 1; n < kdf_rounds; ++n) - crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/); + crypto::cn_slow_hash(pwd_hash.data(), pwd_hash.size(), pwd_hash.data(), 0/*variant*/, 0/*prehashed*/, 0/*height*/); memcpy(&unwrap(unwrap(key)), pwd_hash.data(), sizeof(key)); } diff --git a/src/crypto/hash-ops.h b/src/crypto/hash-ops.h index 77b52e2d4..ba7ece0f5 100644 --- a/src/crypto/hash-ops.h +++ b/src/crypto/hash-ops.h @@ -79,7 +79,7 @@ enum { }; void cn_fast_hash(const void *data, size_t length, char *hash); -void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed); +void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height); void hash_extra_blake(const void *data, size_t length, char *hash); void hash_extra_groestl(const void *data, size_t length, char *hash); diff --git a/src/crypto/hash.h b/src/crypto/hash.h index 995e2294e..165fe6bb0 100644 --- a/src/crypto/hash.h +++ b/src/crypto/hash.h @@ -71,12 +71,12 @@ namespace crypto { return h; } - inline void cn_slow_hash(const void *data, std::size_t length, hash &hash, int variant = 0) { - cn_slow_hash(data, length, reinterpret_cast(&hash), variant, 0/*prehashed*/); + inline void cn_slow_hash(const void *data, std::size_t length, hash &hash, int variant = 0, uint64_t height = 0) { + cn_slow_hash(data, length, reinterpret_cast(&hash), variant, 0/*prehashed*/, height); } - inline void cn_slow_hash_prehashed(const void *data, std::size_t length, hash &hash, int variant = 0) { - cn_slow_hash(data, length, reinterpret_cast(&hash), variant, 1/*prehashed*/); + inline void cn_slow_hash_prehashed(const void *data, std::size_t length, hash &hash, int variant = 0, uint64_t height = 0) { + cn_slow_hash(data, length, reinterpret_cast(&hash), variant, 1/*prehashed*/, height); } inline void tree_hash(const hash *hashes, std::size_t count, hash &root_hash) { diff --git a/src/crypto/slow-hash.c b/src/crypto/slow-hash.c index ae0bd4e98..96eafde6d 100644 --- a/src/crypto/slow-hash.c +++ b/src/crypto/slow-hash.c @@ -39,6 +39,7 @@ #include "hash-ops.h" #include "oaes_lib.h" #include "variant2_int_sqrt.h" +#include "variant4_random_math.h" #define MEMORY (1 << 21) // 2MB scratchpad #define ITER (1 << 20) @@ -172,7 +173,7 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex const uint64_t sqrt_input = SWAP64LE(((uint64_t*)(ptr))[0]) + division_result #define VARIANT2_INTEGER_MATH_SSE2(b, ptr) \ - do if (variant >= 2) \ + do if ((variant == 2) || (variant == 3)) \ { \ VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr); \ VARIANT2_INTEGER_MATH_SQRT_STEP_SSE2(); \ @@ -182,7 +183,7 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex #if defined DBL_MANT_DIG && (DBL_MANT_DIG >= 50) // double precision floating point type has enough bits of precision on current platform #define VARIANT2_PORTABLE_INTEGER_MATH(b, ptr) \ - do if (variant >= 2) \ + do if ((variant == 2) || (variant == 3)) \ { \ VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr); \ VARIANT2_INTEGER_MATH_SQRT_STEP_FP64(); \ @@ -192,7 +193,7 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex // double precision floating point type is not good enough on current platform // fall back to the reference code (integer only) #define VARIANT2_PORTABLE_INTEGER_MATH(b, ptr) \ - do if (variant >= 2) \ + do if ((variant == 2) || (variant == 3)) \ { \ VARIANT2_INTEGER_MATH_DIVISION_STEP(b, ptr); \ VARIANT2_INTEGER_MATH_SQRT_STEP_REF(); \ @@ -214,6 +215,47 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex lo ^= SWAP64LE(*(U64(hp_state + (j ^ 0x20)) + 1)); \ } while (0) +#define V4_REG_LOAD(dst, src) \ + do { \ + memcpy((dst), (src), sizeof(v4_reg)); \ + if (sizeof(v4_reg) == sizeof(uint32_t)) \ + *(dst) = SWAP32LE(*(dst)); \ + else \ + *(dst) = SWAP64LE(*(dst)); \ + } while (0) + +#define VARIANT4_RANDOM_MATH_INIT() \ + v4_reg r[9]; \ + struct V4_Instruction code[NUM_INSTRUCTIONS_MAX + 1]; \ + do if (variant >= 4) \ + { \ + for (int i = 0; i < 4; ++i) \ + V4_REG_LOAD(r + i, (uint8_t*)(state.hs.w + 12) + sizeof(v4_reg) * i); \ + v4_random_math_init(code, height); \ + } while (0) + +#define VARIANT4_RANDOM_MATH(a, b, r, _b, _b1) \ + do if (variant >= 4) \ + { \ + uint64_t t; \ + memcpy(&t, b, sizeof(uint64_t)); \ + \ + if (sizeof(v4_reg) == sizeof(uint32_t)) \ + t ^= SWAP64LE((r[0] + r[1]) | ((uint64_t)(r[2] + r[3]) << 32)); \ + else \ + t ^= SWAP64LE((r[0] + r[1]) ^ (r[2] + r[3])); \ + \ + memcpy(b, &t, sizeof(uint64_t)); \ + \ + V4_REG_LOAD(r + 4, a); \ + V4_REG_LOAD(r + 5, (uint64_t*)(a) + 1); \ + V4_REG_LOAD(r + 6, _b); \ + V4_REG_LOAD(r + 7, _b1); \ + V4_REG_LOAD(r + 8, (uint64_t*)(_b1) + 1); \ + \ + v4_random_math(code, r); \ + } while (0) + #if !defined NO_AES && (defined(__x86_64__) || (defined(_MSC_VER) && defined(_WIN64))) // Optimised code below, uses x86-specific intrinsics, SSE2, AES-NI @@ -298,6 +340,7 @@ extern void aesb_pseudo_round(const uint8_t *in, uint8_t *out, const uint8_t *ex p = U64(&hp_state[j]); \ b[0] = p[0]; b[1] = p[1]; \ VARIANT2_INTEGER_MATH_SSE2(b, c); \ + VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \ __mul(); \ VARIANT2_2(); \ VARIANT2_SHUFFLE_ADD_SSE2(hp_state, j); \ @@ -694,7 +737,7 @@ void slow_hash_free_state(void) * @param length the length in bytes of the data * @param hash a pointer to a buffer in which the final 256 bit hash will be stored */ -void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed) +void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height) { RDATA_ALIGN16 uint8_t expandedKey[240]; /* These buffers are aligned to use later with SSE functions */ @@ -730,6 +773,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int VARIANT1_INIT64(); VARIANT2_INIT64(); + VARIANT4_RANDOM_MATH_INIT(); /* CryptoNight Step 2: Iteratively encrypt the results from Keccak to fill * the 2MB large random access buffer. @@ -901,6 +945,7 @@ union cn_slow_hash_state p = U64(&hp_state[j]); \ b[0] = p[0]; b[1] = p[1]; \ VARIANT2_PORTABLE_INTEGER_MATH(b, c); \ + VARIANT4_RANDOM_MATH(a, b, r, &_b, &_b1); \ __mul(); \ VARIANT2_2(); \ VARIANT2_SHUFFLE_ADD_NEON(hp_state, j); \ @@ -1063,7 +1108,7 @@ STATIC INLINE void aligned_free(void *ptr) } #endif /* FORCE_USE_HEAP */ -void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed) +void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height) { RDATA_ALIGN16 uint8_t expandedKey[240]; @@ -1100,6 +1145,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int VARIANT1_INIT64(); VARIANT2_INIT64(); + VARIANT4_RANDOM_MATH_INIT(); /* CryptoNight Step 2: Iteratively encrypt the results from Keccak to fill * the 2MB large random access buffer. @@ -1278,7 +1324,7 @@ STATIC INLINE void xor_blocks(uint8_t* a, const uint8_t* b) U64(a)[1] ^= U64(b)[1]; } -void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed) +void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height) { uint8_t text[INIT_SIZE_BYTE]; uint8_t a[AES_BLOCK_SIZE]; @@ -1317,6 +1363,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int VARIANT1_INIT64(); VARIANT2_INIT64(); + VARIANT4_RANDOM_MATH_INIT(); // use aligned data memcpy(expandedKey, aes_ctx->key->exp_data, aes_ctx->key->exp_data_len); @@ -1353,6 +1400,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int copy_block(c, p); VARIANT2_PORTABLE_INTEGER_MATH(c, c1); + VARIANT4_RANDOM_MATH(a, c, r, b, b + AES_BLOCK_SIZE); mul(c1, c, d); VARIANT2_2_PORTABLE(); VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j); @@ -1476,7 +1524,7 @@ union cn_slow_hash_state { }; #pragma pack(pop) -void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed) { +void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int prehashed, uint64_t height) { #ifndef FORCE_USE_HEAP uint8_t long_state[MEMORY]; #else @@ -1505,6 +1553,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int VARIANT1_PORTABLE_INIT(); VARIANT2_PORTABLE_INIT(); + VARIANT4_RANDOM_MATH_INIT(); oaes_key_import_data(aes_ctx, aes_key, AES_KEY_SIZE); for (i = 0; i < MEMORY / INIT_SIZE_BYTE; i++) { @@ -1537,6 +1586,7 @@ void cn_slow_hash(const void *data, size_t length, char *hash, int variant, int j = e2i(c1, MEMORY / AES_BLOCK_SIZE) * AES_BLOCK_SIZE; copy_block(c2, &long_state[j]); VARIANT2_PORTABLE_INTEGER_MATH(c2, c1); + VARIANT4_RANDOM_MATH(a, c2, r, b, b + AES_BLOCK_SIZE); mul(c1, c2, d); VARIANT2_2_PORTABLE(); VARIANT2_PORTABLE_SHUFFLE_ADD(long_state, j); diff --git a/src/crypto/variant4_random_math.h b/src/crypto/variant4_random_math.h new file mode 100644 index 000000000..8549498c4 --- /dev/null +++ b/src/crypto/variant4_random_math.h @@ -0,0 +1,441 @@ +#ifndef VARIANT4_RANDOM_MATH_H +#define VARIANT4_RANDOM_MATH_H + +// Register size can be configured to either 32 bit (uint32_t) or 64 bit (uint64_t) +typedef uint32_t v4_reg; + +enum V4_Settings +{ + // Generate code with minimal theoretical latency = 45 cycles, which is equivalent to 15 multiplications + TOTAL_LATENCY = 15 * 3, + + // Always generate at least 60 instructions + NUM_INSTRUCTIONS_MIN = 60, + + // Never generate more than 70 instructions (final RET instruction doesn't count here) + NUM_INSTRUCTIONS_MAX = 70, + + // Available ALUs for MUL + // Modern CPUs typically have only 1 ALU which can do multiplications + ALU_COUNT_MUL = 1, + + // Total available ALUs + // Modern CPUs have 4 ALUs, but we use only 3 because random math executes together with other main loop code + ALU_COUNT = 3, +}; + +enum V4_InstructionList +{ + MUL, // a*b + ADD, // a+b + C, C is an unsigned 32-bit constant + SUB, // a-b + ROR, // rotate right "a" by "b & 31" bits + ROL, // rotate left "a" by "b & 31" bits + XOR, // a^b + RET, // finish execution + V4_INSTRUCTION_COUNT = RET, +}; + +// V4_InstructionDefinition is used to generate code from random data +// Every random sequence of bytes is a valid code +// +// There are 8 registers in total: +// - 4 variable registers +// - 4 constant registers initialized from loop variables +// +// This is why dst_index is 2 bits +enum V4_InstructionDefinition +{ + V4_OPCODE_BITS = 3, + V4_DST_INDEX_BITS = 2, + V4_SRC_INDEX_BITS = 3, +}; + +struct V4_Instruction +{ + uint8_t opcode; + uint8_t dst_index; + uint8_t src_index; + uint32_t C; +}; + +#ifndef FORCEINLINE +#if defined(__GNUC__) +#define FORCEINLINE __attribute__((always_inline)) inline +#elif defined(_MSC_VER) +#define FORCEINLINE __forceinline +#else +#define FORCEINLINE inline +#endif +#endif + +#ifndef UNREACHABLE_CODE +#if defined(__GNUC__) +#define UNREACHABLE_CODE __builtin_unreachable() +#elif defined(_MSC_VER) +#define UNREACHABLE_CODE __assume(false) +#else +#define UNREACHABLE_CODE +#endif +#endif + +// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU: +// every switch-case will point to the same destination on every iteration of Cryptonight main loop +// +// This is about as fast as it can get without using low-level machine code generation +static FORCEINLINE void v4_random_math(const struct V4_Instruction* code, v4_reg* r) +{ + enum + { + REG_BITS = sizeof(v4_reg) * 8, + }; + +#define V4_EXEC(i) \ + { \ + const struct V4_Instruction* op = code + i; \ + const v4_reg src = r[op->src_index]; \ + v4_reg* dst = r + op->dst_index; \ + switch (op->opcode) \ + { \ + case MUL: \ + *dst *= src; \ + break; \ + case ADD: \ + *dst += src + op->C; \ + break; \ + case SUB: \ + *dst -= src; \ + break; \ + case ROR: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case ROL: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case XOR: \ + *dst ^= src; \ + break; \ + case RET: \ + return; \ + default: \ + UNREACHABLE_CODE; \ + break; \ + } \ + } + +#define V4_EXEC_10(j) \ + V4_EXEC(j + 0) \ + V4_EXEC(j + 1) \ + V4_EXEC(j + 2) \ + V4_EXEC(j + 3) \ + V4_EXEC(j + 4) \ + V4_EXEC(j + 5) \ + V4_EXEC(j + 6) \ + V4_EXEC(j + 7) \ + V4_EXEC(j + 8) \ + V4_EXEC(j + 9) + + // Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency + // I've checked all block heights < 10,000,000 and here is the distribution of program sizes: + // + // 60 27960 + // 61 105054 + // 62 2452759 + // 63 5115997 + // 64 1022269 + // 65 1109635 + // 66 153145 + // 67 8550 + // 68 4529 + // 69 102 + + // Unroll 70 instructions here + V4_EXEC_10(0); // instructions 0-9 + V4_EXEC_10(10); // instructions 10-19 + V4_EXEC_10(20); // instructions 20-29 + V4_EXEC_10(30); // instructions 30-39 + V4_EXEC_10(40); // instructions 40-49 + V4_EXEC_10(50); // instructions 50-59 + V4_EXEC_10(60); // instructions 60-69 + +#undef V4_EXEC_10 +#undef V4_EXEC +} + +// If we don't have enough data available, generate more +static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size) +{ + if (*data_index + bytes_needed > data_size) + { + hash_extra_blake(data, data_size, (char*) data); + *data_index = 0; + } +} + +// Generates as many random math operations as possible with given latency and ALU restrictions +// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions +static inline int v4_random_math_init(struct V4_Instruction* code, const uint64_t height) +{ + // MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle + // These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake + // + // AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors + // Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors + // AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same + // Source: https://www.agner.org/optimize/instruction_tables.pdf + const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 }; + + // Instruction latencies for theoretical ASIC implementation + const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 }; + + // Available ALUs for each instruction + const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT }; + + int8_t data[32]; + memset(data, 0, sizeof(data)); + uint64_t tmp = SWAP64LE(height); + memcpy(data, &tmp, sizeof(uint64_t)); + + // Set data_index past the last byte in data + // to trigger full data update with blake hash + // before we start using it + size_t data_index = sizeof(data); + + int code_size; + + // There is a small chance (1.8%) that register R8 won't be used in the generated program + // So we keep track of it and try again if it's not used + bool r8_used; + do { + int latency[9]; + int asic_latency[9]; + + // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution + // byte 0: current value of the destination register + // byte 1: instruction opcode + // byte 2: current value of the source register + // + // Registers R4-R8 are constant and are treated as having the same value because when we do + // the same operation twice with two constant source registers, it can be optimized into a single operation + uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF }; + + bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT]; + bool is_rotation[V4_INSTRUCTION_COUNT]; + bool rotated[4]; + int rotate_count = 0; + + memset(latency, 0, sizeof(latency)); + memset(asic_latency, 0, sizeof(asic_latency)); + memset(alu_busy, 0, sizeof(alu_busy)); + memset(is_rotation, 0, sizeof(is_rotation)); + memset(rotated, 0, sizeof(rotated)); + is_rotation[ROR] = true; + is_rotation[ROL] = true; + + int num_retries = 0; + code_size = 0; + + int total_iterations = 0; + r8_used = false; + + // Generate random code to achieve minimal required latency for our abstract CPU + // Try to get this latency for all 4 registers + while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64)) + { + // Fail-safe to guarantee loop termination + ++total_iterations; + if (total_iterations > 256) + break; + + check_data(&data_index, 1, data, sizeof(data)); + + const uint8_t c = ((uint8_t*)data)[data_index++]; + + // MUL = opcodes 0-2 + // ADD = opcode 3 + // SUB = opcode 4 + // ROR/ROL = opcode 5, shift direction is selected randomly + // XOR = opcodes 6-7 + uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1); + if (opcode == 5) + { + check_data(&data_index, 1, data, sizeof(data)); + opcode = (data[data_index++] >= 0) ? ROR : ROL; + } + else if (opcode >= 6) + { + opcode = XOR; + } + else + { + opcode = (opcode <= 2) ? MUL : (opcode - 2); + } + + uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1); + uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1); + + const int a = dst_index; + int b = src_index; + + // Don't do ADD/SUB/XOR with the same register + if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) + { + // Use register R8 as source instead + b = 8; + src_index = 8; + } + + // Don't do rotation with the same destination twice because it's equal to a single rotation + if (is_rotation[opcode] && rotated[a]) + { + continue; + } + + // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized: + // 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations + // 2xXOR(a, b) = NOP + if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16))) + { + continue; + } + + // Find which ALU is available (and when) for this instruction + int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b]; + int alu_index = -1; + while (next_latency < TOTAL_LATENCY) + { + for (int i = op_ALUs[opcode] - 1; i >= 0; --i) + { + if (!alu_busy[next_latency][i]) + { + // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check + if ((opcode == ADD) && alu_busy[next_latency + 1][i]) + { + continue; + } + + // Rotation can only start when previous rotation is finished, so do an additional availability check + if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode])) + { + continue; + } + + alu_index = i; + break; + } + } + if (alu_index >= 0) + { + break; + } + ++next_latency; + } + + // Don't generate instructions that leave some register unchanged for more than 7 cycles + if (next_latency > latency[a] + 7) + { + continue; + } + + next_latency += op_latency[opcode]; + + if (next_latency <= TOTAL_LATENCY) + { + if (is_rotation[opcode]) + { + ++rotate_count; + } + + // Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined + alu_busy[next_latency - op_latency[opcode]][alu_index] = true; + latency[a] = next_latency; + + // ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple + asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode]; + + rotated[a] = is_rotation[opcode]; + + inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16); + + code[code_size].opcode = opcode; + code[code_size].dst_index = dst_index; + code[code_size].src_index = src_index; + code[code_size].C = 0; + + if (src_index == 8) + { + r8_used = true; + } + + if (opcode == ADD) + { + // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too + alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true; + + // ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C" + check_data(&data_index, sizeof(uint32_t), data, sizeof(data)); + uint32_t t; + memcpy(&t, data + data_index, sizeof(uint32_t)); + code[code_size].C = SWAP32LE(t); + data_index += sizeof(uint32_t); + } + + ++code_size; + if (code_size >= NUM_INSTRUCTIONS_MIN) + { + break; + } + } + else + { + ++num_retries; + } + } + + // ASIC has more execution resources and can extract as much parallelism from the code as possible + // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC + // Get this latency for at least 1 of the 4 registers + const int prev_code_size = code_size; + while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) + { + int min_idx = 0; + int max_idx = 0; + for (int i = 1; i < 4; ++i) + { + if (asic_latency[i] < asic_latency[min_idx]) min_idx = i; + if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; + } + + const uint8_t pattern[3] = { ROR, MUL, MUL }; + const uint8_t opcode = pattern[(code_size - prev_code_size) % 3]; + latency[min_idx] = latency[max_idx] + op_latency[opcode]; + asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode]; + + code[code_size].opcode = opcode; + code[code_size].dst_index = min_idx; + code[code_size].src_index = max_idx; + code[code_size].C = 0; + ++code_size; + } + + // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time + // It never does more than 4 iterations for all block heights < 10,000,000 + } while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX)); + + // It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here + // Add final instruction to stop the interpreter + code[code_size].opcode = RET; + code[code_size].dst_index = 0; + code[code_size].src_index = 0; + code[code_size].C = 0; + + return code_size; +} + +#endif diff --git a/src/cryptonote_basic/cryptonote_format_utils.cpp b/src/cryptonote_basic/cryptonote_format_utils.cpp index f6daaab95..10fb5444c 100644 --- a/src/cryptonote_basic/cryptonote_format_utils.cpp +++ b/src/cryptonote_basic/cryptonote_format_utils.cpp @@ -1174,7 +1174,7 @@ namespace cryptonote } blobdata bd = get_block_hashing_blob(b); const int cn_variant = b.major_version >= 7 ? b.major_version - 6 : 0; - crypto::cn_slow_hash(bd.data(), bd.size(), res, cn_variant); + crypto::cn_slow_hash(bd.data(), bd.size(), res, cn_variant, height); return true; } //--------------------------------------------------------------- diff --git a/tests/hash/CMakeLists.txt b/tests/hash/CMakeLists.txt index 433cf94e9..105cf2c13 100644 --- a/tests/hash/CMakeLists.txt +++ b/tests/hash/CMakeLists.txt @@ -43,7 +43,7 @@ set_property(TARGET hash-tests PROPERTY FOLDER "tests") -foreach (hash IN ITEMS fast slow slow-1 slow-2 tree extra-blake extra-groestl extra-jh extra-skein) +foreach (hash IN ITEMS fast slow slow-1 slow-2 slow-4 tree extra-blake extra-groestl extra-jh extra-skein) add_test( NAME "hash-${hash}" COMMAND hash-tests "${hash}" "${CMAKE_CURRENT_SOURCE_DIR}/tests-${hash}.txt") diff --git a/tests/hash/main.cpp b/tests/hash/main.cpp index 7767d0d3b..acfd99e96 100644 --- a/tests/hash/main.cpp +++ b/tests/hash/main.cpp @@ -44,6 +44,13 @@ using namespace std; using namespace crypto; typedef crypto::hash chash; +struct V4_Data +{ + const void* data; + size_t length; + uint64_t height; +}; + PUSH_WARNINGS DISABLE_VS_WARNINGS(4297) extern "C" { @@ -54,13 +61,17 @@ extern "C" { tree_hash((const char (*)[crypto::HASH_SIZE]) data, length >> 5, hash); } static void cn_slow_hash_0(const void *data, size_t length, char *hash) { - return cn_slow_hash(data, length, hash, 0/*variant*/, 0/*prehashed*/); + return cn_slow_hash(data, length, hash, 0/*variant*/, 0/*prehashed*/, 0/*height*/); } static void cn_slow_hash_1(const void *data, size_t length, char *hash) { - return cn_slow_hash(data, length, hash, 1/*variant*/, 0/*prehashed*/); + return cn_slow_hash(data, length, hash, 1/*variant*/, 0/*prehashed*/, 0/*height*/); } static void cn_slow_hash_2(const void *data, size_t length, char *hash) { - return cn_slow_hash(data, length, hash, 2/*variant*/, 0/*prehashed*/); + return cn_slow_hash(data, length, hash, 2/*variant*/, 0/*prehashed*/, 0/*height*/); + } + static void cn_slow_hash_4(const void *data, size_t, char *hash) { + const V4_Data* p = reinterpret_cast(data); + return cn_slow_hash(p->data, p->length, hash, 4/*variant*/, 0/*prehashed*/, p->height); } } POP_WARNINGS @@ -72,7 +83,7 @@ struct hash_func { } hashes[] = {{"fast", cn_fast_hash}, {"slow", cn_slow_hash_0}, {"tree", hash_tree}, {"extra-blake", hash_extra_blake}, {"extra-groestl", hash_extra_groestl}, {"extra-jh", hash_extra_jh}, {"extra-skein", hash_extra_skein}, - {"slow-1", cn_slow_hash_1}, {"slow-2", cn_slow_hash_2}}; + {"slow-1", cn_slow_hash_1}, {"slow-2", cn_slow_hash_2}, {"slow-4", cn_slow_hash_4}}; int test_variant2_int_sqrt(); int test_variant2_int_sqrt_ref(); @@ -140,7 +151,15 @@ int main(int argc, char *argv[]) { input.exceptions(ios_base::badbit | ios_base::failbit | ios_base::eofbit); input.clear(input.rdstate()); get(input, data); - f(data.data(), data.size(), (char *) &actual); + if (f == cn_slow_hash_4) { + V4_Data d; + d.data = data.data(); + d.length = data.size(); + get(input, d.height); + f(&d, 0, (char *) &actual); + } else { + f(data.data(), data.size(), (char *) &actual); + } if (expected != actual) { size_t i; cerr << "Hash mismatch on test " << test << endl << "Input: "; diff --git a/tests/hash/tests-slow-4.txt b/tests/hash/tests-slow-4.txt new file mode 100644 index 000000000..e9eef60e5 --- /dev/null +++ b/tests/hash/tests-slow-4.txt @@ -0,0 +1,10 @@ +47c996e2d6aa453f50b15a6e829a8c6e5070500c08ba2426019510753e31af42 5468697320697320612074657374205468697320697320612074657374205468697320697320612074657374 1806260 +eb17f755e8f394ff911603826b0e2a37c3f40a5990693a1be7e39cd5c178f0b4 4c6f72656d20697073756d20646f6c6f722073697420616d65742c20636f6e73656374657475722061646970697363696e67 1806261 +f4de6adc61efa498fd4929ed00e88ed8e12caa2907f99cb42442567d3da9daec 656c69742c2073656420646f20656975736d6f642074656d706f7220696e6369646964756e74207574206c61626f7265 1806262 +03004aaa1cdbda343fcbc835aaca191b8577c21267dadd0e4e86a57e68614a71 657420646f6c6f7265206d61676e6120616c697175612e20557420656e696d206164206d696e696d2076656e69616d2c 1806263 +2081cb5646549b44356f5c81787c529367751bc1cdd5f1ea8c0a333b5e49e220 71756973206e6f737472756420657865726369746174696f6e20756c6c616d636f206c61626f726973206e697369 1806264 +653c57f666f6fa1121b82f217485f6fda64ce58bf311d664e92da9119c7d5b95 757420616c697175697020657820656120636f6d6d6f646f20636f6e7365717561742e20447569732061757465 1806265 +20c4b9f8ded7fd1e348341ce2b6297e5ac330e588e5f34985446fd20346b69e3 697275726520646f6c6f7220696e20726570726568656e646572697420696e20766f6c7570746174652076656c6974 1806266 +82e35ae2f7258fb5cb6b53b332f898ac19c385b49fc35e32ae0c5ab56025f763 657373652063696c6c756d20646f6c6f726520657520667567696174206e756c6c612070617269617475722e 1806267 +ddcf95b7d0066668a1d36d4115de1a2bc52dd1f95d94366ec34c8c3c7196e5dd 4578636570746575722073696e74206f6363616563617420637570696461746174206e6f6e2070726f6964656e742c 1806268 +882c2bddf05736ab1072c678c3b75661813709a2ac1fd6e861f2dcc65d466b90 73756e7420696e2063756c706120717569206f666669636961206465736572756e74206d6f6c6c697420616e696d20696420657374206c61626f72756d2e 1806269