diff --git a/third_party/libdivide.h b/third_party/libdivide.h index 562e49f..e9a31d1 100644 --- a/third_party/libdivide.h +++ b/third_party/libdivide.h @@ -1,8 +1,8 @@ // libdivide.h - Optimized integer division // https://libdivide.com // -// Copyright (C) 2010 - 2019 ridiculous_fish, -// Copyright (C) 2016 - 2019 Kim Walisch, +// Copyright (C) 2010 - 2021 ridiculous_fish, +// Copyright (C) 2016 - 2021 Kim Walisch, // // libdivide is dual-licensed under the Boost or zlib licenses. // You may use libdivide under the terms of either of these. @@ -11,17 +11,12 @@ #ifndef LIBDIVIDE_H #define LIBDIVIDE_H -#define LIBDIVIDE_VERSION "3.0" -#define LIBDIVIDE_VERSION_MAJOR 3 +#define LIBDIVIDE_VERSION "5.0" +#define LIBDIVIDE_VERSION_MAJOR 5 #define LIBDIVIDE_VERSION_MINOR 0 #include - -#if defined(__cplusplus) -#include -#include -#include -#else +#if !defined(__AVR__) #include #include #endif @@ -38,9 +33,15 @@ #if defined(_MSC_VER) #include +#pragma warning(push) // disable warning C4146: unary minus operator applied // to unsigned type, result still unsigned #pragma warning(disable : 4146) +// disable warning C4204: nonstandard extension used : non-constant aggregate +// initializer +// +// It's valid C99 +#pragma warning(disable : 4204) #define LIBDIVIDE_VC #endif @@ -74,13 +75,28 @@ #define LIBDIVIDE_FUNCTION __func__ #endif +// Set up forced inlining if possible. +// We need both the attribute and keyword to avoid "might not be inlineable" warnings. +#ifdef __has_attribute +#if __has_attribute(always_inline) +#define LIBDIVIDE_INLINE __attribute__((always_inline)) inline +#endif +#endif +#ifndef LIBDIVIDE_INLINE +#define LIBDIVIDE_INLINE inline +#endif + +#if defined(__AVR__) +#define LIBDIVIDE_ERROR(msg) +#else #define LIBDIVIDE_ERROR(msg) \ do { \ fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", __LINE__, LIBDIVIDE_FUNCTION, msg); \ abort(); \ } while (0) +#endif -#if defined(LIBDIVIDE_ASSERTIONS_ON) +#if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__) #define LIBDIVIDE_ASSERT(x) \ do { \ if (!(x)) { \ @@ -103,6 +119,16 @@ namespace libdivide { // by up to 10% because of reduced memory bandwidth. #pragma pack(push, 1) +struct libdivide_u16_t { + uint16_t magic; + uint8_t more; +}; + +struct libdivide_s16_t { + int16_t magic; + uint8_t more; +}; + struct libdivide_u32_t { uint32_t magic; uint8_t more; @@ -123,6 +149,16 @@ struct libdivide_s64_t { uint8_t more; }; +struct libdivide_u16_branchfree_t { + uint16_t magic; + uint8_t more; +}; + +struct libdivide_s16_branchfree_t { + int16_t magic; + uint8_t more; +}; + struct libdivide_u32_branchfree_t { uint32_t magic; uint8_t more; @@ -178,66 +214,106 @@ struct libdivide_s64_branchfree_t { // whether the divisor is negated. In branchfree strategy, it is not negated. 
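
For orientation before the encoding details: a divider struct is generated once per divisor and then reused for every division, which is the whole point of the library. A minimal scalar usage sketch (illustrative only; the include path depends on how the header is vendored):

    #include <stddef.h>
    #include <stdint.h>
    #include "libdivide.h"

    // Generate one divider for d, then replace every hardware divide in the
    // loop with a multiply-and-shift through that divider.
    static void divide_all(uint32_t *buf, size_t n, uint32_t d) {
        struct libdivide_u32_t divider = libdivide_u32_gen(d);  // one real division here
        for (size_t i = 0; i < n; ++i) {
            buf[i] = libdivide_u32_do(buf[i], &divider);        // no division instruction
        }
    }

The branchfree variants follow the same pattern through libdivide_u32_branchfree_gen / libdivide_u32_branchfree_do.
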
enum { + LIBDIVIDE_16_SHIFT_MASK = 0x1F, LIBDIVIDE_32_SHIFT_MASK = 0x1F, LIBDIVIDE_64_SHIFT_MASK = 0x3F, LIBDIVIDE_ADD_MARKER = 0x40, LIBDIVIDE_NEGATIVE_DIVISOR = 0x80 }; -static inline struct libdivide_s32_t libdivide_s32_gen(int32_t d); -static inline struct libdivide_u32_t libdivide_u32_gen(uint32_t d); -static inline struct libdivide_s64_t libdivide_s64_gen(int64_t d); -static inline struct libdivide_u64_t libdivide_u64_gen(uint64_t d); +static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d); +static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d); +static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d); +static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d); +static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d); +static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d); -static inline struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d); -static inline struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d); -static inline struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); -static inline struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); +static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d); +static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d); +static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d); +static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d); +static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); +static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); -static inline int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom); -static inline uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom); -static inline int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom); -static inline uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw( + int16_t numer, int16_t magic, uint8_t more); +static LIBDIVIDE_INLINE int16_t libdivide_s16_do( + int16_t numer, const struct libdivide_s16_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw( + uint16_t numer, uint16_t magic, uint8_t more); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_do( + uint16_t numer, const struct libdivide_u16_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_do( + int32_t numer, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_do( + uint32_t numer, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_do( + int64_t numer, const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_do( + uint64_t numer, const struct libdivide_u64_t *denom); -static inline int32_t libdivide_s32_branchfree_do( +static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do( + int16_t numer, const struct libdivide_s16_branchfree_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do( + uint16_t numer, const struct libdivide_u16_branchfree_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do( int32_t numer, const struct libdivide_s32_branchfree_t *denom); -static 
inline uint32_t libdivide_u32_branchfree_do( +static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do( uint32_t numer, const struct libdivide_u32_branchfree_t *denom); -static inline int64_t libdivide_s64_branchfree_do( +static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do( int64_t numer, const struct libdivide_s64_branchfree_t *denom); -static inline uint64_t libdivide_u64_branchfree_do( +static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do( uint64_t numer, const struct libdivide_u64_branchfree_t *denom); -static inline int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); -static inline uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); -static inline int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); -static inline uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom); -static inline int32_t libdivide_s32_branchfree_recover( +static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover( + const struct libdivide_s16_branchfree_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover( + const struct libdivide_u16_branchfree_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover( const struct libdivide_s32_branchfree_t *denom); -static inline uint32_t libdivide_u32_branchfree_recover( +static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover( const struct libdivide_u32_branchfree_t *denom); -static inline int64_t libdivide_s64_branchfree_recover( +static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover( const struct libdivide_s64_branchfree_t *denom); -static inline uint64_t libdivide_u64_branchfree_recover( +static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover( const struct libdivide_u64_branchfree_t *denom); //////// Internal Utility Functions -static inline uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { +static LIBDIVIDE_INLINE uint16_t libdivide_mullhi_u16(uint16_t x, uint16_t y) { + uint32_t xl = x, yl = y; + uint32_t rl = xl * yl; + return (uint16_t)(rl >> 16); +} + +static LIBDIVIDE_INLINE int16_t libdivide_mullhi_s16(int16_t x, int16_t y) { + int32_t xl = x, yl = y; + int32_t rl = xl * yl; + // needs to be arithmetic shift + return (int16_t)(rl >> 16); +} + +static LIBDIVIDE_INLINE uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { uint64_t xl = x, yl = y; uint64_t rl = xl * yl; return (uint32_t)(rl >> 32); } -static inline int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { +static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { int64_t xl = x, yl = y; int64_t rl = xl * yl; // needs to be arithmetic shift return (int32_t)(rl >> 32); } -static inline uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { +static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { #if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) return __umulh(x, y); #elif defined(HAS_INT128_T) @@ -263,7 +339,7 @@ static inline uint64_t 
libdivide_mullhi_u64(uint64_t x, uint64_t y) { #endif } -static inline int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { +static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { #if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) return __mulh(x, y); #elif defined(HAS_INT128_T) @@ -285,8 +361,41 @@ static inline int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { #endif } -static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { -#if defined(__GNUC__) || __has_builtin(__builtin_clz) +static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) { +#if defined(__AVR__) + // Fast way to count leading zeros + // On the AVR 8-bit architecture __builtin_clz() works on a int16_t. + return __builtin_clz(val); +#elif defined(__GNUC__) || __has_builtin(__builtin_clz) + // Fast way to count leading zeros + return __builtin_clz(val) - 16; +#elif defined(LIBDIVIDE_VC) + unsigned long result; + if (_BitScanReverse(&result, (unsigned long)val)) { + return (int16_t)(15 - result); + } + return 0; +#else + if (val == 0) return 16; + int16_t result = 4; + uint16_t hi = 0xFU << 12; + while ((val & hi) == 0) { + hi >>= 4; + result += 4; + } + while (val & hi) { + result -= 1; + hi <<= 1; + } + return result; +#endif +} + +static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) { +#if defined(__AVR__) + // Fast way to count leading zeros + return __builtin_clzl(val); +#elif defined(__GNUC__) || __has_builtin(__builtin_clz) // Fast way to count leading zeros return __builtin_clz(val); #elif defined(LIBDIVIDE_VC) @@ -311,7 +420,7 @@ static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { #endif } -static inline int32_t libdivide_count_leading_zeros64(uint64_t val) { +static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) { #if defined(__GNUC__) || __has_builtin(__builtin_clzll) // Fast way to count leading zeros return __builtin_clzll(val); @@ -329,10 +438,21 @@ static inline int32_t libdivide_count_leading_zeros64(uint64_t val) { #endif } +// libdivide_32_div_16_to_16: divides a 32-bit uint {u1, u0} by a 16-bit +// uint {v}. The result must fit in 16 bits. +// Returns the quotient directly and the remainder in *r +static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16( + uint16_t u1, uint16_t u0, uint16_t v, uint16_t* r) { + uint32_t n = ((uint32_t)u1 << 16) | u0; + uint16_t result = (uint16_t)(n / v); + *r = (uint16_t)(n - result * (uint32_t)v); + return result; +} + // libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit // uint {v}. The result must fit in 32 bits. // Returns the quotient directly and the remainder in *r -static inline uint32_t libdivide_64_div_32_to_32( +static LIBDIVIDE_INLINE uint32_t libdivide_64_div_32_to_32( uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { #if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM) uint32_t result; @@ -346,93 +466,106 @@ static inline uint32_t libdivide_64_div_32_to_32( #endif } -// libdivide_128_div_64_to_64: divides a 128-bit uint {u1, u0} by a 64-bit -// uint {v}. The result must fit in 64 bits. -// Returns the quotient directly and the remainder in *r -static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) { +// libdivide_128_div_64_to_64: divides a 128-bit uint {numhi, numlo} by a 64-bit uint {den}. The +// result must fit in 64 bits. 
Returns the quotient directly and the remainder in *r +static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64( + uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) { // N.B. resist the temptation to use __uint128_t here. // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because - // it's not inlined. + // it's not LIBDIVIDE_INLINEd. #if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM) uint64_t result; - __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1)); + __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi)); return result; #else - // Code taken from Hacker's Delight: - // http://www.hackersdelight.org/HDcode/divlu.c. - // License permits inclusion here per: - // http://www.hackersdelight.org/permissions.htm + // We work in base 2**32. + // A uint32 holds a single digit. A uint64 holds two digits. + // Our numerator is conceptually [num3, num2, num1, num0]. + // Our denominator is [den1, den0]. + const uint64_t b = ((uint64_t)1 << 32); - const uint64_t b = (1ULL << 32); // Number base (32 bits) - uint64_t un1, un0; // Norm. dividend LSD's - uint64_t vn1, vn0; // Norm. divisor digits - uint64_t q1, q0; // Quotient digits - uint64_t un64, un21, un10; // Dividend digit pairs - uint64_t rhat; // A remainder - int32_t s; // Shift amount for norm + // The high and low digits of our computed quotient. + uint32_t q1; + uint32_t q0; - // If overflow, set rem. to an impossible value, - // and return the largest possible quotient - if (u1 >= v) { - *r = (uint64_t)-1; - return (uint64_t)-1; + // The normalization shift factor. + int shift; + + // The high and low digits of our denominator (after normalizing). + // Also the low 2 digits of our numerator (after normalizing). + uint32_t den1; + uint32_t den0; + uint32_t num1; + uint32_t num0; + + // A partial remainder. + uint64_t rem; + + // The estimated quotient, and its corresponding remainder (unrelated to true remainder). + uint64_t qhat; + uint64_t rhat; + + // Variables used to correct the estimated quotient. + uint64_t c1; + uint64_t c2; + + // Check for overflow and divide by 0. + if (numhi >= den) { + if (r != NULL) *r = ~0ull; + return ~0ull; } - // count leading zeros - s = libdivide_count_leading_zeros64(v); - if (s > 0) { - // Normalize divisor - v = v << s; - un64 = (u1 << s) | (u0 >> (64 - s)); - un10 = u0 << s; // Shift dividend left - } else { - // Avoid undefined behavior of (u0 >> 64). - // The behavior is undefined if the right operand is - // negative, or greater than or equal to the length - // in bits of the promoted left operand. - un64 = u1; - un10 = u0; - } + // Determine the normalization factor. We multiply den by this, so that its leading digit is at + // least half b. In binary this means just shifting left by the number of leading zeros, so that + // there's a 1 in the MSB. + // We also shift numer by the same amount. This cannot overflow because numhi < den. + // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting + // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is + // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it. 
+ shift = libdivide_count_leading_zeros64(den); + den <<= shift; + numhi <<= shift; + numhi |= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63); + numlo <<= shift; - // Break divisor up into two 32-bit digits - vn1 = v >> 32; - vn0 = v & 0xFFFFFFFF; + // Extract the low digits of the numerator and both digits of the denominator. + num1 = (uint32_t)(numlo >> 32); + num0 = (uint32_t)(numlo & 0xFFFFFFFFu); + den1 = (uint32_t)(den >> 32); + den0 = (uint32_t)(den & 0xFFFFFFFFu); - // Break right half of dividend into two digits - un1 = un10 >> 32; - un0 = un10 & 0xFFFFFFFF; + // We wish to compute q1 = [n3 n2 n1] / [d1 d0]. + // Estimate q1 as [n3 n2] / [d1], and then correct it. + // Note while qhat may be 2 digits, q1 is always 1 digit. + qhat = numhi / den1; + rhat = numhi % den1; + c1 = qhat * den0; + c2 = rhat * b + num1; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; + q1 = (uint32_t)qhat; - // Compute the first quotient digit, q1 - q1 = un64 / vn1; - rhat = un64 - q1 * vn1; + // Compute the true (partial) remainder. + rem = numhi * b + num1 - q1 * den; - while (q1 >= b || q1 * vn0 > b * rhat + un1) { - q1 = q1 - 1; - rhat = rhat + vn1; - if (rhat >= b) break; - } + // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0]. + // Estimate q0 as [rem1 rem0] / [d1] and correct it. + qhat = rem / den1; + rhat = rem % den1; + c1 = qhat * den0; + c2 = rhat * b + num0; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; + q0 = (uint32_t)qhat; - // Multiply and subtract - un21 = un64 * b + un1 - q1 * v; - - // Compute the second quotient digit - q0 = un21 / vn1; - rhat = un21 - q0 * vn1; - - while (q0 >= b || q0 * vn0 > b * rhat + un0) { - q0 = q0 - 1; - rhat = rhat + vn1; - if (rhat >= b) break; - } - - *r = (un21 * b + un0 - q0 * v) >> s; - return q1 * b + q0; + // Return remainder if requested. + if (r != NULL) *r = (rem * b + num0 - q0 * den) >> shift; + return ((uint64_t)q1 << 32) | q0; #endif } // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0) -static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t signed_shift) { +static LIBDIVIDE_INLINE void libdivide_u128_shift( + uint64_t *u1, uint64_t *u0, int32_t signed_shift) { if (signed_shift > 0) { uint32_t shift = signed_shift; *u1 <<= shift; @@ -447,7 +580,7 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign } // Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder. -static uint64_t libdivide_128_div_128_to_64( +static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64( uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) { #if defined(HAS_INT128_T) && defined(HAS_INT128_DIV) __uint128_t ufull = u_hi; @@ -544,9 +677,178 @@ static uint64_t libdivide_128_div_128_to_64( #endif } +////////// UINT16 + +static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen( + uint16_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_u16_t result; + uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d)); + + // Power of 2 + if ((d & (d - 1)) == 0) { + // We need to subtract 1 from the shift value in case of an unsigned + // branchfree divider because there is a hardcoded right shift by 1 + // in its division algorithm. Because of this we also need to add back + // 1 in its recovery algorithm. 
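
Stepping back from the power-of-2 special case for a moment: for general divisors the generator relies on the fixed-point reciprocal identity. With k = ceil(log2(d)) and m = ceil(2^(16+k) / d), the quotient (n * m) >> (16 + k) equals n / d for every 16-bit n. The catch is that m may need 17 bits; storing only a 16-bit magic and compensating at runtime is what the ADD_MARKER path below does. A brute-force illustration of the identity (a sketch only, not the packed encoding libdivide actually stores; the helper name is made up):

    #include <assert.h>
    #include <stdint.h>

    // Illustration of the identity, brute-forced for d = 7:
    // k = ceil(log2(7)) = 3, m = ceil(2^19 / 7) = 74899 (a 17-bit value).
    static void check_u16_reciprocal_identity(void) {
        const uint32_t d = 7, k = 3;
        const uint32_t m = (((uint32_t)1 << (16 + k)) + d - 1) / d;  // round-up magic
        for (uint32_t n = 0; n <= 0xFFFF; ++n) {
            uint32_t q = (uint32_t)(((uint64_t)n * m) >> (16 + k));
            assert(q == n / d);
        }
    }
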
+ result.magic = 0; + result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); + } + else { + uint8_t more; + uint16_t rem, proposed_m; + proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem); + + LIBDIVIDE_ASSERT(rem > 0 && rem < d); + const uint16_t e = d - rem; + + // This power works if e < 2**floor_log_2_d. + if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) { + // This power works + more = floor_log_2_d; + } + else { + // We have to use the general 17-bit algorithm. We need to compute + // (2**power) / d. However, we already have (2**(power-1))/d and + // its remainder. By doubling both, and then correcting the + // remainder, we can compute the larger division. + // don't care about overflow here - in fact, we expect it + proposed_m += proposed_m; + const uint16_t twice_rem = rem + rem; + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + } + result.magic = 1 + proposed_m; + result.more = more; + // result.more's shift should in general be ceil_log_2_d. But if we + // used the smaller power, we subtract one from the shift because we're + // using the smaller power. If we're using the larger power, we + // subtract one from the shift because it's taken care of by the add + // indicator. So floor_log_2_d happens to be correct in both cases. + } + return result; +} + +struct libdivide_u16_t libdivide_u16_gen(uint16_t d) { + return libdivide_internal_u16_gen(d, 0); +} + +struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) { + if (d == 1) { + LIBDIVIDE_ERROR("branchfree divider must be != 1"); + } + struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1); + struct libdivide_u16_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK) }; + return ret; +} + +// The original libdivide_u16_do takes a const pointer. However, this cannot be used +// with a compile time constant libdivide_u16_t: it will generate a warning about +// taking the address of a temporary. Hence this overload. +uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) { + if (!magic) { + return numer >> more; + } + else { + uint16_t q = libdivide_mullhi_u16(magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint16_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_16_SHIFT_MASK); + } + else { + // All upper bits are 0, + // don't need to mask them off. + return q >> more; + } + } +} + +uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) { + return libdivide_u16_do_raw(numer, denom->magic, denom->more); +} + +uint16_t libdivide_u16_branchfree_do( + uint16_t numer, const struct libdivide_u16_branchfree_t* denom) { + uint16_t q = libdivide_mullhi_u16(denom->magic, numer); + uint16_t t = ((numer - q) >> 1) + q; + return t >> denom->more; +} + +uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!denom->magic) { + return (uint16_t)1 << shift; + } else if (!(more & LIBDIVIDE_ADD_MARKER)) { + // We compute q = n/d = n*m / 2^(16 + shift) + // Therefore we have d = 2^(16 + shift) / m + // We need to ceil it. 
+ // We know d is not a power of 2, so m is not a power of 2, + // so we can just add 1 to the floor + uint16_t hi_dividend = (uint16_t)1 << shift; + uint16_t rem_ignored; + return 1 + libdivide_32_div_16_to_16(hi_dividend, 0, denom->magic, &rem_ignored); + } else { + // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). + // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now + // Also note that shift may be as high as 15, so shift + 1 will + // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and + // then double the quotient and remainder. + uint32_t half_n = (uint32_t)1 << (16 + shift); + uint32_t d = ( (uint32_t)1 << 16) | denom->magic; + // Note that the quotient is guaranteed <= 16 bits, but the remainder + // may need 17! + uint16_t half_q = (uint16_t)(half_n / d); + uint32_t rem = half_n % d; + // We computed 2^(16+shift)/(m+2^16) + // Need to double it, and then add 1 to the quotient if doubling the + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits + uint16_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!denom->magic) { + return (uint16_t)1 << (shift + 1); + } else { + // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). + // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now + // Also note that shift may be as high as 15, so shift + 1 will + // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and + // then double the quotient and remainder. + uint32_t half_n = (uint32_t)1 << (16 + shift); + uint32_t d = ((uint32_t)1 << 16) | denom->magic; + // Note that the quotient is guaranteed <= 16 bits, but the remainder + // may need 17! + uint16_t half_q = (uint16_t)(half_n / d); + uint32_t rem = half_n % d; + // We computed 2^(16+shift)/(m+2^16) + // Need to double it, and then add 1 to the quotient if doubling the + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits + uint16_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + ////////// UINT32 -static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int branchfree) { +static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen( + uint32_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } @@ -565,15 +867,15 @@ static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int } else { uint8_t more; uint32_t rem, proposed_m; - proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem); + proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << floor_log_2_d, 0, d, &rem); LIBDIVIDE_ASSERT(rem > 0 && rem < d); const uint32_t e = d - rem; // This power works if e < 2**floor_log_2_d. - if (!branchfree && (e < (1U << floor_log_2_d))) { + if (!branchfree && (e < ((uint32_t)1 << floor_log_2_d))) { // This power works - more = floor_log_2_d; + more = (uint8_t)floor_log_2_d; } else { // We have to use the general 33-bit algorithm. We need to compute // (2**power) / d.
However, we already have (2**(power-1))/d and @@ -583,7 +885,7 @@ static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int proposed_m += proposed_m; const uint32_t twice_rem = rem + rem; if (twice_rem >= d || twice_rem < rem) proposed_m += 1; - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } result.magic = 1 + proposed_m; result.more = more; @@ -639,14 +941,14 @@ uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; if (!denom->magic) { - return 1U << shift; + return (uint32_t)1 << shift; } else if (!(more & LIBDIVIDE_ADD_MARKER)) { // We compute q = n/d = n*m / 2^(32 + shift) // Therefore we have d = 2^(32 + shift) / m // We need to ceil it. // We know d is not a power of 2, so m is not a power of 2, // so we can just add 1 to the floor - uint32_t hi_dividend = 1U << shift; + uint32_t hi_dividend = (uint32_t)1 << shift; uint32_t rem_ignored; return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored); } else { @@ -655,8 +957,8 @@ uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { // Also note that shift may be as high as 31, so shift + 1 will // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and // then double the quotient and remainder. - uint64_t half_n = 1ULL << (32 + shift); - uint64_t d = (1ULL << 32) | denom->magic; + uint64_t half_n = (uint64_t)1 << (32 + shift); + uint64_t d = ((uint64_t)1 << 32) | denom->magic; // Note that the quotient is guaranteed <= 32 bits, but the remainder // may need 33! uint32_t half_q = (uint32_t)(half_n / d); @@ -677,15 +979,15 @@ uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; if (!denom->magic) { - return 1U << (shift + 1); + return (uint32_t)1 << (shift + 1); } else { // Here we wish to compute d = 2^(32+shift+1)/(m+2^32). // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now // Also note that shift may be as high as 31, so shift + 1 will // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and // then double the quotient and remainder. - uint64_t half_n = 1ULL << (32 + shift); - uint64_t d = (1ULL << 32) | denom->magic; + uint64_t half_n = (uint64_t)1 << (32 + shift); + uint64_t d = ((uint64_t)1 << 32) | denom->magic; // Note that the quotient is guaranteed <= 32 bits, but the remainder // may need 33! uint32_t half_q = (uint32_t)(half_n / d); @@ -703,7 +1005,8 @@ uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_ /////////// UINT64 -static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int branchfree) { +static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen( + uint64_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } @@ -723,15 +1026,15 @@ static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int uint64_t proposed_m, rem; uint8_t more; // (1 << (64 + floor_log_2_d)) / d - proposed_m = libdivide_128_div_64_to_64(1ULL << floor_log_2_d, 0, d, &rem); + proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << floor_log_2_d, 0, d, &rem); LIBDIVIDE_ASSERT(rem > 0 && rem < d); const uint64_t e = d - rem; // This power works if e < 2**floor_log_2_d. 
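
As an aside on the recover functions above: they are exact inverses of the generators, reconstructing the original divisor from the stored {magic, more} pair, so a cheap smoke test is to round-trip a range of divisors. A minimal sketch using only the public u32 API (illustrative; branchfree generators reject d == 1, hence the guard):

    #include <assert.h>
    #include <stdint.h>
    #include "libdivide.h"

    // recover() reconstructs the divisor that gen() encoded.
    static void check_recover_roundtrip(void) {
        for (uint32_t d = 1; d < 1000; ++d) {
            struct libdivide_u32_t div = libdivide_u32_gen(d);
            assert(libdivide_u32_recover(&div) == d);
            if (d != 1) {  // branchfree dividers must not be 1
                struct libdivide_u32_branchfree_t bf = libdivide_u32_branchfree_gen(d);
                assert(libdivide_u32_branchfree_recover(&bf) == d);
            }
        }
    }
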
- if (!branchfree && e < (1ULL << floor_log_2_d)) { + if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) { // This power works - more = floor_log_2_d; + more = (uint8_t)floor_log_2_d; } else { // We have to use the general 65-bit algorithm. We need to compute // (2**power) / d. However, we already have (2**(power-1))/d and @@ -741,7 +1044,7 @@ static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int proposed_m += proposed_m; const uint64_t twice_rem = rem + rem; if (twice_rem >= d || twice_rem < rem) proposed_m += 1; - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } result.magic = 1 + proposed_m; result.more = more; @@ -798,14 +1101,14 @@ uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (!denom->magic) { - return 1ULL << shift; + return (uint64_t)1 << shift; } else if (!(more & LIBDIVIDE_ADD_MARKER)) { // We compute q = n/d = n*m / 2^(64 + shift) // Therefore we have d = 2^(64 + shift) / m // We need to ceil it. // We know d is not a power of 2, so m is not a power of 2, // so we can just add 1 to the floor - uint64_t hi_dividend = 1ULL << shift; + uint64_t hi_dividend = (uint64_t)1 << shift; uint64_t rem_ignored; return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored); } else { @@ -817,7 +1120,7 @@ uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { // Full n is a (potentially) 129 bit value // half_n is a 128 bit value // Compute the hi half of half_n. Low half is 0. - uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0; + uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; // d is a 65 bit value. The high bit is always set to 1. const uint64_t d_hi = 1, d_lo = denom->magic; // Note that the quotient is guaranteed <= 64 bits, @@ -842,7 +1145,7 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (!denom->magic) { - return 1ULL << (shift + 1); + return (uint64_t)1 << (shift + 1); } else { // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). // Notice (m + 2^64) is a 65 bit number. This gets hairy. See @@ -852,7 +1155,7 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ // Full n is a (potentially) 129 bit value // half_n is a 128 bit value // Compute the hi half of half_n. Low half is 0. - uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0; + uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; // d is a 65 bit value. The high bit is always set to 1. const uint64_t d_hi = 1, d_lo = denom->magic; // Note that the quotient is guaranteed <= 64 bits, @@ -872,9 +1175,185 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ } } +/////////// SINT16 + +static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen( + int16_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_s16_t result; + + // If d is a power of 2, or negative a power of 2, we have to use a shift. + // This is especially important because the magic algorithm fails for -1. + // To check if d is a power of 2 or its inverse, it suffices to check + // whether its absolute value has exactly one bit set. This works even for + // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set + // and is a power of 2. + uint16_t ud = (uint16_t)d; + uint16_t absD = (d < 0) ? 
-ud : ud; + uint16_t floor_log_2_d = 15 - libdivide_count_leading_zeros16(absD); + // check if exactly one bit is set, + // don't care if absD is 0 since that's divide by zero + if ((absD & (absD - 1)) == 0) { + // Branchfree and normal paths are exactly the same + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); + } else { + LIBDIVIDE_ASSERT(floor_log_2_d >= 1); + + uint8_t more; + // the dividend here is 2**(floor_log_2_d + 15), so the low 16 bit word + // is 0 and the high word is floor_log_2_d - 1 + uint16_t rem, proposed_m; + proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << (floor_log_2_d - 1), 0, absD, &rem); + const uint16_t e = absD - rem; + + // We are going to start with a power of floor_log_2_d - 1. + // This works if e < 2**floor_log_2_d. + if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) { + // This power works + more = (uint8_t)(floor_log_2_d - 1); + } else { + // We need to go one higher. This should not make proposed_m + // overflow, but it will make it negative when interpreted as an + // int16_t. + proposed_m += proposed_m; + const uint16_t twice_rem = rem + rem; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); + } + + proposed_m += 1; + int16_t magic = (int16_t)proposed_m; + + // Mark if we are negative. Note we only negate the magic number in the + // branchfull case. + if (d < 0) { + more |= LIBDIVIDE_NEGATIVE_DIVISOR; + if (!branchfree) { + magic = -magic; + } + } + + result.more = more; + result.magic = magic; + } + return result; +} + +struct libdivide_s16_t libdivide_s16_gen(int16_t d) { + return libdivide_internal_s16_gen(d, 0); +} + +struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) { + struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1); + struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more}; + return result; +} + +// The original libdivide_s16_do takes a const pointer. However, this cannot be used +// with a compile time constant libdivide_s16_t: it will generate a warning about +// taking the address of a temporary. Hence this overload. +int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) { + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!magic) { + uint16_t sign = (int8_t)more >> 7; + uint16_t mask = ((uint16_t)1 << shift) - 1; + uint16_t uq = numer + ((numer >> 15) & mask); + int16_t q = (int16_t)uq; + q >>= shift; + q = (q ^ sign) - sign; + return q; + } else { + uint16_t uq = (uint16_t)libdivide_mullhi_s16(magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift and then sign extend + int16_t sign = (int8_t)more >> 7; + // q += (more < 0 ?
-numer : numer) + // cast required to avoid UB + uq += ((uint16_t)numer ^ sign) - sign; + } + int16_t q = (int16_t)uq; + q >>= shift; + q += (q < 0); + return q; + } +} + +int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) { + return libdivide_s16_do_raw(numer, denom->magic, denom->more); +} + +int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + // must be arithmetic shift and then sign extend + int16_t sign = (int8_t)more >> 7; + int16_t magic = denom->magic; + int16_t q = libdivide_mullhi_s16(magic, numer); + q += numer; + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is a power of + // 2, or (2**shift) if it is not a power of 2 + uint16_t is_power_of_2 = (magic == 0); + uint16_t q_sign = (uint16_t)(q >> 15); + q += q_sign & (((uint16_t)1 << shift) - is_power_of_2); + + // Now arithmetic right shift + q >>= shift; + // Negate if needed + q = (q ^ sign) - sign; + + return q; +} + +int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + if (!denom->magic) { + uint16_t absD = (uint16_t)1 << shift; + if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { + absD = -absD; + } + return (int16_t)absD; + } else { + // Unsigned math is much easier + // We negate the magic number only in the branchfull case, and we don't + // know which case we're in. However we have enough information to + // determine the correct sign of the magic number. The divisor was + // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set, + // the magic number's sign is opposite that of the divisor. + // We want to compute the positive magic number. + int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; + + // Handle the power of 2 case (including branchfree) + if (denom->magic == 0) { + int16_t result = (uint16_t)1 << shift; + return negative_divisor ? -result : result; + } + + uint16_t d = (uint16_t)(magic_was_negated ? -denom->magic : denom->magic); + uint32_t n = (uint32_t)1 << (16 + shift); // this shift cannot exceed 30 + uint16_t q = (uint16_t)(n / d); + int16_t result = (int16_t)q; + result += 1; + return negative_divisor ? -result : result; + } +} + +int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) { + return libdivide_s16_recover((const struct libdivide_s16_t *)denom); +} + /////////// SINT32 -static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int branchfree) { +static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen( + int32_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } @@ -895,7 +1374,7 @@ static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int b if ((absD & (absD - 1)) == 0) { // Branchfree and normal paths are exactly the same result.magic = 0; - result.more = floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? 
LIBDIVIDE_NEGATIVE_DIVISOR : 0)); } else { LIBDIVIDE_ASSERT(floor_log_2_d >= 1); @@ -903,14 +1382,14 @@ static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int b // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word // is 0 and the high word is floor_log_2_d - 1 uint32_t rem, proposed_m; - proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem); + proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << (floor_log_2_d - 1), 0, absD, &rem); const uint32_t e = absD - rem; // We are going to start with a power of floor_log_2_d - 1. // This works if works if e < 2**floor_log_2_d. - if (!branchfree && e < (1U << floor_log_2_d)) { + if (!branchfree && e < ((uint32_t)1 << floor_log_2_d)) { // This power works - more = floor_log_2_d - 1; + more = (uint8_t)(floor_log_2_d - 1); } else { // We need to go one higher. This should not make proposed_m // overflow, but it will make it negative when interpreted as an @@ -918,7 +1397,7 @@ static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int b proposed_m += proposed_m; const uint32_t twice_rem = rem + rem; if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } proposed_m += 1; @@ -955,7 +1434,7 @@ int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { if (!denom->magic) { uint32_t sign = (int8_t)more >> 7; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; uint32_t uq = numer + ((numer >> 31) & mask); int32_t q = (int32_t)uq; q >>= shift; @@ -991,7 +1470,7 @@ int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_br // 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); uint32_t q_sign = (uint32_t)(q >> 31); - q += q_sign & ((1U << shift) - is_power_of_2); + q += q_sign & (((uint32_t)1 << shift) - is_power_of_2); // Now arithmetic right shift q >>= shift; @@ -1005,7 +1484,7 @@ int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; if (!denom->magic) { - uint32_t absD = 1U << shift; + uint32_t absD = (uint32_t)1 << shift; if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { absD = -absD; } @@ -1023,12 +1502,12 @@ int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { // Handle the power of 2 case (including branchfree) if (denom->magic == 0) { - int32_t result = 1U << shift; + int32_t result = (uint32_t)1 << shift; return negative_divisor ? -result : result; } uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic); - uint64_t n = 1ULL << (32 + shift); // this shift cannot exceed 30 + uint64_t n = (uint64_t)1 << (32 + shift); // this shift cannot exceed 30 uint32_t q = (uint32_t)(n / d); int32_t result = (int32_t)q; result += 1; @@ -1042,7 +1521,8 @@ int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t ///////////// SINT64 -static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int branchfree) { +static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen( + int64_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } @@ -1063,20 +1543,20 @@ static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int b if ((absD & (absD - 1)) == 0) { // Branchfree and non-branchfree cases are the same result.magic = 0; - result.more = floor_log_2_d | (d < 0 ? 
LIBDIVIDE_NEGATIVE_DIVISOR : 0); + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); } else { // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word // is 0 and the high word is floor_log_2_d - 1 uint8_t more; uint64_t rem, proposed_m; - proposed_m = libdivide_128_div_64_to_64(1ULL << (floor_log_2_d - 1), 0, absD, &rem); + proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << (floor_log_2_d - 1), 0, absD, &rem); const uint64_t e = absD - rem; // We are going to start with a power of floor_log_2_d - 1. // This works if works if e < 2**floor_log_2_d. - if (!branchfree && e < (1ULL << floor_log_2_d)) { + if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) { // This power works - more = floor_log_2_d - 1; + more = (uint8_t)(floor_log_2_d - 1); } else { // We need to go one higher. This should not make proposed_m // overflow, but it will make it negative when interpreted as an @@ -1088,7 +1568,7 @@ static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int b // also set ADD_MARKER this is an annoying optimization that // enables algorithm #4 to avoid the mask. However we always set it // in the branchfree case - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } proposed_m += 1; int64_t magic = (int64_t)proposed_m; @@ -1122,7 +1602,7 @@ int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (!denom->magic) { // shift path - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = ((uint64_t)1 << shift) - 1; uint64_t uq = numer + ((numer >> 63) & mask); int64_t q = (int64_t)uq; q >>= shift; @@ -1160,7 +1640,7 @@ int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_br // 2, or (2**shift) if it is not a power of 2. uint64_t is_power_of_2 = (magic == 0); uint64_t q_sign = (uint64_t)(q >> 63); - q += q_sign & ((1ULL << shift) - is_power_of_2); + q += q_sign & (((uint64_t)1 << shift) - is_power_of_2); // Arithmetic right shift q >>= shift; @@ -1174,7 +1654,7 @@ int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (denom->magic == 0) { // shift path - uint64_t absD = 1ULL << shift; + uint64_t absD = (uint64_t)1 << shift; if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { absD = -absD; } @@ -1185,7 +1665,7 @@ int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic); - uint64_t n_hi = 1ULL << shift, n_lo = 0; + uint64_t n_hi = (uint64_t)1 << shift, n_lo = 0; uint64_t rem_ignored; uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored); int64_t result = (int64_t)(q + 1); @@ -1200,67 +1680,87 @@ int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t return libdivide_s64_recover((const struct libdivide_s64_t *)denom); } +// Simplest possible vector type division: treat the vector type as an array +// of underlying native type. 
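
In other words, the new 16-bit vector paths do not get dedicated SIMD kernels; the macro below simply reinterprets the vector as an array of scalars and runs the scalar routine on each lane. Roughly what that amounts to for a NEON uint16x8_t (an illustrative hand expansion under the assumption of a NEON target; the function name is made up here):

    #include <arm_neon.h>
    #include <stddef.h>
    #include "libdivide.h"

    // Hypothetical hand-expansion of the lane-by-lane fallback for uint16x8_t.
    static inline uint16x8_t u16_divide_lanewise(
        uint16x8_t numers, const struct libdivide_u16_t *denom) {
        uint16x8_t result;
        uint16_t *pSource = (uint16_t *)&numers;
        uint16_t *pTarget = (uint16_t *)&result;
        for (size_t lane = 0; lane < sizeof(result) / sizeof(uint16_t); ++lane) {
            pTarget[lane] = libdivide_u16_do(pSource[lane], denom);  // scalar path per lane
        }
        return result;
    }

This trades speed for simplicity; the 32- and 64-bit paths further down keep their hand-written NEON/AVX kernels.
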
+#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \ + const size_t count = sizeof(VecT) / sizeof(IntT); \ + VecT result; \ + IntT *pSource = (IntT *)&numers; \ + IntT *pTarget = (IntT *)&result; \ + for (size_t loop=0; loop(amt); +static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) { + int32_t wamt = (int32_t)(amt); return vshlq_u32(v, vdupq_n_s32(-wamt)); } -static inline uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) { - int64_t wamt = static_cast(amt); +static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) { + int64_t wamt = (int64_t)(amt); return vshlq_u64(v, vdupq_n_s64(-wamt)); } // Arithmetic right shift by runtime value. -static inline int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) { - int32_t wamt = static_cast(amt); +static LIBDIVIDE_INLINE int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) { + int32_t wamt = (int32_t)(amt); return vshlq_s32(v, vdupq_n_s32(-wamt)); } -static inline int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) { - int64_t wamt = static_cast(amt); +static LIBDIVIDE_INLINE int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) { + int64_t wamt = (int64_t)(amt); return vshlq_s64(v, vdupq_n_s64(-wamt)); } -static inline int64x2_t libdivide_s64_signbits(int64x2_t v) { return vshrq_n_s64(v, 63); } +static LIBDIVIDE_INLINE int64x2_t libdivide_s64_signbits(int64x2_t v) { return vshrq_n_s64(v, 63); } -static inline uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) { +static LIBDIVIDE_INLINE uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) { // Desire is [x0, x1, x2, x3] uint32x4_t w1 = vreinterpretq_u32_u64(vmull_n_u32(vget_low_u32(a), b)); // [_, x0, _, x1] uint32x4_t w2 = vreinterpretq_u32_u64(vmull_high_n_u32(a, b)); //[_, x2, _, x3] return vuzp2q_u32(w1, w2); // [x0, x1, x2, x3] } -static inline int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) { +static LIBDIVIDE_INLINE int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) { int32x4_t w1 = vreinterpretq_s32_s64(vmull_n_s32(vget_low_s32(a), b)); // [_, x0, _, x1] int32x4_t w2 = vreinterpretq_s32_s64(vmull_high_n_s32(a, b)); //[_, x2, _, x3] return vuzp2q_s32(w1, w2); // [x0, x1, x2, x3] } -static inline uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) { +static LIBDIVIDE_INLINE uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) { // full 128 bits product is: // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64) // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64. 
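
The same four-partial-product decomposition can be written as a plain scalar helper, essentially the portable strategy a 64-bit high multiply falls back to when no 64x64->128 primitive such as __umulh or a 128-bit integer type is available (a sketch for illustration; the helper name is not part of the header):

    #include <stdint.h>

    // High 64 bits of a 64x64 multiply, built from 32x32->64 partial products.
    static inline uint64_t mullhi_u64_portable(uint64_t x, uint64_t y) {
        uint64_t x0 = (uint32_t)x, x1 = x >> 32;
        uint64_t y0 = (uint32_t)y, y1 = y >> 32;
        uint64_t lo = x0 * y0;
        uint64_t mid1 = x1 * y0 + (lo >> 32);      // cannot overflow 64 bits
        uint64_t mid2 = x0 * y1 + (uint32_t)mid1;  // low half of mid1 folded in
        return x1 * y1 + (mid1 >> 32) + (mid2 >> 32);
    }
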
@@ -1291,9 +1791,9 @@ static inline uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) return result; } -static inline int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) { +static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) { int64x2_t p = vreinterpretq_s64_u64( - libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), static_cast(sy))); + libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy))); int64x2_t y = vdupq_n_s64(sy); int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y); int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x); @@ -1302,6 +1802,16 @@ static inline int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) { return p; } +////////// UINT16 + +uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16) +} + +uint16x8_t libdivide_u16_branchfree_do_vec128(uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree) +} + ////////// UINT32 uint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, const struct libdivide_u32_t *denom) { @@ -1358,13 +1868,23 @@ uint64x2_t libdivide_u64_branchfree_do_vec128( return libdivide_u64_neon_srl(t, denom->more); } +////////// SINT16 + +int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16) +} + +int16x8_t libdivide_s16_branchfree_do_vec128(int16x8_t numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree) +} + ////////// SINT32 int32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; int32x4_t roundToZeroTweak = vdupq_n_s32((int)mask); // q = numer + ((numer >> 31) & roundToZeroTweak); int32x4_t q = vaddq_s32(numers, vandq_s32(vshrq_n_s32(numers, 31), roundToZeroTweak)); @@ -1404,7 +1924,7 @@ int32x4_t libdivide_s32_branchfree_do_vec128( // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); int32x4_t q_sign = vshrq_n_s32(q, 31); // q_sign = q >> 31 - int32x4_t mask = vdupq_n_s32((1U << shift) - is_power_of_2); + int32x4_t mask = vdupq_n_s32(((uint32_t)1 << shift) - is_power_of_2); q = vaddq_s32(q, vandq_s32(q_sign, mask)); // q = q + (q_sign & mask) q = libdivide_s32_neon_sra(q, shift); // q >>= shift q = vsubq_s32(veorq_s32(q, sign), sign); // q = (q ^ sign) - sign @@ -1418,7 +1938,7 @@ int64x2_t libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t int64_t magic = denom->magic; if (magic == 0) { // shift path uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = ((uint64_t)1 << shift) - 1; int64x2_t roundToZeroTweak = vdupq_n_s64(mask); // TODO: no need to sign extend // q = numer + ((numer >> 63) & roundToZeroTweak); int64x2_t q = @@ -1461,7 +1981,7 @@ int64x2_t libdivide_s64_branchfree_do_vec128( // a power of 2, or (2**shift) if it is not a power of 2. 
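
Usage of these vector entry points mirrors the scalar API: generate the divider once, then feed whole registers through it. A minimal sketch (assumes a NEON build where LIBDIVIDE_NEON is defined; the loop ignores any tail that is not a multiple of 4):

    #include <arm_neon.h>
    #include <stddef.h>
    #include "libdivide.h"

    // Divide four uint32 lanes at a time with one precomputed divider.
    static void divide_all_neon(uint32_t *buf, size_t n, uint32_t d) {
        struct libdivide_u32_t div = libdivide_u32_gen(d);
        for (size_t i = 0; i + 4 <= n; i += 4) {
            uint32x4_t v = vld1q_u32(&buf[i]);
            v = libdivide_u32_do_vec128(v, &div);  // four quotients per iteration
            vst1q_u32(&buf[i], v);
        }
    }
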
uint32_t is_power_of_2 = (magic == 0); int64x2_t q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 - int64x2_t mask = vdupq_n_s64((1ULL << shift) - is_power_of_2); + int64x2_t mask = vdupq_n_s64(((uint64_t)1 << shift) - is_power_of_2); q = vaddq_s64(q, vandq_s64(q_sign, mask)); // q = q + (q_sign & mask) q = libdivide_s64_neon_sra(q, shift); // q >>= shift q = vsubq_s64(veorq_s64(q, sign), sign); // q = (q ^ sign) - sign @@ -1472,33 +1992,45 @@ int64x2_t libdivide_s64_branchfree_do_vec128( #if defined(LIBDIVIDE_AVX512) -static inline __m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom); -static inline __m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom); -static inline __m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom); -static inline __m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u16_do_vec512( + __m512i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s16_do_vec512( + __m512i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u32_do_vec512( + __m512i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s32_do_vec512( + __m512i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u64_do_vec512( + __m512i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s64_do_vec512( + __m512i numers, const struct libdivide_s64_t *denom); -static inline __m512i libdivide_u32_branchfree_do_vec512( +static LIBDIVIDE_INLINE __m512i libdivide_u16_branchfree_do_vec512( + __m512i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s16_branchfree_do_vec512( + __m512i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u32_branchfree_do_vec512( __m512i numers, const struct libdivide_u32_branchfree_t *denom); -static inline __m512i libdivide_s32_branchfree_do_vec512( +static LIBDIVIDE_INLINE __m512i libdivide_s32_branchfree_do_vec512( __m512i numers, const struct libdivide_s32_branchfree_t *denom); -static inline __m512i libdivide_u64_branchfree_do_vec512( +static LIBDIVIDE_INLINE __m512i libdivide_u64_branchfree_do_vec512( __m512i numers, const struct libdivide_u64_branchfree_t *denom); -static inline __m512i libdivide_s64_branchfree_do_vec512( +static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512( __m512i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions -static inline __m512i libdivide_s64_signbits(__m512i v) { +static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) { ; return _mm512_srai_epi64(v, 63); } -static inline __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) { +static LIBDIVIDE_INLINE __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) { return _mm512_srai_epi64(v, amt); } // Here, b is assumed to contain one 32-bit value repeated. 
-static inline __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) { +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) { __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32); __m512i a1X3X = _mm512_srli_epi64(a, 32); __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); @@ -1507,7 +2039,7 @@ static inline __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) { } // b is one 32-bit value repeated. -static inline __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) { +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) { __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32); __m512i a1X3X = _mm512_srli_epi64(a, 32); __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); @@ -1516,7 +2048,7 @@ static inline __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) { } // Here, y is assumed to contain one 64-bit value repeated. -static inline __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) { +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) { // see m128i variant for comments. __m512i x0y0 = _mm512_mul_epu32(x, y); __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32); @@ -1539,15 +2071,25 @@ static inline __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) { } // y is one 64-bit value repeated. -static inline __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) { +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) { __m512i p = libdivide_mullhi_u64_vec512(x, y); - __m512i t1 = _mm512_and_si512(libdivide_s64_signbits(x), y); - __m512i t2 = _mm512_and_si512(libdivide_s64_signbits(y), x); + __m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y); + __m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x); p = _mm512_sub_epi64(p, t1); p = _mm512_sub_epi64(p, t2); return p; } +////////// UINT16 + +__m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16) +} + +__m512i libdivide_u16_branchfree_do_vec512(__m512i numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree) +} + ////////// UINT32 __m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) { @@ -1602,13 +2144,23 @@ __m512i libdivide_u64_branchfree_do_vec512( return _mm512_srli_epi64(t, denom->more); } +////////// SINT16 + +__m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16) +} + +__m512i libdivide_s16_branchfree_do_vec512(__m512i numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree) +} + ////////// SINT32 __m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; __m512i roundToZeroTweak = _mm512_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); __m512i q = _mm512_add_epi32( @@ -1648,7 +2200,7 @@ __m512i libdivide_s32_branchfree_do_vec512( // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31 - __m512i mask = _mm512_set1_epi32((1U << 
shift) - is_power_of_2); + __m512i mask = _mm512_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) q = _mm512_srai_epi32(q, shift); // q >>= shift q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign @@ -1662,11 +2214,11 @@ __m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *de int64_t magic = denom->magic; if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = ((uint64_t)1 << shift) - 1; __m512i roundToZeroTweak = _mm512_set1_epi64(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); __m512i q = _mm512_add_epi64( - numers, _mm512_and_si512(libdivide_s64_signbits(numers), roundToZeroTweak)); + numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), roundToZeroTweak)); q = libdivide_s64_shift_right_vec512(q, shift); __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; @@ -1703,8 +2255,8 @@ __m512i libdivide_s64_branchfree_do_vec512( // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. uint32_t is_power_of_2 = (magic == 0); - __m512i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 - __m512i mask = _mm512_set1_epi64((1ULL << shift) - is_power_of_2); + __m512i q_sign = libdivide_s64_signbits_vec512(q); // q_sign = q >> 63 + __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2); q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) q = libdivide_s64_shift_right_vec512(q, shift); // q >>= shift q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign @@ -1715,40 +2267,52 @@ __m512i libdivide_s64_branchfree_do_vec512( #if defined(LIBDIVIDE_AVX2) -static inline __m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom); -static inline __m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom); -static inline __m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom); -static inline __m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u16_do_vec256( + __m256i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s16_do_vec256( + __m256i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u32_do_vec256( + __m256i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s32_do_vec256( + __m256i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u64_do_vec256( + __m256i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s64_do_vec256( + __m256i numers, const struct libdivide_s64_t *denom); -static inline __m256i libdivide_u32_branchfree_do_vec256( +static LIBDIVIDE_INLINE __m256i libdivide_u16_branchfree_do_vec256( + __m256i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s16_branchfree_do_vec256( + __m256i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u32_branchfree_do_vec256( __m256i numers, const struct libdivide_u32_branchfree_t *denom); -static inline __m256i libdivide_s32_branchfree_do_vec256( +static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256( 
+static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256( __m256i numers, const struct libdivide_s32_branchfree_t *denom); -static inline __m256i libdivide_u64_branchfree_do_vec256( +static LIBDIVIDE_INLINE __m256i libdivide_u64_branchfree_do_vec256( __m256i numers, const struct libdivide_u64_branchfree_t *denom); -static inline __m256i libdivide_s64_branchfree_do_vec256( +static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256( __m256i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions // Implementation of _mm256_srai_epi64(v, 63) (from AVX512). -static inline __m256i libdivide_s64_signbits(__m256i v) { +static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) { __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31); return signBits; } // Implementation of _mm256_srai_epi64 (from AVX512). -static inline __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) { +static LIBDIVIDE_INLINE __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) { const int b = 64 - amt; - __m256i m = _mm256_set1_epi64x(1ULL << (b - 1)); + __m256i m = _mm256_set1_epi64x((uint64_t)1 << (b - 1)); __m256i x = _mm256_srli_epi64(v, amt); __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m); return result; } // Here, b is assumed to contain one 32-bit value repeated. -static inline __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) { +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) { __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32); __m256i a1X3X = _mm256_srli_epi64(a, 32); __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); @@ -1757,7 +2321,7 @@ static inline __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) { } // b is one 32-bit value repeated. -static inline __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) { +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) { __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32); __m256i a1X3X = _mm256_srli_epi64(a, 32); __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); @@ -1766,7 +2330,7 @@ static inline __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) { } // Here, y is assumed to contain one 64-bit value repeated. -static inline __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) { +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) { // see m128i variant for comments. __m256i x0y0 = _mm256_mul_epu32(x, y); __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32); @@ -1789,15 +2353,25 @@ static inline __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) { } // y is one 64-bit value repeated. 
-static inline __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) { +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) { __m256i p = libdivide_mullhi_u64_vec256(x, y); - __m256i t1 = _mm256_and_si256(libdivide_s64_signbits(x), y); - __m256i t2 = _mm256_and_si256(libdivide_s64_signbits(y), x); + __m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y); + __m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x); p = _mm256_sub_epi64(p, t1); p = _mm256_sub_epi64(p, t2); return p; } +////////// UINT16 + +__m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16) +} + +__m256i libdivide_u16_branchfree_do_vec256(__m256i numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16_branchfree) +} + ////////// UINT32 __m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) { @@ -1852,13 +2426,23 @@ __m256i libdivide_u64_branchfree_do_vec256( return _mm256_srli_epi64(t, denom->more); } +////////// SINT16 + +__m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16) +} + +__m256i libdivide_s16_branchfree_do_vec256(__m256i numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16_branchfree) +} + ////////// SINT32 __m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; __m256i roundToZeroTweak = _mm256_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); __m256i q = _mm256_add_epi32( @@ -1898,7 +2482,7 @@ __m256i libdivide_s32_branchfree_do_vec256( // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31 - __m256i mask = _mm256_set1_epi32((1U << shift) - is_power_of_2); + __m256i mask = _mm256_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) q = _mm256_srai_epi32(q, shift); // q >>= shift q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign @@ -1912,11 +2496,11 @@ __m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *de int64_t magic = denom->magic; if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = ((uint64_t)1 << shift) - 1; __m256i roundToZeroTweak = _mm256_set1_epi64x(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); __m256i q = _mm256_add_epi64( - numers, _mm256_and_si256(libdivide_s64_signbits(numers), roundToZeroTweak)); + numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak)); q = libdivide_s64_shift_right_vec256(q, shift); __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; @@ -1953,8 +2537,8 @@ __m256i libdivide_s64_branchfree_do_vec256( // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. 
uint32_t is_power_of_2 = (magic == 0); - __m256i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 - __m256i mask = _mm256_set1_epi64x((1ULL << shift) - is_power_of_2); + __m256i q_sign = libdivide_s64_signbits_vec256(q); // q_sign = q >> 63 + __m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) q = libdivide_s64_shift_right_vec256(q, shift); // q >>= shift q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign @@ -1965,40 +2549,52 @@ __m256i libdivide_s64_branchfree_do_vec256( #if defined(LIBDIVIDE_SSE2) -static inline __m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom); -static inline __m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom); -static inline __m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom); -static inline __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u16_do_vec128( + __m128i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s16_do_vec128( + __m128i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u32_do_vec128( + __m128i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s32_do_vec128( + __m128i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u64_do_vec128( + __m128i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s64_do_vec128( + __m128i numers, const struct libdivide_s64_t *denom); -static inline __m128i libdivide_u32_branchfree_do_vec128( +static LIBDIVIDE_INLINE __m128i libdivide_u16_branchfree_do_vec128( + __m128i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s16_branchfree_do_vec128( + __m128i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u32_branchfree_do_vec128( __m128i numers, const struct libdivide_u32_branchfree_t *denom); -static inline __m128i libdivide_s32_branchfree_do_vec128( +static LIBDIVIDE_INLINE __m128i libdivide_s32_branchfree_do_vec128( __m128i numers, const struct libdivide_s32_branchfree_t *denom); -static inline __m128i libdivide_u64_branchfree_do_vec128( +static LIBDIVIDE_INLINE __m128i libdivide_u64_branchfree_do_vec128( __m128i numers, const struct libdivide_u64_branchfree_t *denom); -static inline __m128i libdivide_s64_branchfree_do_vec128( +static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128( __m128i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions // Implementation of _mm_srai_epi64(v, 63) (from AVX512). -static inline __m128i libdivide_s64_signbits(__m128i v) { +static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) { __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); return signBits; } // Implementation of _mm_srai_epi64 (from AVX512). 
-static inline __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) { +static LIBDIVIDE_INLINE __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) { const int b = 64 - amt; - __m128i m = _mm_set1_epi64x(1ULL << (b - 1)); + __m128i m = _mm_set1_epi64x((uint64_t)1 << (b - 1)); __m128i x = _mm_srli_epi64(v, amt); __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); return result; } // Here, b is assumed to contain one 32-bit value repeated. -static inline __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) { +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) { __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); __m128i a1X3X = _mm_srli_epi64(a, 32); __m128i mask = _mm_set_epi32(-1, 0, -1, 0); @@ -2009,7 +2605,7 @@ static inline __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) { // SSE2 does not have a signed multiplication instruction, but we can convert // unsigned to signed pretty efficiently. Again, b is just a 32 bit value // repeated four times. -static inline __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) { +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) { __m128i p = libdivide_mullhi_u32_vec128(a, b); // t1 = (a >> 31) & y, arithmetic shift __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); @@ -2020,7 +2616,7 @@ static inline __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) { } // Here, y is assumed to contain one 64-bit value repeated. -static inline __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) { +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) { // full 128 bits product is: // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64) // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64. @@ -2053,15 +2649,25 @@ static inline __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) { } // y is one 64-bit value repeated. 
-static inline __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) { +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) { __m128i p = libdivide_mullhi_u64_vec128(x, y); - __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y); - __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x); + __m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y); + __m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x); p = _mm_sub_epi64(p, t1); p = _mm_sub_epi64(p, t2); return p; } +////////// UINT16 + +__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16) +} + +__m128i libdivide_u16_branchfree_do_vec128(__m128i numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16_branchfree) +} + ////////// UINT32 __m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom) { @@ -2116,13 +2722,23 @@ __m128i libdivide_u64_branchfree_do_vec128( return _mm_srli_epi64(t, denom->more); } +////////// SINT16 + +__m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16) +} + +__m128i libdivide_s16_branchfree_do_vec128(__m128i numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16_branchfree) +} + ////////// SINT32 __m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; __m128i roundToZeroTweak = _mm_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); __m128i q = @@ -2162,7 +2778,7 @@ __m128i libdivide_s32_branchfree_do_vec128( // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 - __m128i mask = _mm_set1_epi32((1U << shift) - is_power_of_2); + __m128i mask = _mm_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) q = _mm_srai_epi32(q, shift); // q >>= shift q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign @@ -2176,11 +2792,11 @@ __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *de int64_t magic = denom->magic; if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = ((uint64_t)1 << shift) - 1; __m128i roundToZeroTweak = _mm_set1_epi64x(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); __m128i q = - _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); + _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak)); q = libdivide_s64_shift_right_vec128(q, shift); __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; @@ -2217,8 +2833,8 @@ __m128i libdivide_s64_branchfree_do_vec128( // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2.
uint32_t is_power_of_2 = (magic == 0); - __m128i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 - __m128i mask = _mm_set1_epi64x((1ULL << shift) - is_power_of_2); + __m128i q_sign = libdivide_s64_signbits_vec128(q); // q_sign = q >> 63 + __m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) q = libdivide_s64_shift_right_vec128(q, shift); // q >>= shift q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign @@ -2241,6 +2857,16 @@ enum Branching { template <typename T> struct NeonVecFor {}; +template <> +struct NeonVecFor<uint16_t> { + typedef uint16x8_t type; +}; + +template <> +struct NeonVecFor<int16_t> { + typedef int16x8_t type; +}; + template <> struct NeonVecFor<uint32_t> { typedef uint32x4_t type; @@ -2264,82 +2890,105 @@ struct NeonVecFor { // Versions of our algorithms for SIMD. #if defined(LIBDIVIDE_NEON) -#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \ - typename NeonVecFor<INT_TYPE>::type divide(typename NeonVecFor<INT_TYPE>::type n) const { \ - return libdivide_##ALGO##_do_vec128(n, &denom); \ +#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \ + LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \ + typename NeonVecFor<INT_TYPE>::type n) const { \ + return libdivide_##ALGO##_do_vec128(n, &denom); \ } #else #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) #endif #if defined(LIBDIVIDE_SSE2) -#define LIBDIVIDE_DIVIDE_SSE2(ALGO) \ - __m128i divide(__m128i n) const { return libdivide_##ALGO##_do_vec128(n, &denom); } +#define LIBDIVIDE_DIVIDE_SSE2(ALGO) \ + LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \ + return libdivide_##ALGO##_do_vec128(n, &denom); \ + } #else #define LIBDIVIDE_DIVIDE_SSE2(ALGO) #endif #if defined(LIBDIVIDE_AVX2) -#define LIBDIVIDE_DIVIDE_AVX2(ALGO) \ - __m256i divide(__m256i n) const { return libdivide_##ALGO##_do_vec256(n, &denom); } +#define LIBDIVIDE_DIVIDE_AVX2(ALGO) \ + LIBDIVIDE_INLINE __m256i divide(__m256i n) const { \ + return libdivide_##ALGO##_do_vec256(n, &denom); \ + } #else #define LIBDIVIDE_DIVIDE_AVX2(ALGO) #endif #if defined(LIBDIVIDE_AVX512) -#define LIBDIVIDE_DIVIDE_AVX512(ALGO) \ - __m512i divide(__m512i n) const { return libdivide_##ALGO##_do_vec512(n, &denom); } +#define LIBDIVIDE_DIVIDE_AVX512(ALGO) \ + LIBDIVIDE_INLINE __m512i divide(__m512i n) const { \ + return libdivide_##ALGO##_do_vec512(n, &denom); \ + } #else #define LIBDIVIDE_DIVIDE_AVX512(ALGO) #endif // The DISPATCHER_GEN() macro generates C++ methods (for the given integer // and algorithm types) that redirect to libdivide's C API. -#define DISPATCHER_GEN(T, ALGO) \ - libdivide_##ALGO##_t denom; \ - dispatcher() {} \ - dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \ - T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \ - T recover() const { return libdivide_##ALGO##_recover(&denom); } \ - LIBDIVIDE_DIVIDE_NEON(ALGO, T) \ - LIBDIVIDE_DIVIDE_SSE2(ALGO) \ - LIBDIVIDE_DIVIDE_AVX2(ALGO) \ +#define DISPATCHER_GEN(T, ALGO) \ + libdivide_##ALGO##_t denom; \ + LIBDIVIDE_INLINE dispatcher() {} \ + LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \ + LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \ + LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \ + LIBDIVIDE_DIVIDE_NEON(ALGO, T) \ + LIBDIVIDE_DIVIDE_SSE2(ALGO) \ + LIBDIVIDE_DIVIDE_AVX2(ALGO) \ LIBDIVIDE_DIVIDE_AVX512(ALGO) // The dispatcher selects a specific division algorithm for a given // type and ALGO using partial template specialization.
-template <bool IS_INTEGRAL, bool IS_SIGNED, int SIZEOF, Branching ALGO> +template <typename T, Branching ALGO> struct dispatcher {}; template <> -struct dispatcher<true, true, sizeof(int32_t), BRANCHFULL> { +struct dispatcher<int16_t, BRANCHFULL> { + DISPATCHER_GEN(int16_t, s16) +}; +template <> +struct dispatcher<int16_t, BRANCHFREE> { + DISPATCHER_GEN(int16_t, s16_branchfree) +}; +template <> +struct dispatcher<uint16_t, BRANCHFULL> { + DISPATCHER_GEN(uint16_t, u16) +}; +template <> +struct dispatcher<uint16_t, BRANCHFREE> { + DISPATCHER_GEN(uint16_t, u16_branchfree) +}; +template <> +struct dispatcher<int32_t, BRANCHFULL> { DISPATCHER_GEN(int32_t, s32) }; template <> -struct dispatcher<true, true, sizeof(int32_t), BRANCHFREE> { +struct dispatcher<int32_t, BRANCHFREE> { DISPATCHER_GEN(int32_t, s32_branchfree) }; template <> -struct dispatcher<true, false, sizeof(uint32_t), BRANCHFULL> { +struct dispatcher<uint32_t, BRANCHFULL> { DISPATCHER_GEN(uint32_t, u32) }; template <> -struct dispatcher<true, false, sizeof(uint32_t), BRANCHFREE> { +struct dispatcher<uint32_t, BRANCHFREE> { DISPATCHER_GEN(uint32_t, u32_branchfree) }; template <> -struct dispatcher<true, true, sizeof(int64_t), BRANCHFULL> { +struct dispatcher<int64_t, BRANCHFULL> { DISPATCHER_GEN(int64_t, s64) }; template <> -struct dispatcher<true, true, sizeof(int64_t), BRANCHFREE> { +struct dispatcher<int64_t, BRANCHFREE> { DISPATCHER_GEN(int64_t, s64_branchfree) }; template <> -struct dispatcher<true, false, sizeof(uint64_t), BRANCHFULL> { +struct dispatcher<uint64_t, BRANCHFULL> { DISPATCHER_GEN(uint64_t, u64) }; template <> -struct dispatcher<true, false, sizeof(uint64_t), BRANCHFREE> { +struct dispatcher<uint64_t, BRANCHFREE> { DISPATCHER_GEN(uint64_t, u64_branchfree) }; @@ -2348,6 +2997,9 @@ struct dispatcher { // based on the integer and algorithm template parameters. template <typename T, Branching ALGO = BRANCHFULL> class divider { + private: + typedef dispatcher<T, ALGO> dispatcher_t; + public: // We leave the default constructor empty so that creating // an array of dividers and then initializing them @@ -2355,10 +3007,10 @@ class divider { divider() {} // Constructor that takes the divisor as a parameter - divider(T d) : div(d) {} + LIBDIVIDE_INLINE divider(T d) : div(d) {} // Divides n by the divisor - T divide(T n) const { return div.divide(n); } + LIBDIVIDE_INLINE T divide(T n) const { return div.divide(n); } // Recovers the divisor, returns the value that was // used to initialize this divider object. @@ -2374,34 +3026,34 @@ class divider { // (e.g. s32, u32, s64, u64) and divides each of them by the divider, returning the packed // quotients. #if defined(LIBDIVIDE_SSE2) - __m128i divide(__m128i n) const { return div.divide(n); } + LIBDIVIDE_INLINE __m128i divide(__m128i n) const { return div.divide(n); } #endif #if defined(LIBDIVIDE_AVX2) - __m256i divide(__m256i n) const { return div.divide(n); } + LIBDIVIDE_INLINE __m256i divide(__m256i n) const { return div.divide(n); } #endif #if defined(LIBDIVIDE_AVX512) - __m512i divide(__m512i n) const { return div.divide(n); } + LIBDIVIDE_INLINE __m512i divide(__m512i n) const { return div.divide(n); } #endif #if defined(LIBDIVIDE_NEON) - typename NeonVecFor<T>::type divide(typename NeonVecFor<T>::type n) const { + LIBDIVIDE_INLINE typename NeonVecFor<T>::type divide(typename NeonVecFor<T>::type n) const { return div.divide(n); } #endif private: // Storage for the actual divisor - dispatcher<std::is_integral<T>::value, std::is_signed<T>::value, sizeof(T), ALGO> div; + dispatcher_t div; }; // Overload of operator / for scalar division template <typename T, Branching ALGO> -T operator/(T n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE T operator/(T n, const divider<T, ALGO> &div) { return div.divide(n); } // Overload of operator /= for scalar division template <typename T, Branching ALGO> -T &operator/=(T &n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE T &operator/=(T &n, const divider<T, ALGO> &div) { n = div.divide(n); return n; } @@ -2409,88 +3061,55 @@ T &operator/=(T &n, const divider<T, ALGO> &div) { // Overloads for vector types. 
#if defined(LIBDIVIDE_SSE2) template <typename T, Branching ALGO> -__m128i operator/(__m128i n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE __m128i operator/(__m128i n, const divider<T, ALGO> &div) { return div.divide(n); } template <typename T, Branching ALGO> -__m128i operator/=(__m128i &n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE __m128i operator/=(__m128i &n, const divider<T, ALGO> &div) { n = div.divide(n); return n; } #endif #if defined(LIBDIVIDE_AVX2) template <typename T, Branching ALGO> -__m256i operator/(__m256i n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE __m256i operator/(__m256i n, const divider<T, ALGO> &div) { return div.divide(n); } template <typename T, Branching ALGO> -__m256i operator/=(__m256i &n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE __m256i operator/=(__m256i &n, const divider<T, ALGO> &div) { n = div.divide(n); return n; } #endif #if defined(LIBDIVIDE_AVX512) template <typename T, Branching ALGO> -__m512i operator/(__m512i n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE __m512i operator/(__m512i n, const divider<T, ALGO> &div) { return div.divide(n); } template <typename T, Branching ALGO> -__m512i operator/=(__m512i &n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) { n = div.divide(n); return n; } #endif #if defined(LIBDIVIDE_NEON) -template <Branching ALGO> -uint32x4_t operator/(uint32x4_t n, const divider<uint32_t, ALGO> &div) { +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) { return div.divide(n); } -template <Branching ALGO> -int32x4_t operator/(int32x4_t n, const divider<int32_t, ALGO> &div) { - return div.divide(n); -} - -template <Branching ALGO> -uint64x2_t operator/(uint64x2_t n, const divider<uint64_t, ALGO> &div) { - return div.divide(n); -} - -template <Branching ALGO> -int64x2_t operator/(int64x2_t n, const divider<int64_t, ALGO> &div) { - return div.divide(n); -} - -template <Branching ALGO> -uint32x4_t operator/=(uint32x4_t &n, const divider<uint32_t, ALGO> &div) { - n = div.divide(n); - return n; -} - -template <Branching ALGO> -int32x4_t operator/=(int32x4_t &n, const divider<int32_t, ALGO> &div) { - n = div.divide(n); - return n; -} - -template <Branching ALGO> -uint64x2_t operator/=(uint64x2_t &n, const divider<uint64_t, ALGO> &div) { - n = div.divide(n); - return n; -} - -template <Branching ALGO> -int64x2_t operator/=(int64x2_t &n, const divider<int64_t, ALGO> &div) { +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) { n = div.divide(n); return n; } #endif -#if __cplusplus >= 201103L +#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900) // libdivide::branchfree_divider<T> template <typename T> using branchfree_divider = divider<T, BRANCHFREE>; @@ -2500,4 +3119,8 @@ using branchfree_divider = divider<T, BRANCHFREE>; #endif // __cplusplus +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + #endif // LIBDIVIDE_H
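
Below is a minimal usage sketch (not part of the patch) that exercises the C++ API the hunks above touch: divider<T>, the new 16-bit branchfree_divider<T>, the scalar operator/ overload, and the optional SSE2 vector overload. The divisor values, the main() scaffolding, and the choice to define LIBDIVIDE_SSE2 before including the header are illustrative assumptions, not something this diff prescribes.

// example_libdivide_usage.cpp -- illustrative only, not shipped with the patch.
#include <cstdint>
#include <cstdio>
// Assumption: defining LIBDIVIDE_SSE2 before the include enables the __m128i overloads.
// #define LIBDIVIDE_SSE2
#include "libdivide.h"

int main() {
    // Precompute the magic number and shift once, then reuse them for many divisions.
    libdivide::divider<uint32_t> fast_d(7);
    // 16-bit branchfree variant added by this change (a divisor of 1 is not supported branchfree).
    libdivide::branchfree_divider<uint16_t> bf_d(9);

    uint32_t q = uint32_t(1000) / fast_d;   // scalar operator/ overload
    uint16_t r = bf_d.divide(uint16_t(12345));  // explicit divide() call
    std::printf("%u %u %u\n", (unsigned)q, (unsigned)r, (unsigned)fast_d.recover());

#if defined(LIBDIVIDE_SSE2)
    // Packed 32-bit quotients via the vector operator/ overload.
    __m128i n = _mm_set_epi32(40, 30, 20, 10);
    __m128i packed_q = n / fast_d;
    (void)packed_q;
#endif
    return 0;
}

The branchfree dividers generate straight-line code with no power-of-two special case, which is why they pair naturally with the SIMD paths declared above; the trade-off is the unsupported divisor of 1 noted in the comment.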