From a579257a26204a4b224324b85fe57c6fde70e7f3 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Sun, 23 Jun 2019 00:37:45 -0400 Subject: [PATCH] update libdivide to 1.1 --- third_party/libdivide.h | 1327 ++++++++++++++++++++------------------- 1 file changed, 689 insertions(+), 638 deletions(-) diff --git a/third_party/libdivide.h b/third_party/libdivide.h index e938849..d1c3f8d 100644 --- a/third_party/libdivide.h +++ b/third_party/libdivide.h @@ -1,5 +1,5 @@ // libdivide.h -// Copyright 2010 - 2018 ridiculous_fish +// Copyright 2010 - 2019 ridiculous_fish // // libdivide is dual-licensed under the Boost or zlib licenses. // You may use libdivide under the terms of either of these. @@ -8,55 +8,76 @@ #ifndef LIBDIVIDE_H #define LIBDIVIDE_H -#if defined(_MSC_VER) -// disable warning C4146: unary minus operator applied to -// unsigned type, result still unsigned -#pragma warning(disable: 4146) -#define LIBDIVIDE_VC -#endif - -#ifdef __cplusplus -#include -#include -#else -#include -#include -#endif +#define LIBDIVIDE_VERSION "1.1" +#define LIBDIVIDE_VERSION_MAJOR 1 +#define LIBDIVIDE_VERSION_MINOR 1 #include +#if defined(__cplusplus) + #include + #include +#else + #include + #include +#endif + #if defined(LIBDIVIDE_USE_SSE2) -#include + #include #endif -#if defined(LIBDIVIDE_VC) -#include +// libdivide may use the pmuldq (vector signed 32x32->64 mult instruction) +// which is in SSE 4.1. However, signed multiplication can be emulated +// efficiently with unsigned multiplication, and SSE 4.1 is currently rare, +// so it is OK to not turn this on. +#if defined(LIBDIVIDE_USE_SSE4_1) + #include #endif -#ifndef __has_builtin -#define __has_builtin(x) 0 // Compatibility with non-clang compilers. +#if !defined(__has_builtin) + #define __has_builtin(x) 0 #endif #if defined(__SIZEOF_INT128__) -#define HAS_INT128_T + #define HAS_INT128_T #endif -#if defined(__x86_64__) || defined(_WIN64) || defined(_M_X64) -#define LIBDIVIDE_IS_X86_64 +#if defined(__x86_64__) || defined(_M_X64) + #define LIBDIVIDE_X86_64 #endif #if defined(__i386__) -#define LIBDIVIDE_IS_i386 + #define LIBDIVIDE_i386 #endif #if defined(__GNUC__) || defined(__clang__) -#define LIBDIVIDE_GCC_STYLE_ASM + #define LIBDIVIDE_GCC_STYLE_ASM #endif #if defined(__cplusplus) || defined(LIBDIVIDE_VC) -#define LIBDIVIDE_FUNCTION __FUNCTION__ + #define LIBDIVIDE_FUNCTION __FUNCTION__ #else -#define LIBDIVIDE_FUNCTION __func__ + #define LIBDIVIDE_FUNCTION __func__ +#endif + +#if defined(_MSC_VER) + #include + // disable warning C4146: unary minus operator applied + // to unsigned type, result still unsigned + #pragma warning(disable: 4146) + #define LIBDIVIDE_VC + + // _udiv128() is available in Visual Studio 2019 + // or later on the x64 CPU architecture + #if defined(LIBDIVIDE_X86_64) && _MSC_VER >= 1920 + #if !defined(__has_include) + #include + #define LIBDIVIDE_VC_UDIV128 + #elif __has_include() + #include + #define LIBDIVIDE_VC_UDIV128 + #endif + #endif #endif #define LIBDIVIDE_ERROR(msg) \ @@ -67,24 +88,16 @@ } while (0) #if defined(LIBDIVIDE_ASSERTIONS_ON) -#define LIBDIVIDE_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", \ - __LINE__, LIBDIVIDE_FUNCTION, #x); \ - exit(-1); \ - } \ - } while (0) + #define LIBDIVIDE_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", \ + __LINE__, LIBDIVIDE_FUNCTION, #x); \ + exit(-1); \ + } \ + } while (0) #else -#define LIBDIVIDE_ASSERT(x) -#endif - -// libdivide may use the pmuldq (vector signed 
32x32->64 mult instruction) -// which is in SSE 4.1. However, signed multiplication can be emulated -// efficiently with unsigned multiplication, and SSE 4.1 is currently rare, so -// it is OK to not turn this on. -#ifdef LIBDIVIDE_USE_SSE4_1 -#include + #define LIBDIVIDE_ASSERT(x) #endif #ifdef __cplusplus @@ -249,50 +262,24 @@ LIBDIVIDE_API int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivid LIBDIVIDE_API int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom); LIBDIVIDE_API int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom); -#if defined(LIBDIVIDE_USE_SSE2) - -LIBDIVIDE_API __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom); -LIBDIVIDE_API __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom); -LIBDIVIDE_API __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom); -LIBDIVIDE_API __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom); - -LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t *denom); -LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t *denom); -LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t *denom); - -LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t *denom); -LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t *denom); -LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t *denom); -LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t *denom); -LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t *denom); - -LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct libdivide_u64_t *denom); -LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t *denom); -LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t *denom); - -LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg0(__m128i numers, const struct libdivide_s64_t *denom); -LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg1(__m128i numers, const struct libdivide_s64_t *denom); -LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_t *denom); -LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_t *denom); -LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg4(__m128i numers, const struct libdivide_s64_t *denom); - -LIBDIVIDE_API __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom); -LIBDIVIDE_API __m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom); -LIBDIVIDE_API __m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom); -LIBDIVIDE_API __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom); - -#endif - //////// Internal Utility Functions -static inline uint32_t libdivide__mullhi_u32(uint32_t x, uint32_t y) { +static inline uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { uint64_t xl = x, yl = y; uint64_t rl = xl * yl; return (uint32_t)(rl >> 32); } -static uint64_t 
libdivide__mullhi_u64(uint64_t x, uint64_t y) { -#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_IS_X86_64) +static inline int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { + int64_t xl = x, yl = y; + int64_t rl = xl * yl; + // needs to be arithmetic shift + return (int32_t)(rl >> 32); +} + +static uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { +#if defined(LIBDIVIDE_VC) && \ + defined(LIBDIVIDE_X86_64) return __umulh(x, y); #elif defined(HAS_INT128_T) __uint128_t xl = x, yl = y; @@ -305,7 +292,7 @@ static uint64_t libdivide__mullhi_u64(uint64_t x, uint64_t y) { uint32_t x1 = (uint32_t)(x >> 32); uint32_t y0 = (uint32_t)(y & mask); uint32_t y1 = (uint32_t)(y >> 32); - uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0); + uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); uint64_t x0y1 = x0 * (uint64_t)y1; uint64_t x1y0 = x1 * (uint64_t)y0; uint64_t x1y1 = x1 * (uint64_t)y1; @@ -317,8 +304,9 @@ static uint64_t libdivide__mullhi_u64(uint64_t x, uint64_t y) { #endif } -static inline int64_t libdivide__mullhi_s64(int64_t x, int64_t y) { -#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_IS_X86_64) +static inline int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { +#if defined(LIBDIVIDE_VC) && \ + defined(LIBDIVIDE_X86_64) return __mulh(x, y); #elif defined(HAS_INT128_T) __int128_t xl = x, yl = y; @@ -331,7 +319,7 @@ static inline int64_t libdivide__mullhi_s64(int64_t x, int64_t y) { uint32_t y0 = (uint32_t)(y & mask); int32_t x1 = (int32_t)(x >> 32); int32_t y1 = (int32_t)(y >> 32); - uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0); + uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); int64_t t = x1 * (int64_t)y0 + x0y0_hi; int64_t w1 = x0 * (int64_t)y1 + (t & mask); @@ -339,141 +327,9 @@ static inline int64_t libdivide__mullhi_s64(int64_t x, int64_t y) { #endif } -#if defined(LIBDIVIDE_USE_SSE2) - -static inline __m128i libdivide__u64_to_m128(uint64_t x) { -#if defined(LIBDIVIDE_VC) && !defined(_WIN64) - // 64 bit windows doesn't seem to have an implementation of any of these - // load intrinsics, and 32 bit Visual C++ crashes - _declspec(align(16)) uint64_t temp[2] = {x, x}; - return _mm_load_si128((const __m128i*)temp); -#else - // everyone else gets it right - return _mm_set1_epi64x(x); -#endif -} - -static inline __m128i libdivide_get_FFFFFFFF00000000(void) { - // returns the same as _mm_set1_epi64(0xFFFFFFFF00000000ULL) - // without touching memory. - // optimizes to pcmpeqd on OS X - __m128i result = _mm_set1_epi8(-1); - return _mm_slli_epi64(result, 32); -} - -static inline __m128i libdivide_get_00000000FFFFFFFF(void) { - // returns the same as _mm_set1_epi64(0x00000000FFFFFFFFULL) - // without touching memory. - // optimizes to pcmpeqd on OS X - __m128i result = _mm_set1_epi8(-1); - result = _mm_srli_epi64(result, 32); - return result; -} - -static inline __m128i libdivide_s64_signbits(__m128i v) { - // we want to compute v >> 63, that is, _mm_srai_epi64(v, 63). But there - // is no 64 bit shift right arithmetic instruction in SSE2. So we have to - // fake it by first duplicating the high 32 bit values, and then using a 32 - // bit shift. Another option would be to use _mm_srli_epi64(v, 63) and - // then subtract that from 0, but that approach appears to be substantially - // slower for unknown reasons - __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); - __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); - return signBits; -} - -// Returns an __m128i whose low 32 bits are equal to amt and has zero elsewhere. 
-static inline __m128i libdivide_u32_to_m128i(uint32_t amt) { - return _mm_set_epi32(0, 0, 0, amt); -} - -static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) { - // implementation of _mm_sra_epi64. Here we have two 64 bit values which - // are shifted right to logically become (64 - amt) values, and are then - // sign extended from a (64 - amt) bit number. - const int b = 64 - amt; - __m128i m = libdivide__u64_to_m128(1ULL << (b - 1)); - __m128i x = _mm_srl_epi64(v, libdivide_u32_to_m128i(amt)); - __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); // result = x^m - m - return result; -} - -// Here, b is assumed to contain one 32 bit value repeated four times. -// If it did not, the function would not work. -static inline __m128i libdivide__mullhi_u32_flat_vector(__m128i a, __m128i b) { - __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); - __m128i a1X3X = _mm_srli_epi64(a, 32); - __m128i mask = libdivide_get_FFFFFFFF00000000(); - __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask); - return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); // = hi_product_0123 -} - -// Here, y is assumed to contain one 64 bit value repeated twice. -static inline __m128i libdivide_mullhi_u64_flat_vector(__m128i x, __m128i y) { - // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) - __m128i mask = libdivide_get_00000000FFFFFFFF(); - // x0 is low half of 2 64 bit values, x1 is high half in low slots - __m128i x0 = _mm_and_si128(x, mask); - __m128i x1 = _mm_srli_epi64(x, 32); - __m128i y0 = _mm_and_si128(y, mask); - __m128i y1 = _mm_srli_epi64(y, 32); - // x0 happens to have the low half of the two 64 bit values in 32 bit slots - // 0 and 2, so _mm_mul_epu32 computes their full product, and then we shift - // right by 32 to get just the high values - __m128i x0y0_hi = _mm_srli_epi64(_mm_mul_epu32(x0, y0), 32); - __m128i x0y1 = _mm_mul_epu32(x0, y1); - __m128i x1y0 = _mm_mul_epu32(x1, y0); - __m128i x1y1 = _mm_mul_epu32(x1, y1); - __m128i temp = _mm_add_epi64(x1y0, x0y0_hi); - __m128i temp_lo = _mm_and_si128(temp, mask); - __m128i temp_hi = _mm_srli_epi64(temp, 32); - temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32); - temp_hi = _mm_add_epi64(x1y1, temp_hi); - - return _mm_add_epi64(temp_lo, temp_hi); -} - -// y is one 64 bit value repeated twice -static inline __m128i libdivide_mullhi_s64_flat_vector(__m128i x, __m128i y) { - __m128i p = libdivide_mullhi_u64_flat_vector(x, y); - __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y); - p = _mm_sub_epi64(p, t1); - __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x); - p = _mm_sub_epi64(p, t2); - return p; -} - -#ifdef LIBDIVIDE_USE_SSE4_1 - -// b is one 32 bit value repeated four times. -static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) { - __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epi32(a, b), 32); - __m128i a1X3X = _mm_srli_epi64(a, 32); - __m128i mask = libdivide_get_FFFFFFFF00000000(); - __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epi32(a1X3X, b), mask); - return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); // = hi_product_0123 -} - -#else - -// SSE2 does not have a signed multiplication instruction, but we can convert -// unsigned to signed pretty efficiently. Again, b is just a 32 bit value -// repeated four times. 
-static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) { - __m128i p = libdivide__mullhi_u32_flat_vector(a, b); - __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); // t1 = (a >> 31) & y, arithmetic shift - __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a); - p = _mm_sub_epi32(p, t1); - p = _mm_sub_epi32(p, t2); - return p; -} - -#endif // LIBDIVIDE_USE_SSE4_1 - -#endif // LIBDIVIDE_USE_SSE2 - -static inline int32_t libdivide__count_leading_zeros32(uint32_t val) { -#if defined(__GNUC__) || __has_builtin(__builtin_clz) +static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { +#if defined(__GNUC__) || \ + __has_builtin(__builtin_clz) // Fast way to count leading zeros return __builtin_clz(val); #elif defined(LIBDIVIDE_VC) @@ -483,19 +339,18 @@ static inline int32_t libdivide__count_leading_zeros32(uint32_t val) { } return 0; #else - int32_t result = 0; - uint32_t hi = 1U << 31; - - while (~val & hi) { - hi >>= 1; - result++; - } - return result; + int32_t result = 0; + uint32_t hi = 1U << 31; + for (; ~val & hi; hi >>= 1) { + result++; + } + return result; #endif } -static inline int32_t libdivide__count_leading_zeros64(uint64_t val) { -#if defined(__GNUC__) || __has_builtin(__builtin_clzll) +static inline int32_t libdivide_count_leading_zeros64(uint64_t val) { +#if defined(__GNUC__) || \ + __has_builtin(__builtin_clzll) // Fast way to count leading zeros return __builtin_clzll(val); #elif defined(LIBDIVIDE_VC) && defined(_WIN64) @@ -507,61 +362,60 @@ static inline int32_t libdivide__count_leading_zeros64(uint64_t val) { #else uint32_t hi = val >> 32; uint32_t lo = val & 0xFFFFFFFF; - if (hi != 0) return libdivide__count_leading_zeros32(hi); - return 32 + libdivide__count_leading_zeros32(lo); + if (hi != 0) return libdivide_count_leading_zeros32(hi); + return 32 + libdivide_count_leading_zeros32(lo); #endif } -#if (defined(LIBDIVIDE_IS_i386) || defined(LIBDIVIDE_IS_X86_64)) && \ - defined(LIBDIVIDE_GCC_STYLE_ASM) - -// libdivide_64_div_32_to_32: divides a 64 bit uint {u1, u0} by a 32 bit +// libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit // uint {v}. The result must fit in 32 bits. // Returns the quotient directly and the remainder in *r static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { +#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && \ + defined(LIBDIVIDE_GCC_STYLE_ASM) uint32_t result; __asm__("divl %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1) ); return result; -} - #else - -static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { - uint64_t n = (((uint64_t)u1) << 32) | u0; + uint64_t n = ((uint64_t)u1 << 32) | u0; uint32_t result = (uint32_t)(n / v); *r = (uint32_t)(n - result * (uint64_t)v); return result; +#endif } -#endif - -#if defined(LIBDIVIDE_IS_X86_64) && \ - defined(LIBDIVIDE_GCC_STYLE_ASM) - +// libdivide_128_div_64_to_64: divides a 128-bit uint {u1, u0} by a 64-bit +// uint {v}. The result must fit in 64 bits. 
+// Returns the quotient directly and the remainder in *r static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) { - // u0 -> rax - // u1 -> rdx - // divq +#if defined(LIBDIVIDE_VC_UDIV128) + return _udiv128(u1, u0, v, r); + +#elif defined(LIBDIVIDE_X86_64) && \ + defined(LIBDIVIDE_GCC_STYLE_ASM) + uint64_t result; __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1) ); return result; -} +#elif defined(HAS_INT128_T) + __uint128_t n = ((__uint128_t)u1 << 64) | u0; + uint64_t result = (uint64_t)(n / v); + *r = (uint64_t)(n - result * (__uint128_t)v); + return result; #else + // Code taken from Hacker's Delight: + // http://www.hackersdelight.org/HDcode/divlu.c. + // License permits inclusion here per: + // http://www.hackersdelight.org/permissions.htm -// Code taken from Hacker's Delight: -// http://www.hackersdelight.org/HDcode/divlu.c. -// License permits inclusion here per: -// http://www.hackersdelight.org/permissions.htm - -static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) { - const uint64_t b = (1ULL << 32); // Number base (16 bits) + const uint64_t b = (1ULL << 32); // Number base (32 bits) uint64_t un1, un0; // Norm. dividend LSD's uint64_t vn1, vn0; // Norm. divisor digits uint64_t q1, q0; // Quotient digits @@ -572,21 +426,23 @@ static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, // If overflow, set rem. to an impossible value, // and return the largest possible quotient if (u1 >= v) { - if (r != NULL) - *r = (uint64_t) -1; + *r = (uint64_t) -1; return (uint64_t) -1; } // count leading zeros - s = libdivide__count_leading_zeros64(v); + s = libdivide_count_leading_zeros64(v); if (s > 0) { // Normalize divisor v = v << s; - un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31)); + un64 = (u1 << s) | (u0 >> (64 - s)); un10 = u0 << s; // Shift dividend left } else { - // Avoid undefined behavior - un64 = u1 | u0; + // Avoid undefined behavior of (u0 >> 64). + // The behavior is undefined if the right operand is + // negative, or greater than or equal to the length + // in bits of the promoted left operand. + un64 = u1; un10 = u0; } @@ -623,18 +479,13 @@ static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, break; } - // If remainder is wanted, return it - if (r != NULL) - *r = (un21 * b + un0 - q0 * v) >> s; - + *r = (un21 * b + un0 - q0 * v) >> s; return q1 * b + q0; +#endif } -#endif - // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0) -static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t signed_shift) -{ +static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t signed_shift) { if (signed_shift > 0) { uint32_t shift = signed_shift; *u1 <<= shift; @@ -678,7 +529,7 @@ static uint64_t libdivide_128_div_128_to_64(uint64_t u_hi, uint64_t u_lo, uint64 // Here v >= 2**64 // We know that v.hi != 0, so count leading zeros is OK // We have 0 <= n <= 63 - uint32_t n = libdivide__count_leading_zeros64(v.hi); + uint32_t n = libdivide_count_leading_zeros64(v.hi); // Normalize the divisor so its MSB is 1 u128_t v1t = v; @@ -713,7 +564,7 @@ static uint64_t libdivide_128_div_128_to_64(uint64_t u_hi, uint64_t u_lo, uint64 // Each term is 128 bit // High half of full product (upper 128 bits!) 
are dropped u128_t q0v = {0, 0}; - q0v.hi = q0.hi*v.lo + q0.lo*v.hi + libdivide__mullhi_u64(q0.lo, v.lo); + q0v.hi = q0.hi*v.lo + q0.lo*v.hi + libdivide_mullhi_u64(q0.lo, v.lo); q0v.lo = q0.lo*v.lo; // Compute u - q0v as u_q0v @@ -751,18 +602,18 @@ static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int } struct libdivide_u32_t result; - uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(d); + uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d); if ((d & (d - 1)) == 0) { // Power of 2 if (! branchfree) { result.magic = 0; - result.more = floor_log_2_d | LIBDIVIDE_U32_SHIFT_PATH; + result.more = (uint8_t)(floor_log_2_d | LIBDIVIDE_U32_SHIFT_PATH); } else { // We want a magic number of 2**32 and a shift of floor_log_2_d // but one of the shifts is taken up by LIBDIVIDE_ADD_MARKER, // so we subtract 1 from the shift result.magic = 0; - result.more = (floor_log_2_d-1) | LIBDIVIDE_ADD_MARKER; + result.more = (uint8_t)((floor_log_2_d-1) | LIBDIVIDE_ADD_MARKER); } } else { uint8_t more; @@ -817,13 +668,14 @@ uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { return numer >> (more & LIBDIVIDE_32_SHIFT_MASK); } else { - uint32_t q = libdivide__mullhi_u32(denom->magic, numer); + uint32_t q = libdivide_mullhi_u32(denom->magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { uint32_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_32_SHIFT_MASK); } else { - // all upper bits are 0 - don't need to mask them off + // All upper bits are 0, + // don't need to mask them off. return q >> more; } } @@ -874,9 +726,12 @@ uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_ int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom) { uint8_t more = denom->more; - if (more & LIBDIVIDE_U32_SHIFT_PATH) return 0; - else if (!(more & LIBDIVIDE_ADD_MARKER)) return 1; - else return 2; + if (more & LIBDIVIDE_U32_SHIFT_PATH) + return 0; + else if (!(more & LIBDIVIDE_ADD_MARKER)) + return 1; + else + return 2; } uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom) { @@ -884,73 +739,26 @@ uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *den } uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom) { - uint32_t q = libdivide__mullhi_u32(denom->magic, numer); + uint32_t q = libdivide_mullhi_u32(denom->magic, numer); return q >> denom->more; } uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom) { // denom->add != 0 - uint32_t q = libdivide__mullhi_u32(denom->magic, numer); + uint32_t q = libdivide_mullhi_u32(denom->magic, numer); uint32_t t = ((numer - q) >> 1) + q; - // Note that this mask is typically free. Only the low bits are meaningful - // to a shift, so compilers can optimize out this AND. + // Note that this mask is typically free. Only the low bits are + // meaningful to a shift, so compilers can optimize this out. 
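    // Worked example: for d = 7 the generator above produces magic = 0x24924925
    // (613566757) with shift = 2 and LIBDIVIDE_ADD_MARKER set. With numer = 100:
    // q = mullhi(0x24924925, 100) = 14, t = ((100 - 14) >> 1) + 14 = 57,
    // and 57 >> 2 = 14, which matches 100 / 7.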
return t >> (denom->more & LIBDIVIDE_32_SHIFT_MASK); } // same as algo 2 uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { - uint32_t q = libdivide__mullhi_u32(denom->magic, numer); + uint32_t q = libdivide_mullhi_u32(denom->magic, numer); uint32_t t = ((numer - q) >> 1) + q; return t >> denom->more; } -#if defined(LIBDIVIDE_USE_SSE2) - -__m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) { - uint8_t more = denom->more; - if (more & LIBDIVIDE_U32_SHIFT_PATH) { - return _mm_srl_epi32(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); - } - else { - __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - if (more & LIBDIVIDE_ADD_MARKER) { - // uint32_t t = ((numer - q) >> 1) + q; - // return t >> denom->shift; - __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); - return _mm_srl_epi32(t, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); - - } - else { - // q >> denom->shift - return _mm_srl_epi32(q, libdivide_u32_to_m128i(more)); - } - } -} - -__m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t *denom) { - return _mm_srl_epi32(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); -} - -__m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t *denom) { - __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - return _mm_srl_epi32(q, libdivide_u32_to_m128i(denom->more)); -} - -__m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t *denom) { - __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); - return _mm_srl_epi32(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); -} - -// same as algo 2 -LIBDIVIDE_API __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom) { - __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); - return _mm_srl_epi32(t, libdivide_u32_to_m128i(denom->more)); -} - -#endif - /////////// UINT64 static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int branchfree) { @@ -959,7 +767,7 @@ static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int } struct libdivide_u64_t result; - uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(d); + uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d); if ((d & (d - 1)) == 0) { // Power of 2 if (! branchfree) { @@ -968,7 +776,7 @@ static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int } else { // We want a magic number of 2**64 and a shift of floor_log_2_d // but one of the shifts is taken up by LIBDIVIDE_ADD_MARKER, - // so we subtract 1 from the shift + // so we subtract 1 from the shift. 
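    // (In the branchfree case q = mullhi(0, numer) is always 0, so the 'do'
    // routine computes t = ((numer - 0) >> 1) + 0 = numer >> 1, and
    // t >> (floor_log_2_d - 1) still equals numer >> floor_log_2_d.)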
result.magic = 0; result.more = (floor_log_2_d-1) | LIBDIVIDE_ADD_MARKER; } @@ -1027,13 +835,14 @@ uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { return numer >> (more & LIBDIVIDE_64_SHIFT_MASK); } else { - uint64_t q = libdivide__mullhi_u64(denom->magic, numer); + uint64_t q = libdivide_mullhi_u64(denom->magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { uint64_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_64_SHIFT_MASK); } else { - // all upper bits are 0 - don't need to mask them off + // All upper bits are 0, + // don't need to mask them off. return q >> more; } } @@ -1079,8 +888,8 @@ uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); // We computed 2^(64+shift)/(m+2^64) // Double the remainder ('dr') and check if that is larger than d - // Note that d is a 65 bit value, so r1 is small and so r1 + r1 cannot - // overflow + // Note that d is a 65 bit value, so r1 is small and so r1 + r1 + // cannot overflow uint64_t dr_lo = r_lo + r_lo; uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); @@ -1096,9 +905,12 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ int libdivide_u64_get_algorithm(const struct libdivide_u64_t *denom) { uint8_t more = denom->more; - if (more & LIBDIVIDE_U64_SHIFT_PATH) return 0; - else if (!(more & LIBDIVIDE_ADD_MARKER)) return 1; - else return 2; + if (more & LIBDIVIDE_U64_SHIFT_PATH) + return 0; + else if (!(more & LIBDIVIDE_ADD_MARKER)) + return 1; + else + return 2; } uint64_t libdivide_u64_do_alg0(uint64_t numer, const struct libdivide_u64_t *denom) { @@ -1106,77 +918,25 @@ uint64_t libdivide_u64_do_alg0(uint64_t numer, const struct libdivide_u64_t *den } uint64_t libdivide_u64_do_alg1(uint64_t numer, const struct libdivide_u64_t *denom) { - uint64_t q = libdivide__mullhi_u64(denom->magic, numer); + uint64_t q = libdivide_mullhi_u64(denom->magic, numer); return q >> denom->more; } uint64_t libdivide_u64_do_alg2(uint64_t numer, const struct libdivide_u64_t *denom) { - uint64_t q = libdivide__mullhi_u64(denom->magic, numer); + uint64_t q = libdivide_mullhi_u64(denom->magic, numer); uint64_t t = ((numer - q) >> 1) + q; return t >> (denom->more & LIBDIVIDE_64_SHIFT_MASK); } // same as alg 2 uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { - uint64_t q = libdivide__mullhi_u64(denom->magic, numer); + uint64_t q = libdivide_mullhi_u64(denom->magic, numer); uint64_t t = ((numer - q) >> 1) + q; return t >> denom->more; } -#if defined(LIBDIVIDE_USE_SSE2) - -__m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom) { - uint8_t more = denom->more; - if (more & LIBDIVIDE_U64_SHIFT_PATH) { - return _mm_srl_epi64(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK)); - } - else { - __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); - if (more & LIBDIVIDE_ADD_MARKER) { - // uint32_t t = ((numer - q) >> 1) + q; - // return t >> denom->shift; - __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); - return _mm_srl_epi64(t, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK)); - } - else { - // q >> denom->shift - return _mm_srl_epi64(q, libdivide_u32_to_m128i(more)); - } - } -} - -__m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct 
libdivide_u64_t *denom) { - return _mm_srl_epi64(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK)); -} - -__m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t *denom) { - __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); - return _mm_srl_epi64(q, libdivide_u32_to_m128i(denom->more)); -} - -__m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t *denom) { - __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); - __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); - return _mm_srl_epi64(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK)); -} - -__m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom) { - __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); - __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); - return _mm_srl_epi64(t, libdivide_u32_to_m128i(denom->more)); -} - -#endif - /////////// SINT32 -static inline int32_t libdivide__mullhi_s32(int32_t x, int32_t y) { - int64_t xl = x, yl = y; - int64_t rl = xl * yl; - // needs to be arithmetic shift - return (int32_t)(rl >> 32); -} - static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); @@ -1192,7 +952,7 @@ static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int b // and is a power of 2. uint32_t ud = (uint32_t)d; uint32_t absD = (d < 0) ? -ud : ud; - uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(absD); + uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD); // check if exactly one bit is set, // don't care if absD is 0 since that's divide by zero if ((absD & (absD - 1)) == 0) { @@ -1262,14 +1022,15 @@ int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (more & LIBDIVIDE_S32_SHIFT_PATH) { uint32_t sign = (int8_t)more >> 7; - uint8_t shifter = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t uq = (uint32_t)(numer + ((numer >> 31) & ((1U << shifter) - 1))); + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + uint32_t uq = numer + ((numer >> 31) & mask); int32_t q = (int32_t)uq; - q = q >> shifter; + q = q >> shift; q = (q ^ sign) - sign; return q; } else { - uint32_t uq = (uint32_t)libdivide__mullhi_s32(denom->magic, numer); + uint32_t uq = (uint32_t)libdivide_mullhi_s32(denom->magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift and then sign extend int32_t sign = (int8_t)more >> 7; @@ -1289,7 +1050,7 @@ int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_br // must be arithmetic shift and then sign extend int32_t sign = (int8_t)more >> 7; int32_t magic = denom->magic; - int32_t q = libdivide__mullhi_s32(magic, numer); + int32_t q = libdivide_mullhi_s32(magic, numer); q += numer; // If q is non-negative, we have nothing to do @@ -1351,25 +1112,30 @@ int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t int libdivide_s32_get_algorithm(const struct libdivide_s32_t *denom) { uint8_t more = denom->more; int positiveDivisor = !(more & LIBDIVIDE_NEGATIVE_DIVISOR); - if (more & LIBDIVIDE_S32_SHIFT_PATH) return (positiveDivisor ? 0 : 1); - else if (more & LIBDIVIDE_ADD_MARKER) return (positiveDivisor ? 
2 : 3); - else return 4; + if (more & LIBDIVIDE_S32_SHIFT_PATH) + return (positiveDivisor ? 0 : 1); + else if (more & LIBDIVIDE_ADD_MARKER) + return (positiveDivisor ? 2 : 3); + else + return 4; } int32_t libdivide_s32_do_alg0(int32_t numer, const struct libdivide_s32_t *denom) { - uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; - int32_t q = numer + ((numer >> 31) & ((1U << shifter) - 1)); - return q >> shifter; + uint8_t shift = denom->more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + int32_t q = numer + ((numer >> 31) & mask); + return q >> shift; } int32_t libdivide_s32_do_alg1(int32_t numer, const struct libdivide_s32_t *denom) { - uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; - int32_t q = numer + ((numer >> 31) & ((1U << shifter) - 1)); - return - (q >> shifter); + uint8_t shift = denom->more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + int32_t q = numer + ((numer >> 31) & mask); + return - (q >> shift); } int32_t libdivide_s32_do_alg2(int32_t numer, const struct libdivide_s32_t *denom) { - int32_t q = libdivide__mullhi_s32(denom->magic, numer); + int32_t q = libdivide_mullhi_s32(denom->magic, numer); q += numer; q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK; q += (q < 0); @@ -1377,7 +1143,7 @@ int32_t libdivide_s32_do_alg2(int32_t numer, const struct libdivide_s32_t *denom } int32_t libdivide_s32_do_alg3(int32_t numer, const struct libdivide_s32_t *denom) { - int32_t q = libdivide__mullhi_s32(denom->magic, numer); + int32_t q = libdivide_mullhi_s32(denom->magic, numer); q -= numer; q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK; q += (q < 0); @@ -1385,100 +1151,12 @@ int32_t libdivide_s32_do_alg3(int32_t numer, const struct libdivide_s32_t *denom } int32_t libdivide_s32_do_alg4(int32_t numer, const struct libdivide_s32_t *denom) { - int32_t q = libdivide__mullhi_s32(denom->magic, numer); + int32_t q = libdivide_mullhi_s32(denom->magic, numer); q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK; q += (q < 0); return q; } -#if defined(LIBDIVIDE_USE_SSE2) - -__m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom) { - uint8_t more = denom->more; - if (more & LIBDIVIDE_S32_SHIFT_PATH) { - uint32_t shifter = more & LIBDIVIDE_32_SHIFT_MASK; - __m128i roundToZeroTweak = _mm_set1_epi32((1U << shifter) - 1); // could use _mm_srli_epi32 with an all -1 register - __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); //q = numer + ((numer >> 31) & roundToZeroTweak); - q = _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter)); // q = q >> shifter - __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); // set all bits of shift mask = to the sign bit of more - q = _mm_sub_epi32(_mm_xor_si128(q, shiftMask), shiftMask); // q = (q ^ shiftMask) - shiftMask; - return q; - } - else { - __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - if (more & LIBDIVIDE_ADD_MARKER) { - __m128i sign = _mm_set1_epi32((int32_t)(int8_t)more >> 7); // must be arithmetic shift - q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); // q += ((numer ^ sign) - sign); - } - q = _mm_sra_epi32(q, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); // q >>= shift - q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) - return q; - } -} - -__m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t *denom) { - uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; - __m128i roundToZeroTweak = _mm_set1_epi32((1U << 
shifter) - 1); - __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); - return _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter)); -} - -__m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t *denom) { - uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; - __m128i roundToZeroTweak = _mm_set1_epi32((1U << shifter) - 1); - __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); - return _mm_sub_epi32(_mm_setzero_si128(), _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter))); -} - -__m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t *denom) { - __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - q = _mm_add_epi32(q, numers); - q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); - q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); - return q; -} - -__m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t *denom) { - __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - q = _mm_sub_epi32(q, numers); - q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); - q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); - return q; -} - -__m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t *denom) { - uint8_t more = denom->more; - __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - q = _mm_sra_epi32(q, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); //q >>= shift - q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) - return q; -} - -__m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom) { - int32_t magic = denom->magic; - uint8_t more = denom->more; - uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - // must be arithmetic shift - __m128i sign = _mm_set1_epi32((int32_t)(int8_t)more >> 7); - - // libdivide__mullhi_s32(numers, magic); - __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(magic)); - q = _mm_add_epi32(q, numers); // q += numers - - // If q is non-negative, we have nothing to do - // If q is negative, we want to add either (2**shift)-1 if d is a power of - // 2, or (2**shift) if it is not a power of 2 - uint32_t is_power_of_2 = (magic == 0); - __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 - __m128i mask = _mm_set1_epi32((1 << shift) - is_power_of_2); - q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) - q = _mm_srai_epi32(q, shift); //q >>= shift - q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign - return q; -} - -#endif - ///////////// SINT64 static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int branchfree) { @@ -1496,7 +1174,7 @@ static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int b // and is a power of 2. uint64_t ud = (uint64_t)d; uint64_t absD = (d < 0) ? 
-ud : ud; - uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(absD); + uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD); // check if exactly one bit is set, // don't care if absD is 0 since that's divide by zero if ((absD & (absD - 1)) == 0) { @@ -1566,16 +1244,17 @@ int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; if (magic == 0) { //shift path - uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t uq = (uint64_t)numer + ((numer >> 63) & ((1ULL << shifter) - 1)); + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = (1ULL << shift) - 1; + uint64_t uq = numer + ((numer >> 63) & mask); int64_t q = (int64_t)uq; - q = q >> shifter; + q = q >> shift; // must be arithmetic shift and then sign-extend int64_t shiftMask = (int8_t)more >> 7; q = (q ^ shiftMask) - shiftMask; return q; } else { - uint64_t uq = (uint64_t)libdivide__mullhi_s64(magic, numer); + uint64_t uq = (uint64_t)libdivide_mullhi_s64(magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift and then sign extend int64_t sign = (int8_t)more >> 7; @@ -1594,7 +1273,7 @@ int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_br // must be arithmetic shift and then sign extend int64_t sign = (int8_t)more >> 7; int64_t magic = denom->magic; - int64_t q = libdivide__mullhi_s64(magic, numer); + int64_t q = libdivide_mullhi_s64(magic, numer); q += numer; // If q is non-negative, we have nothing to do. @@ -1646,26 +1325,31 @@ int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t int libdivide_s64_get_algorithm(const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int positiveDivisor = !(more & LIBDIVIDE_NEGATIVE_DIVISOR); - if (denom->magic == 0) return (positiveDivisor ? 0 : 1); // shift path - else if (more & LIBDIVIDE_ADD_MARKER) return (positiveDivisor ? 2 : 3); - else return 4; + if (denom->magic == 0) + return (positiveDivisor ? 0 : 1); // shift path + else if (more & LIBDIVIDE_ADD_MARKER) + return (positiveDivisor ? 
2 : 3); + else + return 4; } int64_t libdivide_s64_do_alg0(int64_t numer, const struct libdivide_s64_t *denom) { - uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK; - int64_t q = numer + ((numer >> 63) & ((1ULL << shifter) - 1)); - return q >> shifter; + uint32_t shift = denom->more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = (1ULL << shift) - 1; + int64_t q = numer + ((numer >> 63) & mask); + return q >> shift; } int64_t libdivide_s64_do_alg1(int64_t numer, const struct libdivide_s64_t *denom) { - // denom->shifter != -1 && demo->shiftMask != 0 - uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK; - int64_t q = numer + ((numer >> 63) & ((1ULL << shifter) - 1)); - return - (q >> shifter); + // denom->shift != -1 && demo->shiftMask != 0 + uint32_t shift = denom->more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = (1ULL << shift) - 1; + int64_t q = numer + ((numer >> 63) & mask); + return - (q >> shift); } int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivide_s64_t *denom) { - int64_t q = libdivide__mullhi_s64(denom->magic, numer); + int64_t q = libdivide_mullhi_s64(denom->magic, numer); q += numer; q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK; q += (q < 0); @@ -1673,7 +1357,7 @@ int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivide_s64_t *denom } int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom) { - int64_t q = libdivide__mullhi_s64(denom->magic, numer); + int64_t q = libdivide_mullhi_s64(denom->magic, numer); q -= numer; q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK; q += (q < 0); @@ -1681,7 +1365,7 @@ int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom } int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom) { - int64_t q = libdivide__mullhi_s64(denom->magic, numer); + int64_t q = libdivide_mullhi_s64(denom->magic, numer); q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK; q += (q < 0); return q; @@ -1689,23 +1373,372 @@ int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom #if defined(LIBDIVIDE_USE_SSE2) +LIBDIVIDE_API __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom); +LIBDIVIDE_API __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom); +LIBDIVIDE_API __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom); +LIBDIVIDE_API __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom); + +LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t *denom); +LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t *denom); +LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t *denom); + +LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t *denom); +LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t *denom); +LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t *denom); +LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t *denom); +LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t *denom); + +LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct libdivide_u64_t *denom); +LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t 
*denom); +LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t *denom); + +LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg0(__m128i numers, const struct libdivide_s64_t *denom); +LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg1(__m128i numers, const struct libdivide_s64_t *denom); +LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_t *denom); +LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_t *denom); +LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg4(__m128i numers, const struct libdivide_s64_t *denom); + +LIBDIVIDE_API __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom); +LIBDIVIDE_API __m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom); +LIBDIVIDE_API __m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom); +LIBDIVIDE_API __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom); + +//////// Internal Utility Functions + +static inline __m128i libdivide_u64_to_m128(uint64_t x) { + return _mm_set1_epi64x(x); +} + +static inline __m128i libdivide_get_FFFFFFFF00000000(void) { + // returns the same as _mm_set1_epi64(0xFFFFFFFF00000000ULL) + // without touching memory. + // optimizes to pcmpeqd on OS X + __m128i result = _mm_set1_epi8(-1); + return _mm_slli_epi64(result, 32); +} + +static inline __m128i libdivide_get_00000000FFFFFFFF(void) { + // returns the same as _mm_set1_epi64(0x00000000FFFFFFFFULL) + // without touching memory. + // optimizes to pcmpeqd on OS X + __m128i result = _mm_set1_epi8(-1); + result = _mm_srli_epi64(result, 32); + return result; +} + +static inline __m128i libdivide_s64_signbits(__m128i v) { + // we want to compute v >> 63, that is, _mm_srai_epi64(v, 63). But there + // is no 64 bit shift right arithmetic instruction in SSE2. So we have to + // fake it by first duplicating the high 32 bit values, and then using a 32 + // bit shift. Another option would be to use _mm_srli_epi64(v, 63) and + // then subtract that from 0, but that approach appears to be substantially + // slower for unknown reasons + __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); + __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); + return signBits; +} + +// Returns an __m128i whose low 32 bits are equal to amt and has zero elsewhere. +static inline __m128i libdivide_u32_to_m128i(uint32_t amt) { + return _mm_set_epi32(0, 0, 0, amt); +} + +static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) { + // implementation of _mm_sra_epi64. Here we have two 64 bit values which + // are shifted right to logically become (64 - amt) values, and are then + // sign extended from a (64 - amt) bit number. + const int b = 64 - amt; + __m128i m = libdivide_u64_to_m128(1ULL << (b - 1)); + __m128i x = _mm_srl_epi64(v, libdivide_u32_to_m128i(amt)); + __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); + return result; +} + +// Here, b is assumed to contain one 32 bit value repeated four times. +// If it did not, the function would not work. 
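As a point of reference, each 32-bit lane of the vector routine below is meant to produce the same value as the scalar libdivide_mullhi_u32 defined earlier. A minimal lane-wise model of that contract, offered as an illustrative sketch rather than part of the upstream change:

#include <stdint.h>

// Lane-wise model: every output lane is the high 32 bits of a[i] * b,
// where b is the single magic value broadcast into all four lanes
// (the callers below use _mm_set1_epi32(denom->magic) for that).
static void mullhi_u32_lanes(const uint32_t a[4], uint32_t b, uint32_t out[4]) {
    for (int i = 0; i < 4; i++) {
        out[i] = (uint32_t)(((uint64_t)a[i] * b) >> 32);
    }
}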
+static inline __m128i libdivide_mullhi_u32_flat_vector(__m128i a, __m128i b) { + __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); + __m128i a1X3X = _mm_srli_epi64(a, 32); + __m128i mask = libdivide_get_FFFFFFFF00000000(); + __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask); + // return hi_product_0123 + return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// Here, y is assumed to contain one 64 bit value repeated twice. +static inline __m128i libdivide_mullhi_u64_flat_vector(__m128i x, __m128i y) { + // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) + __m128i mask = libdivide_get_00000000FFFFFFFF(); + // x0 is low half of 2 64 bit values, x1 is high half in low slots + __m128i x0 = _mm_and_si128(x, mask); + __m128i x1 = _mm_srli_epi64(x, 32); + __m128i y0 = _mm_and_si128(y, mask); + __m128i y1 = _mm_srli_epi64(y, 32); + // x0 happens to have the low half of the two 64 bit values in 32 bit slots + // 0 and 2, so _mm_mul_epu32 computes their full product, and then we shift + // right by 32 to get just the high values + __m128i x0y0_hi = _mm_srli_epi64(_mm_mul_epu32(x0, y0), 32); + __m128i x0y1 = _mm_mul_epu32(x0, y1); + __m128i x1y0 = _mm_mul_epu32(x1, y0); + __m128i x1y1 = _mm_mul_epu32(x1, y1); + __m128i temp = _mm_add_epi64(x1y0, x0y0_hi); + __m128i temp_lo = _mm_and_si128(temp, mask); + __m128i temp_hi = _mm_srli_epi64(temp, 32); + temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm_add_epi64(x1y1, temp_hi); + + return _mm_add_epi64(temp_lo, temp_hi); +} + +// y is one 64 bit value repeated twice +static inline __m128i libdivide_mullhi_s64_flat_vector(__m128i x, __m128i y) { + __m128i p = libdivide_mullhi_u64_flat_vector(x, y); + __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y); + p = _mm_sub_epi64(p, t1); + __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x); + p = _mm_sub_epi64(p, t2); + return p; +} + +#ifdef LIBDIVIDE_USE_SSE4_1 + +// b is one 32 bit value repeated four times. +static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) { + __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epi32(a, b), 32); + __m128i a1X3X = _mm_srli_epi64(a, 32); + __m128i mask = libdivide_get_FFFFFFFF00000000(); + __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epi32(a1X3X, b), mask); + // return hi_product_0123 + return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); +} + +#else + +// SSE2 does not have a signed multiplication instruction, but we can convert +// unsigned to signed pretty efficiently. Again, b is just a 32 bit value +// repeated four times. 
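The scalar form of that conversion subtracts one correction term per negative operand. A minimal sketch of the identity used by the vector routine below, illustrative rather than part of the upstream change, and relying on arithmetic right shift of negative values just as the rest of this header does:

#include <stdint.h>

// mulhi_s(a, b) = mulhi_u(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0),
// with the conditionals expressed as sign masks, mirroring the SSE2 path.
static int32_t mullhi_s32_from_u32(int32_t a, int32_t b) {
    uint32_t p = (uint32_t)(((uint64_t)(uint32_t)a * (uint32_t)b) >> 32);
    uint32_t t1 = (uint32_t)(a >> 31) & (uint32_t)b; // all-ones if a < 0
    uint32_t t2 = (uint32_t)(b >> 31) & (uint32_t)a; // all-ones if b < 0
    return (int32_t)(p - t1 - t2);
}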
+static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) { + __m128i p = libdivide_mullhi_u32_flat_vector(a, b); + // t1 = (a >> 31) & y, arithmetic shift + __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); + __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a); + p = _mm_sub_epi32(p, t1); + p = _mm_sub_epi32(p, t2); + return p; +} + +#endif + +////////// UINT32 + +__m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U32_SHIFT_PATH) { + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + return _mm_srl_epi32(numers, libdivide_u32_to_m128i(shift)); + } + else { + __m128i q = libdivide_mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srl_epi32(t, libdivide_u32_to_m128i(shift)); + } + else { + // q >> denom->shift + return _mm_srl_epi32(q, libdivide_u32_to_m128i(more)); + } + } +} + +__m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t *denom) { + return _mm_srl_epi32(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); +} + +__m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t *denom) { + __m128i q = libdivide_mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + return _mm_srl_epi32(q, libdivide_u32_to_m128i(denom->more)); +} + +__m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t *denom) { + __m128i q = libdivide_mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srl_epi32(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); +} + +LIBDIVIDE_API __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srl_epi32(t, libdivide_u32_to_m128i(denom->more)); +} + +////////// UINT64 + +__m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U64_SHIFT_PATH) { + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + return _mm_srl_epi64(numers, libdivide_u32_to_m128i(shift)); + } + else { + __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide_u64_to_m128(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srl_epi64(t, libdivide_u32_to_m128i(shift)); + } + else { + // q >> denom->shift + return _mm_srl_epi64(q, libdivide_u32_to_m128i(more)); + } + } +} + +__m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct libdivide_u64_t *denom) { + return _mm_srl_epi64(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK)); +} + +__m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t *denom) { + __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide_u64_to_m128(denom->magic)); + return _mm_srl_epi64(q, 
libdivide_u32_to_m128i(denom->more)); +} + +__m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t *denom) { + __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide_u64_to_m128(denom->magic)); + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srl_epi64(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK)); +} + +__m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide_u64_to_m128(denom->magic)); + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srl_epi64(t, libdivide_u32_to_m128i(denom->more)); +} + +////////// SINT32 + +__m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_S32_SHIFT_PATH) { + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + // could use _mm_srli_epi32 with an all -1 register + __m128i roundToZeroTweak = _mm_set1_epi32(mask); + // q = numer + ((numer >> 31) & roundToZeroTweak); + __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(shift)); + // set all bits of shift mask = to the sign bit of more + __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); + // q = (q ^ shiftMask) - shiftMask; + q = _mm_sub_epi32(_mm_xor_si128(q, shiftMask), shiftMask); + return q; + } + else { + __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int32_t)(int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); + } + // q >>= shift + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) + return q; + } +} + +__m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t *denom) { + uint8_t shift = denom->more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + __m128i roundToZeroTweak = _mm_set1_epi32(mask); + __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + return _mm_sra_epi32(q, libdivide_u32_to_m128i(shift)); +} + +__m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t *denom) { + uint8_t shift = denom->more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + __m128i roundToZeroTweak = _mm_set1_epi32(mask); + __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + return _mm_sub_epi32(_mm_setzero_si128(), _mm_sra_epi32(q, libdivide_u32_to_m128i(shift))); +} + +__m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t *denom) { + __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + q = _mm_add_epi32(q, numers); + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); + return q; +} + +__m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t *denom) { + __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + q = _mm_sub_epi32(q, numers); + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & 
LIBDIVIDE_32_SHIFT_MASK)); + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); + return q; +} + +__m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) + return q; +} + +__m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom) { + int32_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int32_t)(int8_t)more >> 7); + + // libdivide_mullhi_s32(numers, magic); + __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(magic)); + q = _mm_add_epi32(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint32_t is_power_of_2 = (magic == 0); + __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 + __m128i mask = _mm_set1_epi32((1 << shift) - is_power_of_2); + q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm_srai_epi32(q, shift); // q >>= shift + q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign + return q; +} + +////////// SINT64 + __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; if (magic == 0) { // shift path - uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK; - __m128i roundToZeroTweak = libdivide__u64_to_m128((1ULL << shifter) - 1); - __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); // q = numer + ((numer >> 63) & roundToZeroTweak); - q = libdivide_s64_shift_right_vector(q, shifter); // q = q >> shifter + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = (1ULL << shift) - 1; + __m128i roundToZeroTweak = libdivide_u64_to_m128(mask); + // q = numer + ((numer >> 63) & roundToZeroTweak); + __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); + q = libdivide_s64_shift_right_vector(q, shift); __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); - q = _mm_sub_epi64(_mm_xor_si128(q, shiftMask), shiftMask); // q = (q ^ shiftMask) - shiftMask; + // q = (q ^ shiftMask) - shiftMask; + q = _mm_sub_epi64(_mm_xor_si128(q, shiftMask), shiftMask); return q; } else { - __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(magic)); + __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide_u64_to_m128(magic)); if (more & LIBDIVIDE_ADD_MARKER) { - __m128i sign = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); // must be arithmetic shift - q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); // q += ((numer ^ sign) - sign); + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); + // q += ((numer ^ sign) - sign); + q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); } // q >>= denom->mult_path.shift q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); @@ -1715,23 +1748,25 @@ __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *de } __m128i 
libdivide_s64_do_vector_alg0(__m128i numers, const struct libdivide_s64_t *denom) { - uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK; - __m128i roundToZeroTweak = libdivide__u64_to_m128((1ULL << shifter) - 1); + uint32_t shift = denom->more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = (1ULL << shift) - 1; + __m128i roundToZeroTweak = libdivide_u64_to_m128(mask); __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); - q = libdivide_s64_shift_right_vector(q, shifter); + q = libdivide_s64_shift_right_vector(q, shift); return q; } __m128i libdivide_s64_do_vector_alg1(__m128i numers, const struct libdivide_s64_t *denom) { - uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK; - __m128i roundToZeroTweak = libdivide__u64_to_m128((1ULL << shifter) - 1); + uint32_t shift = denom->more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = (1ULL << shift) - 1; + __m128i roundToZeroTweak = libdivide_u64_to_m128(mask); __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); - q = libdivide_s64_shift_right_vector(q, shifter); + q = libdivide_s64_shift_right_vector(q, shift); return _mm_sub_epi64(_mm_setzero_si128(), q); } __m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_t *denom) { - __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); + __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide_u64_to_m128(denom->magic)); q = _mm_add_epi64(q, numers); q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK); q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) @@ -1739,7 +1774,7 @@ __m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_ } __m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_t *denom) { - __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); + __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide_u64_to_m128(denom->magic)); q = _mm_sub_epi64(q, numers); q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK); q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) @@ -1747,7 +1782,7 @@ __m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_ } __m128i libdivide_s64_do_vector_alg4(__m128i numers, const struct libdivide_s64_t *denom) { - __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); + __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide_u64_to_m128(denom->magic)); q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK); q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); return q; @@ -1760,16 +1795,16 @@ __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivid // must be arithmetic shift __m128i sign = _mm_set1_epi32((int32_t)(int8_t)more >> 7); - // libdivide__mullhi_s64(numers, magic); - __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(magic)); + // libdivide_mullhi_s64(numers, magic); + __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide_u64_to_m128(magic)); q = _mm_add_epi64(q, numers); // q += numers // If q is non-negative, we have nothing to do. - // If q is negative, we want to add either (2**shift)-1 if d is a power of - // 2, or (2**shift) if it is not a power of 2. + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2. 
uint32_t is_power_of_2 = (magic == 0); __m128i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 - __m128i mask = libdivide__u64_to_m128((1ULL << shift) - is_power_of_2); + __m128i mask = libdivide_u64_to_m128((1ULL << shift) - is_power_of_2); q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign @@ -1797,12 +1832,22 @@ enum { namespace libdivide_internal { +#if defined(LIBDIVIDE_USE_SSE2) && \ + defined(__GNUC__) && \ + __GNUC__ >= 6 + // Using vector functions as template arguments causes many + // -Wignored-attributes compiler warnings with GCC 9. + // These warnings can safely be turned off. + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wignored-attributes" +#endif + #if defined(LIBDIVIDE_USE_SSE2) -#define MAYBE_VECTOR(X) X -#define MAYBE_VECTOR_PARAM(X) __m128i vector_func(__m128i, const X *) + #define MAYBE_VECTOR(X) X + #define MAYBE_VECTOR_PARAM(X) __m128i vector_func(__m128i, const X *) #else -#define MAYBE_VECTOR(X) 0 -#define MAYBE_VECTOR_PARAM(X) int unused + #define MAYBE_VECTOR(X) 0 + #define MAYBE_VECTOR_PARAM(X) int unused #endif // The following convenience macros are used to build a type of the base @@ -1837,115 +1882,121 @@ namespace libdivide_internal { libdivide_##TYPE##_crash, \ MAYBE_VECTOR(libdivide_##TYPE##_crash_vector)> - // Base divider, provides storage for the actual divider. - // @IntType: e.g. uint32_t - // @DenomType: e.g. libdivide_u32_t - // @gen_func(): e.g. libdivide_u32_gen - // @do_func(): e.g. libdivide_u32_do - // @MAYBE_VECTOR_PARAM: e.g. libdivide_u32_do_vector - template - struct base { - // Storage for the actual divider - DenomType denom; +// Base divider, provides storage for the actual divider. +// @T: e.g. uint32_t +// @DenomType: e.g. libdivide_u32_t +// @gen_func(): e.g. libdivide_u32_gen +// @do_func(): e.g. libdivide_u32_do +// @MAYBE_VECTOR_PARAM: e.g. libdivide_u32_do_vector +template +struct base { + // Storage for the actual divider + DenomType denom; - // Constructor that takes a divisor value, and applies the gen function - base(IntType d) : denom(gen_func(d)) { } + // Constructor that takes a divisor value, and applies the gen function + base(T d) : denom(gen_func(d)) { } - // Default constructor to allow uninitialized uses in e.g. arrays - base() {} + // Default constructor to allow uninitialized uses in e.g. arrays + base() {} - // Needed for unswitch - base(const DenomType& d) : denom(d) { } + // Needed for unswitch + base(const DenomType& d) : denom(d) { } - IntType perform_divide(IntType val) const { - return do_func(val, &denom); - } + T perform_divide(T val) const { + return do_func(val, &denom); + } #if defined(LIBDIVIDE_USE_SSE2) - __m128i perform_divide_vector(__m128i val) const { - return vector_func(val, &denom); - } + __m128i perform_divide_vector(__m128i val) const { + return vector_func(val, &denom); + } #endif - }; +}; - // Functions that will never be called but are required to be able - // to use unswitch in C++ template code. Unsigned has fewer algorithms - // than signed i.e. alg3 and alg4 are not defined for unsigned. In - // order to make templates compile we need to define unsigned alg3 and - // alg4 as crash functions. 
- uint32_t libdivide_u32_crash(uint32_t, const libdivide_u32_t *) { exit(-1); } - uint64_t libdivide_u64_crash(uint64_t, const libdivide_u64_t *) { exit(-1); } +// Functions that will never be called but are required to be able +// to use unswitch in C++ template code. Unsigned has fewer algorithms +// than signed i.e. alg3 and alg4 are not defined for unsigned. In +// order to make templates compile we need to define unsigned alg3 and +// alg4 as crash functions. +uint32_t libdivide_u32_crash(uint32_t, const libdivide_u32_t *) { exit(-1); } +uint64_t libdivide_u64_crash(uint64_t, const libdivide_u64_t *) { exit(-1); } #if defined(LIBDIVIDE_USE_SSE2) __m128i libdivide_u32_crash_vector(__m128i, const libdivide_u32_t *) { exit(-1); } __m128i libdivide_u64_crash_vector(__m128i, const libdivide_u64_t *) { exit(-1); } #endif - template struct dispatcher { }; +template struct dispatcher { }; - // Templated dispatch using partial specialization - template<> struct dispatcher { BRANCHFULL_DIVIDER(int32_t, s32) divider; }; - template<> struct dispatcher { BRANCHFREE_DIVIDER(int32_t, s32) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg0) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg1) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg2) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg3) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg4) divider; }; +// Templated dispatch using partial specialization +template<> struct dispatcher { BRANCHFULL_DIVIDER(int32_t, s32) divider; }; +template<> struct dispatcher { BRANCHFREE_DIVIDER(int32_t, s32) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg0) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg1) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg2) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg3) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg4) divider; }; - template<> struct dispatcher { BRANCHFULL_DIVIDER(uint32_t, u32) divider; }; - template<> struct dispatcher { BRANCHFREE_DIVIDER(uint32_t, u32) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER(uint32_t, u32, alg0) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER(uint32_t, u32, alg1) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER(uint32_t, u32, alg2) divider; }; - template<> struct dispatcher { CRASH_DIVIDER(uint32_t, u32) divider; }; - template<> struct dispatcher { CRASH_DIVIDER(uint32_t, u32) divider; }; +template<> struct dispatcher { BRANCHFULL_DIVIDER(uint32_t, u32) divider; }; +template<> struct dispatcher { BRANCHFREE_DIVIDER(uint32_t, u32) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER(uint32_t, u32, alg0) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER(uint32_t, u32, alg1) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER(uint32_t, u32, alg2) divider; }; +template<> struct dispatcher { CRASH_DIVIDER(uint32_t, u32) divider; }; +template<> struct dispatcher { CRASH_DIVIDER(uint32_t, u32) divider; }; - template<> struct dispatcher { BRANCHFULL_DIVIDER(int64_t, s64) divider; }; - template<> struct dispatcher { BRANCHFREE_DIVIDER(int64_t, s64) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg0) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, 
alg1) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg2) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg3) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg4) divider; }; +template<> struct dispatcher { BRANCHFULL_DIVIDER(int64_t, s64) divider; }; +template<> struct dispatcher { BRANCHFREE_DIVIDER(int64_t, s64) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg0) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg1) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg2) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg3) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg4) divider; }; - template<> struct dispatcher { BRANCHFULL_DIVIDER(uint64_t, u64) divider; }; - template<> struct dispatcher { BRANCHFREE_DIVIDER(uint64_t, u64) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER(uint64_t, u64, alg0) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER(uint64_t, u64, alg1) divider; }; - template<> struct dispatcher { ALGORITHM_DIVIDER(uint64_t, u64, alg2) divider; }; - template<> struct dispatcher { CRASH_DIVIDER(uint64_t, u64) divider; }; - template<> struct dispatcher { CRASH_DIVIDER(uint64_t, u64) divider; }; +template<> struct dispatcher { BRANCHFULL_DIVIDER(uint64_t, u64) divider; }; +template<> struct dispatcher { BRANCHFREE_DIVIDER(uint64_t, u64) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER(uint64_t, u64, alg0) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER(uint64_t, u64, alg1) divider; }; +template<> struct dispatcher { ALGORITHM_DIVIDER(uint64_t, u64, alg2) divider; }; +template<> struct dispatcher { CRASH_DIVIDER(uint64_t, u64) divider; }; +template<> struct dispatcher { CRASH_DIVIDER(uint64_t, u64) divider; }; - // Overloads that don't depend on the algorithm - inline int32_t recover(const libdivide_s32_t *s) { return libdivide_s32_recover(s); } - inline uint32_t recover(const libdivide_u32_t *s) { return libdivide_u32_recover(s); } - inline int64_t recover(const libdivide_s64_t *s) { return libdivide_s64_recover(s); } - inline uint64_t recover(const libdivide_u64_t *s) { return libdivide_u64_recover(s); } +#if defined(LIBDIVIDE_USE_SSE2) && \ + defined(__GNUC__) && \ + __GNUC__ >= 6 + #pragma GCC diagnostic pop +#endif - inline int32_t recover(const libdivide_s32_branchfree_t *s) { return libdivide_s32_branchfree_recover(s); } - inline uint32_t recover(const libdivide_u32_branchfree_t *s) { return libdivide_u32_branchfree_recover(s); } - inline int64_t recover(const libdivide_s64_branchfree_t *s) { return libdivide_s64_branchfree_recover(s); } - inline uint64_t recover(const libdivide_u64_branchfree_t *s) { return libdivide_u64_branchfree_recover(s); } +// Overloads that don't depend on the algorithm +inline int32_t recover(const libdivide_s32_t *s) { return libdivide_s32_recover(s); } +inline uint32_t recover(const libdivide_u32_t *s) { return libdivide_u32_recover(s); } +inline int64_t recover(const libdivide_s64_t *s) { return libdivide_s64_recover(s); } +inline uint64_t recover(const libdivide_u64_t *s) { return libdivide_u64_recover(s); } - inline int get_algorithm(const libdivide_s32_t *s) { return libdivide_s32_get_algorithm(s); } - inline int get_algorithm(const libdivide_u32_t *s) { return libdivide_u32_get_algorithm(s); } - inline int get_algorithm(const libdivide_s64_t *s) 
{ return libdivide_s64_get_algorithm(s); } - inline int get_algorithm(const libdivide_u64_t *s) { return libdivide_u64_get_algorithm(s); } +inline int32_t recover(const libdivide_s32_branchfree_t *s) { return libdivide_s32_branchfree_recover(s); } +inline uint32_t recover(const libdivide_u32_branchfree_t *s) { return libdivide_u32_branchfree_recover(s); } +inline int64_t recover(const libdivide_s64_branchfree_t *s) { return libdivide_s64_branchfree_recover(s); } +inline uint64_t recover(const libdivide_u64_branchfree_t *s) { return libdivide_u64_branchfree_recover(s); } - // Fallback for branchfree variants, which do not support unswitching - template int get_algorithm(const T *) { return -1; } -} +inline int get_algorithm(const libdivide_s32_t *s) { return libdivide_s32_get_algorithm(s); } +inline int get_algorithm(const libdivide_u32_t *s) { return libdivide_u32_get_algorithm(s); } +inline int get_algorithm(const libdivide_s64_t *s) { return libdivide_s64_get_algorithm(s); } +inline int get_algorithm(const libdivide_u64_t *s) { return libdivide_u64_get_algorithm(s); } + +// Fallback for branchfree variants, which do not support unswitching +template int get_algorithm(const T *) { return -1; } + +} // namespace libdivide_internal // This is the main divider class for use by the user (C++ API). // The divider itself is stored in the div variable who's // type is chosen by the dispatcher based on the template paramaters. template -class divider -{ +class divider { private: // Here's the actual divider typedef typename libdivide_internal::dispatcher::divider div_t; @@ -2019,14 +2070,14 @@ divider unswitch(const divider& d) { } // Overload of the / operator for scalar division -template -int_type operator/(int_type numer, const divider& denom) { +template +T operator/(T numer, const divider& denom) { return denom.perform_divide(numer); } // Overload of the /= operator for scalar division -template -int_type operator/=(int_type& numer, const divider& denom) { +template +T operator/=(T& numer, const divider& denom) { numer = denom.perform_divide(numer); return numer; } @@ -2034,14 +2085,14 @@ int_type operator/=(int_type& numer, const divider& denom) { #if defined(LIBDIVIDE_USE_SSE2) // Overload of the / operator for vector division -template -__m128i operator/(__m128i numer, const divider& denom) { +template +__m128i operator/(__m128i numer, const divider& denom) { return denom.perform_divide_vector(numer); } // Overload of the /= operator for vector division -template -__m128i operator/=(__m128i& numer, const divider& denom) { +template +__m128i operator/=(__m128i& numer, const divider& denom) { numer = denom.perform_divide_vector(numer); return numer; }
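
For reference, below is a minimal usage sketch of the C++ operator overloads touched by the final hunks (scalar and SSE2 vector division). It is an illustration only, not part of the patch: it assumes the header is included from third_party/libdivide.h, that LIBDIVIDE_USE_SSE2 is defined before inclusion so the __m128i overloads exist, and that the divider class is used through the usual libdivide::divider<T> spelling with its default algorithm parameter. The example file name is made up.

// divider_example.cpp (illustrative sketch, assumptions noted above)
#define LIBDIVIDE_USE_SSE2
#include "third_party/libdivide.h"

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // The divider constructor runs the (relatively expensive) gen step once,
    // caching the magic number and shift for 7; every division below is then
    // a multiply-and-shift instead of a hardware divide.
    libdivide::divider<uint32_t> d(7);

    // Scalar path: operator/(T, const divider&) and operator/=(T&, const divider&).
    uint32_t n = 1000;
    uint32_t q = n / d;   // 142
    n /= d;               // n is now 142
    printf("%u %u\n", q, n);

    // Vector path: divides four packed 32-bit lanes by the same divider.
    __m128i nums = _mm_set_epi32(28, 21, 14, 7);
    __m128i quot = nums / d;   // operator/(__m128i, const divider&)
    uint32_t out[4];
    _mm_storeu_si128((__m128i *)out, quot);
    printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);   // 1 2 3 4
    return 0;
}

The win comes from reuse: the divider is built once and then applied to many numerators, so each subsequent / or /= costs only a few multiply/shift/add instructions in either the scalar or the vector path.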