diff --git a/third_party/libdivide.h b/third_party/libdivide.h
index e938849..d1c3f8d 100644
--- a/third_party/libdivide.h
+++ b/third_party/libdivide.h
@@ -1,5 +1,5 @@
 // libdivide.h
-// Copyright 2010 - 2018 ridiculous_fish
+// Copyright 2010 - 2019 ridiculous_fish
 //
 // libdivide is dual-licensed under the Boost or zlib licenses.
 // You may use libdivide under the terms of either of these.
@@ -8,55 +8,76 @@
 #ifndef LIBDIVIDE_H
 #define LIBDIVIDE_H

-#if defined(_MSC_VER)
-// disable warning C4146: unary minus operator applied to
-// unsigned type, result still unsigned
-#pragma warning(disable: 4146)
-#define LIBDIVIDE_VC
-#endif
-
-#ifdef __cplusplus
-#include <cstdlib>
-#include <cstdio>
-#else
-#include <stdlib.h>
-#include <stdio.h>
-#endif
+#define LIBDIVIDE_VERSION "1.1"
+#define LIBDIVIDE_VERSION_MAJOR 1
+#define LIBDIVIDE_VERSION_MINOR 1

 #include <stdint.h>

+#if defined(__cplusplus)
+    #include <cstdlib>
+    #include <cstdio>
+#else
+    #include <stdlib.h>
+    #include <stdio.h>
+#endif
+
 #if defined(LIBDIVIDE_USE_SSE2)
-#include <emmintrin.h>
+    #include <emmintrin.h>
 #endif

-#if defined(LIBDIVIDE_VC)
-#include <intrin.h>
+// libdivide may use the pmuldq (vector signed 32x32->64 mult instruction)
+// which is in SSE 4.1. However, signed multiplication can be emulated
+// efficiently with unsigned multiplication, and SSE 4.1 is currently rare,
+// so it is OK to not turn this on.
+#if defined(LIBDIVIDE_USE_SSE4_1)
+    #include <smmintrin.h>
 #endif

-#ifndef __has_builtin
-#define __has_builtin(x) 0 // Compatibility with non-clang compilers.
+#if !defined(__has_builtin)
+    #define __has_builtin(x) 0
 #endif

 #if defined(__SIZEOF_INT128__)
-#define HAS_INT128_T
+    #define HAS_INT128_T
 #endif

-#if defined(__x86_64__) || defined(_WIN64) || defined(_M_X64)
-#define LIBDIVIDE_IS_X86_64
+#if defined(__x86_64__) || defined(_M_X64)
+    #define LIBDIVIDE_X86_64
 #endif

 #if defined(__i386__)
-#define LIBDIVIDE_IS_i386
+    #define LIBDIVIDE_i386
 #endif

 #if defined(__GNUC__) || defined(__clang__)
-#define LIBDIVIDE_GCC_STYLE_ASM
+    #define LIBDIVIDE_GCC_STYLE_ASM
 #endif

 #if defined(__cplusplus) || defined(LIBDIVIDE_VC)
-#define LIBDIVIDE_FUNCTION __FUNCTION__
+    #define LIBDIVIDE_FUNCTION __FUNCTION__
 #else
-#define LIBDIVIDE_FUNCTION __func__
+    #define LIBDIVIDE_FUNCTION __func__
+#endif
+
+#if defined(_MSC_VER)
+    #include <intrin.h>
+    // disable warning C4146: unary minus operator applied
+    // to unsigned type, result still unsigned
+    #pragma warning(disable: 4146)
+    #define LIBDIVIDE_VC
+
+    // _udiv128() is available in Visual Studio 2019
+    // or later on the x64 CPU architecture
+    #if defined(LIBDIVIDE_X86_64) && _MSC_VER >= 1920
+        #if !defined(__has_include)
+            #include <immintrin.h>
+            #define LIBDIVIDE_VC_UDIV128
+        #elif __has_include(<immintrin.h>)
+            #include <immintrin.h>
+            #define LIBDIVIDE_VC_UDIV128
+        #endif
+    #endif
 #endif

 #define LIBDIVIDE_ERROR(msg) \
@@ -67,24 +88,16 @@
 } while (0)

 #if defined(LIBDIVIDE_ASSERTIONS_ON)
-#define LIBDIVIDE_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", \
-                __LINE__, LIBDIVIDE_FUNCTION, #x); \
-            exit(-1); \
-        } \
-    } while (0)
+    #define LIBDIVIDE_ASSERT(x) \
+        do { \
+            if (!(x)) { \
+                fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", \
+                    __LINE__, LIBDIVIDE_FUNCTION, #x); \
+                exit(-1); \
+            } \
+        } while (0)
 #else
-#define LIBDIVIDE_ASSERT(x)
-#endif
-
-// libdivide may use the pmuldq (vector signed 32x32->64 mult instruction)
-// which is in SSE 4.1. However, signed multiplication can be emulated
-// efficiently with unsigned multiplication, and SSE 4.1 is currently rare, so
-// it is OK to not turn this on.
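// Usage sketch (not part of the patch): the scalar generate-then-divide API
// used below is declared further down in this header. The sketch assumes the
// libdivide_u32_gen() generator that pairs with libdivide_u32_do().
#include <stdio.h>
#include "libdivide.h"

int main(void) {
    // Precompute the magic-number divider once per divisor...
    struct libdivide_u32_t d7 = libdivide_u32_gen(7);
    uint32_t total = 0;
    for (uint32_t n = 1; n <= 100; n++) {
        // ...then each do() call replaces a hardware divide.
        total += libdivide_u32_do(n, &d7);    // same result as n / 7
    }
    printf("%u\n", total);                    // prints 679
    return 0;
}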
-#ifdef LIBDIVIDE_USE_SSE4_1 -#include + #define LIBDIVIDE_ASSERT(x) #endif #ifdef __cplusplus @@ -249,50 +262,24 @@ LIBDIVIDE_API int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivid LIBDIVIDE_API int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom); LIBDIVIDE_API int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom); -#if defined(LIBDIVIDE_USE_SSE2) - -LIBDIVIDE_API __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom); -LIBDIVIDE_API __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom); -LIBDIVIDE_API __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom); -LIBDIVIDE_API __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom); - -LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t *denom); -LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t *denom); -LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t *denom); - -LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t *denom); -LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t *denom); -LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t *denom); -LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t *denom); -LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t *denom); - -LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct libdivide_u64_t *denom); -LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t *denom); -LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t *denom); - -LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg0(__m128i numers, const struct libdivide_s64_t *denom); -LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg1(__m128i numers, const struct libdivide_s64_t *denom); -LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_t *denom); -LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_t *denom); -LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg4(__m128i numers, const struct libdivide_s64_t *denom); - -LIBDIVIDE_API __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom); -LIBDIVIDE_API __m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom); -LIBDIVIDE_API __m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom); -LIBDIVIDE_API __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom); - -#endif - //////// Internal Utility Functions -static inline uint32_t libdivide__mullhi_u32(uint32_t x, uint32_t y) { +static inline uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { uint64_t xl = x, yl = y; uint64_t rl = xl * yl; return (uint32_t)(rl >> 32); } -static uint64_t libdivide__mullhi_u64(uint64_t x, uint64_t y) { -#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_IS_X86_64) +static inline int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { + int64_t xl = x, yl = y; + int64_t rl = xl * yl; + // needs to 
be arithmetic shift + return (int32_t)(rl >> 32); +} + +static uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { +#if defined(LIBDIVIDE_VC) && \ + defined(LIBDIVIDE_X86_64) return __umulh(x, y); #elif defined(HAS_INT128_T) __uint128_t xl = x, yl = y; @@ -305,7 +292,7 @@ static uint64_t libdivide__mullhi_u64(uint64_t x, uint64_t y) { uint32_t x1 = (uint32_t)(x >> 32); uint32_t y0 = (uint32_t)(y & mask); uint32_t y1 = (uint32_t)(y >> 32); - uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0); + uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); uint64_t x0y1 = x0 * (uint64_t)y1; uint64_t x1y0 = x1 * (uint64_t)y0; uint64_t x1y1 = x1 * (uint64_t)y1; @@ -317,8 +304,9 @@ static uint64_t libdivide__mullhi_u64(uint64_t x, uint64_t y) { #endif } -static inline int64_t libdivide__mullhi_s64(int64_t x, int64_t y) { -#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_IS_X86_64) +static inline int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { +#if defined(LIBDIVIDE_VC) && \ + defined(LIBDIVIDE_X86_64) return __mulh(x, y); #elif defined(HAS_INT128_T) __int128_t xl = x, yl = y; @@ -331,7 +319,7 @@ static inline int64_t libdivide__mullhi_s64(int64_t x, int64_t y) { uint32_t y0 = (uint32_t)(y & mask); int32_t x1 = (int32_t)(x >> 32); int32_t y1 = (int32_t)(y >> 32); - uint32_t x0y0_hi = libdivide__mullhi_u32(x0, y0); + uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); int64_t t = x1 * (int64_t)y0 + x0y0_hi; int64_t w1 = x0 * (int64_t)y1 + (t & mask); @@ -339,141 +327,9 @@ static inline int64_t libdivide__mullhi_s64(int64_t x, int64_t y) { #endif } -#if defined(LIBDIVIDE_USE_SSE2) - -static inline __m128i libdivide__u64_to_m128(uint64_t x) { -#if defined(LIBDIVIDE_VC) && !defined(_WIN64) - // 64 bit windows doesn't seem to have an implementation of any of these - // load intrinsics, and 32 bit Visual C++ crashes - _declspec(align(16)) uint64_t temp[2] = {x, x}; - return _mm_load_si128((const __m128i*)temp); -#else - // everyone else gets it right - return _mm_set1_epi64x(x); -#endif -} - -static inline __m128i libdivide_get_FFFFFFFF00000000(void) { - // returns the same as _mm_set1_epi64(0xFFFFFFFF00000000ULL) - // without touching memory. - // optimizes to pcmpeqd on OS X - __m128i result = _mm_set1_epi8(-1); - return _mm_slli_epi64(result, 32); -} - -static inline __m128i libdivide_get_00000000FFFFFFFF(void) { - // returns the same as _mm_set1_epi64(0x00000000FFFFFFFFULL) - // without touching memory. - // optimizes to pcmpeqd on OS X - __m128i result = _mm_set1_epi8(-1); - result = _mm_srli_epi64(result, 32); - return result; -} - -static inline __m128i libdivide_s64_signbits(__m128i v) { - // we want to compute v >> 63, that is, _mm_srai_epi64(v, 63). But there - // is no 64 bit shift right arithmetic instruction in SSE2. So we have to - // fake it by first duplicating the high 32 bit values, and then using a 32 - // bit shift. Another option would be to use _mm_srli_epi64(v, 63) and - // then subtract that from 0, but that approach appears to be substantially - // slower for unknown reasons - __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); - __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); - return signBits; -} - -// Returns an __m128i whose low 32 bits are equal to amt and has zero elsewhere. -static inline __m128i libdivide_u32_to_m128i(uint32_t amt) { - return _mm_set_epi32(0, 0, 0, amt); -} - -static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) { - // implementation of _mm_sra_epi64. 
Here we have two 64 bit values which - // are shifted right to logically become (64 - amt) values, and are then - // sign extended from a (64 - amt) bit number. - const int b = 64 - amt; - __m128i m = libdivide__u64_to_m128(1ULL << (b - 1)); - __m128i x = _mm_srl_epi64(v, libdivide_u32_to_m128i(amt)); - __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); // result = x^m - m - return result; -} - -// Here, b is assumed to contain one 32 bit value repeated four times. -// If it did not, the function would not work. -static inline __m128i libdivide__mullhi_u32_flat_vector(__m128i a, __m128i b) { - __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); - __m128i a1X3X = _mm_srli_epi64(a, 32); - __m128i mask = libdivide_get_FFFFFFFF00000000(); - __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask); - return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); // = hi_product_0123 -} - -// Here, y is assumed to contain one 64 bit value repeated twice. -static inline __m128i libdivide_mullhi_u64_flat_vector(__m128i x, __m128i y) { - // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) - __m128i mask = libdivide_get_00000000FFFFFFFF(); - // x0 is low half of 2 64 bit values, x1 is high half in low slots - __m128i x0 = _mm_and_si128(x, mask); - __m128i x1 = _mm_srli_epi64(x, 32); - __m128i y0 = _mm_and_si128(y, mask); - __m128i y1 = _mm_srli_epi64(y, 32); - // x0 happens to have the low half of the two 64 bit values in 32 bit slots - // 0 and 2, so _mm_mul_epu32 computes their full product, and then we shift - // right by 32 to get just the high values - __m128i x0y0_hi = _mm_srli_epi64(_mm_mul_epu32(x0, y0), 32); - __m128i x0y1 = _mm_mul_epu32(x0, y1); - __m128i x1y0 = _mm_mul_epu32(x1, y0); - __m128i x1y1 = _mm_mul_epu32(x1, y1); - __m128i temp = _mm_add_epi64(x1y0, x0y0_hi); - __m128i temp_lo = _mm_and_si128(temp, mask); - __m128i temp_hi = _mm_srli_epi64(temp, 32); - temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32); - temp_hi = _mm_add_epi64(x1y1, temp_hi); - - return _mm_add_epi64(temp_lo, temp_hi); -} - -// y is one 64 bit value repeated twice -static inline __m128i libdivide_mullhi_s64_flat_vector(__m128i x, __m128i y) { - __m128i p = libdivide_mullhi_u64_flat_vector(x, y); - __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y); - p = _mm_sub_epi64(p, t1); - __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x); - p = _mm_sub_epi64(p, t2); - return p; -} - -#ifdef LIBDIVIDE_USE_SSE4_1 - -// b is one 32 bit value repeated four times. -static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) { - __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epi32(a, b), 32); - __m128i a1X3X = _mm_srli_epi64(a, 32); - __m128i mask = libdivide_get_FFFFFFFF00000000(); - __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epi32(a1X3X, b), mask); - return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); // = hi_product_0123 -} - -#else - -// SSE2 does not have a signed multiplication instruction, but we can convert -// unsigned to signed pretty efficiently. Again, b is just a 32 bit value -// repeated four times. 
-static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) { - __m128i p = libdivide__mullhi_u32_flat_vector(a, b); - __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); // t1 = (a >> 31) & y, arithmetic shift - __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a); - p = _mm_sub_epi32(p, t1); - p = _mm_sub_epi32(p, t2); - return p; -} - -#endif // LIBDIVIDE_USE_SSE4_1 - -#endif // LIBDIVIDE_USE_SSE2 - -static inline int32_t libdivide__count_leading_zeros32(uint32_t val) { -#if defined(__GNUC__) || __has_builtin(__builtin_clz) +static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { +#if defined(__GNUC__) || \ + __has_builtin(__builtin_clz) // Fast way to count leading zeros return __builtin_clz(val); #elif defined(LIBDIVIDE_VC) @@ -483,19 +339,18 @@ static inline int32_t libdivide__count_leading_zeros32(uint32_t val) { } return 0; #else - int32_t result = 0; - uint32_t hi = 1U << 31; - - while (~val & hi) { - hi >>= 1; - result++; - } - return result; + int32_t result = 0; + uint32_t hi = 1U << 31; + for (; ~val & hi; hi >>= 1) { + result++; + } + return result; #endif } -static inline int32_t libdivide__count_leading_zeros64(uint64_t val) { -#if defined(__GNUC__) || __has_builtin(__builtin_clzll) +static inline int32_t libdivide_count_leading_zeros64(uint64_t val) { +#if defined(__GNUC__) || \ + __has_builtin(__builtin_clzll) // Fast way to count leading zeros return __builtin_clzll(val); #elif defined(LIBDIVIDE_VC) && defined(_WIN64) @@ -507,61 +362,60 @@ static inline int32_t libdivide__count_leading_zeros64(uint64_t val) { #else uint32_t hi = val >> 32; uint32_t lo = val & 0xFFFFFFFF; - if (hi != 0) return libdivide__count_leading_zeros32(hi); - return 32 + libdivide__count_leading_zeros32(lo); + if (hi != 0) return libdivide_count_leading_zeros32(hi); + return 32 + libdivide_count_leading_zeros32(lo); #endif } -#if (defined(LIBDIVIDE_IS_i386) || defined(LIBDIVIDE_IS_X86_64)) && \ - defined(LIBDIVIDE_GCC_STYLE_ASM) - -// libdivide_64_div_32_to_32: divides a 64 bit uint {u1, u0} by a 32 bit +// libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit // uint {v}. The result must fit in 32 bits. // Returns the quotient directly and the remainder in *r static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { +#if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && \ + defined(LIBDIVIDE_GCC_STYLE_ASM) uint32_t result; __asm__("divl %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1) ); return result; -} - #else - -static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { - uint64_t n = (((uint64_t)u1) << 32) | u0; + uint64_t n = ((uint64_t)u1 << 32) | u0; uint32_t result = (uint32_t)(n / v); *r = (uint32_t)(n - result * (uint64_t)v); return result; +#endif } -#endif - -#if defined(LIBDIVIDE_IS_X86_64) && \ - defined(LIBDIVIDE_GCC_STYLE_ASM) - +// libdivide_128_div_64_to_64: divides a 128-bit uint {u1, u0} by a 64-bit +// uint {v}. The result must fit in 64 bits. 
+// Returns the quotient directly and the remainder in *r static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) { - // u0 -> rax - // u1 -> rdx - // divq +#if defined(LIBDIVIDE_VC_UDIV128) + return _udiv128(u1, u0, v, r); + +#elif defined(LIBDIVIDE_X86_64) && \ + defined(LIBDIVIDE_GCC_STYLE_ASM) + uint64_t result; __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1) ); return result; -} +#elif defined(HAS_INT128_T) + __uint128_t n = ((__uint128_t)u1 << 64) | u0; + uint64_t result = (uint64_t)(n / v); + *r = (uint64_t)(n - result * (__uint128_t)v); + return result; #else + // Code taken from Hacker's Delight: + // http://www.hackersdelight.org/HDcode/divlu.c. + // License permits inclusion here per: + // http://www.hackersdelight.org/permissions.htm -// Code taken from Hacker's Delight: -// http://www.hackersdelight.org/HDcode/divlu.c. -// License permits inclusion here per: -// http://www.hackersdelight.org/permissions.htm - -static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) { - const uint64_t b = (1ULL << 32); // Number base (16 bits) + const uint64_t b = (1ULL << 32); // Number base (32 bits) uint64_t un1, un0; // Norm. dividend LSD's uint64_t vn1, vn0; // Norm. divisor digits uint64_t q1, q0; // Quotient digits @@ -572,21 +426,23 @@ static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, // If overflow, set rem. to an impossible value, // and return the largest possible quotient if (u1 >= v) { - if (r != NULL) - *r = (uint64_t) -1; + *r = (uint64_t) -1; return (uint64_t) -1; } // count leading zeros - s = libdivide__count_leading_zeros64(v); + s = libdivide_count_leading_zeros64(v); if (s > 0) { // Normalize divisor v = v << s; - un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31)); + un64 = (u1 << s) | (u0 >> (64 - s)); un10 = u0 << s; // Shift dividend left } else { - // Avoid undefined behavior - un64 = u1 | u0; + // Avoid undefined behavior of (u0 >> 64). + // The behavior is undefined if the right operand is + // negative, or greater than or equal to the length + // in bits of the promoted left operand. + un64 = u1; un10 = u0; } @@ -623,18 +479,13 @@ static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, break; } - // If remainder is wanted, return it - if (r != NULL) - *r = (un21 * b + un0 - q0 * v) >> s; - + *r = (un21 * b + un0 - q0 * v) >> s; return q1 * b + q0; +#endif } -#endif - // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0) -static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t signed_shift) -{ +static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t signed_shift) { if (signed_shift > 0) { uint32_t shift = signed_shift; *u1 <<= shift; @@ -678,7 +529,7 @@ static uint64_t libdivide_128_div_128_to_64(uint64_t u_hi, uint64_t u_lo, uint64 // Here v >= 2**64 // We know that v.hi != 0, so count leading zeros is OK // We have 0 <= n <= 63 - uint32_t n = libdivide__count_leading_zeros64(v.hi); + uint32_t n = libdivide_count_leading_zeros64(v.hi); // Normalize the divisor so its MSB is 1 u128_t v1t = v; @@ -713,7 +564,7 @@ static uint64_t libdivide_128_div_128_to_64(uint64_t u_hi, uint64_t u_lo, uint64 // Each term is 128 bit // High half of full product (upper 128 bits!) 
are dropped u128_t q0v = {0, 0}; - q0v.hi = q0.hi*v.lo + q0.lo*v.hi + libdivide__mullhi_u64(q0.lo, v.lo); + q0v.hi = q0.hi*v.lo + q0.lo*v.hi + libdivide_mullhi_u64(q0.lo, v.lo); q0v.lo = q0.lo*v.lo; // Compute u - q0v as u_q0v @@ -751,18 +602,18 @@ static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int } struct libdivide_u32_t result; - uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(d); + uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d); if ((d & (d - 1)) == 0) { // Power of 2 if (! branchfree) { result.magic = 0; - result.more = floor_log_2_d | LIBDIVIDE_U32_SHIFT_PATH; + result.more = (uint8_t)(floor_log_2_d | LIBDIVIDE_U32_SHIFT_PATH); } else { // We want a magic number of 2**32 and a shift of floor_log_2_d // but one of the shifts is taken up by LIBDIVIDE_ADD_MARKER, // so we subtract 1 from the shift result.magic = 0; - result.more = (floor_log_2_d-1) | LIBDIVIDE_ADD_MARKER; + result.more = (uint8_t)((floor_log_2_d-1) | LIBDIVIDE_ADD_MARKER); } } else { uint8_t more; @@ -817,13 +668,14 @@ uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { return numer >> (more & LIBDIVIDE_32_SHIFT_MASK); } else { - uint32_t q = libdivide__mullhi_u32(denom->magic, numer); + uint32_t q = libdivide_mullhi_u32(denom->magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { uint32_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_32_SHIFT_MASK); } else { - // all upper bits are 0 - don't need to mask them off + // All upper bits are 0, + // don't need to mask them off. return q >> more; } } @@ -874,9 +726,12 @@ uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_ int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom) { uint8_t more = denom->more; - if (more & LIBDIVIDE_U32_SHIFT_PATH) return 0; - else if (!(more & LIBDIVIDE_ADD_MARKER)) return 1; - else return 2; + if (more & LIBDIVIDE_U32_SHIFT_PATH) + return 0; + else if (!(more & LIBDIVIDE_ADD_MARKER)) + return 1; + else + return 2; } uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom) { @@ -884,73 +739,26 @@ uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *den } uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom) { - uint32_t q = libdivide__mullhi_u32(denom->magic, numer); + uint32_t q = libdivide_mullhi_u32(denom->magic, numer); return q >> denom->more; } uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom) { // denom->add != 0 - uint32_t q = libdivide__mullhi_u32(denom->magic, numer); + uint32_t q = libdivide_mullhi_u32(denom->magic, numer); uint32_t t = ((numer - q) >> 1) + q; - // Note that this mask is typically free. Only the low bits are meaningful - // to a shift, so compilers can optimize out this AND. + // Note that this mask is typically free. Only the low bits are + // meaningful to a shift, so compilers can optimize this out. 
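    // Worked example for d = 7 (constants as produced by
    // libdivide_internal_u32_gen() above): magic = 0x24924925 and
    // more = 2 | LIBDIVIDE_ADD_MARKER, i.e. an effective 33-bit multiplier
    // of 2^32 + 0x24924925 = ceil(2^35 / 7). For numer = 100:
    //     q = mullhi(0x24924925, 100)  = 14
    //     t = ((100 - 14) >> 1) + 14   = 57
    //     t >> 2                       = 14 = 100 / 7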
return t >> (denom->more & LIBDIVIDE_32_SHIFT_MASK); } // same as algo 2 uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { - uint32_t q = libdivide__mullhi_u32(denom->magic, numer); + uint32_t q = libdivide_mullhi_u32(denom->magic, numer); uint32_t t = ((numer - q) >> 1) + q; return t >> denom->more; } -#if defined(LIBDIVIDE_USE_SSE2) - -__m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) { - uint8_t more = denom->more; - if (more & LIBDIVIDE_U32_SHIFT_PATH) { - return _mm_srl_epi32(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); - } - else { - __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - if (more & LIBDIVIDE_ADD_MARKER) { - // uint32_t t = ((numer - q) >> 1) + q; - // return t >> denom->shift; - __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); - return _mm_srl_epi32(t, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); - - } - else { - // q >> denom->shift - return _mm_srl_epi32(q, libdivide_u32_to_m128i(more)); - } - } -} - -__m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t *denom) { - return _mm_srl_epi32(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); -} - -__m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t *denom) { - __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - return _mm_srl_epi32(q, libdivide_u32_to_m128i(denom->more)); -} - -__m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t *denom) { - __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); - return _mm_srl_epi32(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); -} - -// same as algo 2 -LIBDIVIDE_API __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom) { - __m128i q = libdivide__mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); - return _mm_srl_epi32(t, libdivide_u32_to_m128i(denom->more)); -} - -#endif - /////////// UINT64 static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int branchfree) { @@ -959,7 +767,7 @@ static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int } struct libdivide_u64_t result; - uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(d); + uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d); if ((d & (d - 1)) == 0) { // Power of 2 if (! branchfree) { @@ -968,7 +776,7 @@ static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int } else { // We want a magic number of 2**64 and a shift of floor_log_2_d // but one of the shifts is taken up by LIBDIVIDE_ADD_MARKER, - // so we subtract 1 from the shift + // so we subtract 1 from the shift. 
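            // In other words, with magic = 0 the branchfree path computes
            //     q = mullhi(0, numer) = 0
            //     t = ((numer - 0) >> 1) + 0 = numer >> 1
            //     t >> (floor_log_2_d - 1)   = numer >> floor_log_2_d
            // which is exactly the power-of-2 division.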
result.magic = 0; result.more = (floor_log_2_d-1) | LIBDIVIDE_ADD_MARKER; } @@ -1027,13 +835,14 @@ uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { return numer >> (more & LIBDIVIDE_64_SHIFT_MASK); } else { - uint64_t q = libdivide__mullhi_u64(denom->magic, numer); + uint64_t q = libdivide_mullhi_u64(denom->magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { uint64_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_64_SHIFT_MASK); } else { - // all upper bits are 0 - don't need to mask them off + // All upper bits are 0, + // don't need to mask them off. return q >> more; } } @@ -1079,8 +888,8 @@ uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); // We computed 2^(64+shift)/(m+2^64) // Double the remainder ('dr') and check if that is larger than d - // Note that d is a 65 bit value, so r1 is small and so r1 + r1 cannot - // overflow + // Note that d is a 65 bit value, so r1 is small and so r1 + r1 + // cannot overflow uint64_t dr_lo = r_lo + r_lo; uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); @@ -1096,9 +905,12 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ int libdivide_u64_get_algorithm(const struct libdivide_u64_t *denom) { uint8_t more = denom->more; - if (more & LIBDIVIDE_U64_SHIFT_PATH) return 0; - else if (!(more & LIBDIVIDE_ADD_MARKER)) return 1; - else return 2; + if (more & LIBDIVIDE_U64_SHIFT_PATH) + return 0; + else if (!(more & LIBDIVIDE_ADD_MARKER)) + return 1; + else + return 2; } uint64_t libdivide_u64_do_alg0(uint64_t numer, const struct libdivide_u64_t *denom) { @@ -1106,77 +918,25 @@ uint64_t libdivide_u64_do_alg0(uint64_t numer, const struct libdivide_u64_t *den } uint64_t libdivide_u64_do_alg1(uint64_t numer, const struct libdivide_u64_t *denom) { - uint64_t q = libdivide__mullhi_u64(denom->magic, numer); + uint64_t q = libdivide_mullhi_u64(denom->magic, numer); return q >> denom->more; } uint64_t libdivide_u64_do_alg2(uint64_t numer, const struct libdivide_u64_t *denom) { - uint64_t q = libdivide__mullhi_u64(denom->magic, numer); + uint64_t q = libdivide_mullhi_u64(denom->magic, numer); uint64_t t = ((numer - q) >> 1) + q; return t >> (denom->more & LIBDIVIDE_64_SHIFT_MASK); } // same as alg 2 uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { - uint64_t q = libdivide__mullhi_u64(denom->magic, numer); + uint64_t q = libdivide_mullhi_u64(denom->magic, numer); uint64_t t = ((numer - q) >> 1) + q; return t >> denom->more; } -#if defined(LIBDIVIDE_USE_SSE2) - -__m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom) { - uint8_t more = denom->more; - if (more & LIBDIVIDE_U64_SHIFT_PATH) { - return _mm_srl_epi64(numers, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK)); - } - else { - __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); - if (more & LIBDIVIDE_ADD_MARKER) { - // uint32_t t = ((numer - q) >> 1) + q; - // return t >> denom->shift; - __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); - return _mm_srl_epi64(t, libdivide_u32_to_m128i(more & LIBDIVIDE_64_SHIFT_MASK)); - } - else { - // q >> denom->shift - return _mm_srl_epi64(q, libdivide_u32_to_m128i(more)); - } - } -} - -__m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct 
libdivide_u64_t *denom) { - return _mm_srl_epi64(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK)); -} - -__m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t *denom) { - __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); - return _mm_srl_epi64(q, libdivide_u32_to_m128i(denom->more)); -} - -__m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t *denom) { - __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); - __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); - return _mm_srl_epi64(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK)); -} - -__m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom) { - __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide__u64_to_m128(denom->magic)); - __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); - return _mm_srl_epi64(t, libdivide_u32_to_m128i(denom->more)); -} - -#endif - /////////// SINT32 -static inline int32_t libdivide__mullhi_s32(int32_t x, int32_t y) { - int64_t xl = x, yl = y; - int64_t rl = xl * yl; - // needs to be arithmetic shift - return (int32_t)(rl >> 32); -} - static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); @@ -1192,7 +952,7 @@ static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int b // and is a power of 2. uint32_t ud = (uint32_t)d; uint32_t absD = (d < 0) ? -ud : ud; - uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(absD); + uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD); // check if exactly one bit is set, // don't care if absD is 0 since that's divide by zero if ((absD & (absD - 1)) == 0) { @@ -1262,14 +1022,15 @@ int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (more & LIBDIVIDE_S32_SHIFT_PATH) { uint32_t sign = (int8_t)more >> 7; - uint8_t shifter = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t uq = (uint32_t)(numer + ((numer >> 31) & ((1U << shifter) - 1))); + uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + uint32_t uq = numer + ((numer >> 31) & mask); int32_t q = (int32_t)uq; - q = q >> shifter; + q = q >> shift; q = (q ^ sign) - sign; return q; } else { - uint32_t uq = (uint32_t)libdivide__mullhi_s32(denom->magic, numer); + uint32_t uq = (uint32_t)libdivide_mullhi_s32(denom->magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift and then sign extend int32_t sign = (int8_t)more >> 7; @@ -1289,7 +1050,7 @@ int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_br // must be arithmetic shift and then sign extend int32_t sign = (int8_t)more >> 7; int32_t magic = denom->magic; - int32_t q = libdivide__mullhi_s32(magic, numer); + int32_t q = libdivide_mullhi_s32(magic, numer); q += numer; // If q is non-negative, we have nothing to do @@ -1351,25 +1112,30 @@ int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t int libdivide_s32_get_algorithm(const struct libdivide_s32_t *denom) { uint8_t more = denom->more; int positiveDivisor = !(more & LIBDIVIDE_NEGATIVE_DIVISOR); - if (more & LIBDIVIDE_S32_SHIFT_PATH) return (positiveDivisor ? 0 : 1); - else if (more & LIBDIVIDE_ADD_MARKER) return (positiveDivisor ? 
2 : 3); - else return 4; + if (more & LIBDIVIDE_S32_SHIFT_PATH) + return (positiveDivisor ? 0 : 1); + else if (more & LIBDIVIDE_ADD_MARKER) + return (positiveDivisor ? 2 : 3); + else + return 4; } int32_t libdivide_s32_do_alg0(int32_t numer, const struct libdivide_s32_t *denom) { - uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; - int32_t q = numer + ((numer >> 31) & ((1U << shifter) - 1)); - return q >> shifter; + uint8_t shift = denom->more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + int32_t q = numer + ((numer >> 31) & mask); + return q >> shift; } int32_t libdivide_s32_do_alg1(int32_t numer, const struct libdivide_s32_t *denom) { - uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; - int32_t q = numer + ((numer >> 31) & ((1U << shifter) - 1)); - return - (q >> shifter); + uint8_t shift = denom->more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + int32_t q = numer + ((numer >> 31) & mask); + return - (q >> shift); } int32_t libdivide_s32_do_alg2(int32_t numer, const struct libdivide_s32_t *denom) { - int32_t q = libdivide__mullhi_s32(denom->magic, numer); + int32_t q = libdivide_mullhi_s32(denom->magic, numer); q += numer; q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK; q += (q < 0); @@ -1377,7 +1143,7 @@ int32_t libdivide_s32_do_alg2(int32_t numer, const struct libdivide_s32_t *denom } int32_t libdivide_s32_do_alg3(int32_t numer, const struct libdivide_s32_t *denom) { - int32_t q = libdivide__mullhi_s32(denom->magic, numer); + int32_t q = libdivide_mullhi_s32(denom->magic, numer); q -= numer; q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK; q += (q < 0); @@ -1385,100 +1151,12 @@ int32_t libdivide_s32_do_alg3(int32_t numer, const struct libdivide_s32_t *denom } int32_t libdivide_s32_do_alg4(int32_t numer, const struct libdivide_s32_t *denom) { - int32_t q = libdivide__mullhi_s32(denom->magic, numer); + int32_t q = libdivide_mullhi_s32(denom->magic, numer); q >>= denom->more & LIBDIVIDE_32_SHIFT_MASK; q += (q < 0); return q; } -#if defined(LIBDIVIDE_USE_SSE2) - -__m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom) { - uint8_t more = denom->more; - if (more & LIBDIVIDE_S32_SHIFT_PATH) { - uint32_t shifter = more & LIBDIVIDE_32_SHIFT_MASK; - __m128i roundToZeroTweak = _mm_set1_epi32((1U << shifter) - 1); // could use _mm_srli_epi32 with an all -1 register - __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); //q = numer + ((numer >> 31) & roundToZeroTweak); - q = _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter)); // q = q >> shifter - __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); // set all bits of shift mask = to the sign bit of more - q = _mm_sub_epi32(_mm_xor_si128(q, shiftMask), shiftMask); // q = (q ^ shiftMask) - shiftMask; - return q; - } - else { - __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - if (more & LIBDIVIDE_ADD_MARKER) { - __m128i sign = _mm_set1_epi32((int32_t)(int8_t)more >> 7); // must be arithmetic shift - q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); // q += ((numer ^ sign) - sign); - } - q = _mm_sra_epi32(q, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); // q >>= shift - q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) - return q; - } -} - -__m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t *denom) { - uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; - __m128i roundToZeroTweak = _mm_set1_epi32((1U << 
shifter) - 1); - __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); - return _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter)); -} - -__m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t *denom) { - uint8_t shifter = denom->more & LIBDIVIDE_32_SHIFT_MASK; - __m128i roundToZeroTweak = _mm_set1_epi32((1U << shifter) - 1); - __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); - return _mm_sub_epi32(_mm_setzero_si128(), _mm_sra_epi32(q, libdivide_u32_to_m128i(shifter))); -} - -__m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t *denom) { - __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - q = _mm_add_epi32(q, numers); - q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); - q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); - return q; -} - -__m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t *denom) { - __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - q = _mm_sub_epi32(q, numers); - q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); - q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); - return q; -} - -__m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t *denom) { - uint8_t more = denom->more; - __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); - q = _mm_sra_epi32(q, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); //q >>= shift - q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) - return q; -} - -__m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom) { - int32_t magic = denom->magic; - uint8_t more = denom->more; - uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - // must be arithmetic shift - __m128i sign = _mm_set1_epi32((int32_t)(int8_t)more >> 7); - - // libdivide__mullhi_s32(numers, magic); - __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(magic)); - q = _mm_add_epi32(q, numers); // q += numers - - // If q is non-negative, we have nothing to do - // If q is negative, we want to add either (2**shift)-1 if d is a power of - // 2, or (2**shift) if it is not a power of 2 - uint32_t is_power_of_2 = (magic == 0); - __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 - __m128i mask = _mm_set1_epi32((1 << shift) - is_power_of_2); - q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) - q = _mm_srai_epi32(q, shift); //q >>= shift - q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign - return q; -} - -#endif - ///////////// SINT64 static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int branchfree) { @@ -1496,7 +1174,7 @@ static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int b // and is a power of 2. uint64_t ud = (uint64_t)d; uint64_t absD = (d < 0) ? 
-ud : ud; - uint32_t floor_log_2_d = 63 - libdivide__count_leading_zeros64(absD); + uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD); // check if exactly one bit is set, // don't care if absD is 0 since that's divide by zero if ((absD & (absD - 1)) == 0) { @@ -1566,16 +1244,17 @@ int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; if (magic == 0) { //shift path - uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t uq = (uint64_t)numer + ((numer >> 63) & ((1ULL << shifter) - 1)); + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = (1ULL << shift) - 1; + uint64_t uq = numer + ((numer >> 63) & mask); int64_t q = (int64_t)uq; - q = q >> shifter; + q = q >> shift; // must be arithmetic shift and then sign-extend int64_t shiftMask = (int8_t)more >> 7; q = (q ^ shiftMask) - shiftMask; return q; } else { - uint64_t uq = (uint64_t)libdivide__mullhi_s64(magic, numer); + uint64_t uq = (uint64_t)libdivide_mullhi_s64(magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift and then sign extend int64_t sign = (int8_t)more >> 7; @@ -1594,7 +1273,7 @@ int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_br // must be arithmetic shift and then sign extend int64_t sign = (int8_t)more >> 7; int64_t magic = denom->magic; - int64_t q = libdivide__mullhi_s64(magic, numer); + int64_t q = libdivide_mullhi_s64(magic, numer); q += numer; // If q is non-negative, we have nothing to do. @@ -1646,26 +1325,31 @@ int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t int libdivide_s64_get_algorithm(const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int positiveDivisor = !(more & LIBDIVIDE_NEGATIVE_DIVISOR); - if (denom->magic == 0) return (positiveDivisor ? 0 : 1); // shift path - else if (more & LIBDIVIDE_ADD_MARKER) return (positiveDivisor ? 2 : 3); - else return 4; + if (denom->magic == 0) + return (positiveDivisor ? 0 : 1); // shift path + else if (more & LIBDIVIDE_ADD_MARKER) + return (positiveDivisor ? 
2 : 3); + else + return 4; } int64_t libdivide_s64_do_alg0(int64_t numer, const struct libdivide_s64_t *denom) { - uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK; - int64_t q = numer + ((numer >> 63) & ((1ULL << shifter) - 1)); - return q >> shifter; + uint32_t shift = denom->more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = (1ULL << shift) - 1; + int64_t q = numer + ((numer >> 63) & mask); + return q >> shift; } int64_t libdivide_s64_do_alg1(int64_t numer, const struct libdivide_s64_t *denom) { - // denom->shifter != -1 && demo->shiftMask != 0 - uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK; - int64_t q = numer + ((numer >> 63) & ((1ULL << shifter) - 1)); - return - (q >> shifter); + // denom->shift != -1 && demo->shiftMask != 0 + uint32_t shift = denom->more & LIBDIVIDE_64_SHIFT_MASK; + uint64_t mask = (1ULL << shift) - 1; + int64_t q = numer + ((numer >> 63) & mask); + return - (q >> shift); } int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivide_s64_t *denom) { - int64_t q = libdivide__mullhi_s64(denom->magic, numer); + int64_t q = libdivide_mullhi_s64(denom->magic, numer); q += numer; q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK; q += (q < 0); @@ -1673,7 +1357,7 @@ int64_t libdivide_s64_do_alg2(int64_t numer, const struct libdivide_s64_t *denom } int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom) { - int64_t q = libdivide__mullhi_s64(denom->magic, numer); + int64_t q = libdivide_mullhi_s64(denom->magic, numer); q -= numer; q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK; q += (q < 0); @@ -1681,7 +1365,7 @@ int64_t libdivide_s64_do_alg3(int64_t numer, const struct libdivide_s64_t *denom } int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom) { - int64_t q = libdivide__mullhi_s64(denom->magic, numer); + int64_t q = libdivide_mullhi_s64(denom->magic, numer); q >>= denom->more & LIBDIVIDE_64_SHIFT_MASK; q += (q < 0); return q; @@ -1689,23 +1373,372 @@ int64_t libdivide_s64_do_alg4(int64_t numer, const struct libdivide_s64_t *denom #if defined(LIBDIVIDE_USE_SSE2) +LIBDIVIDE_API __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom); +LIBDIVIDE_API __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom); +LIBDIVIDE_API __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom); +LIBDIVIDE_API __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom); + +LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t *denom); +LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t *denom); +LIBDIVIDE_API __m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t *denom); + +LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t *denom); +LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t *denom); +LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t *denom); +LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t *denom); +LIBDIVIDE_API __m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t *denom); + +LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct libdivide_u64_t *denom); +LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t 
*denom); +LIBDIVIDE_API __m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t *denom); + +LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg0(__m128i numers, const struct libdivide_s64_t *denom); +LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg1(__m128i numers, const struct libdivide_s64_t *denom); +LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_t *denom); +LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_t *denom); +LIBDIVIDE_API __m128i libdivide_s64_do_vector_alg4(__m128i numers, const struct libdivide_s64_t *denom); + +LIBDIVIDE_API __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom); +LIBDIVIDE_API __m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom); +LIBDIVIDE_API __m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom); +LIBDIVIDE_API __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom); + +//////// Internal Utility Functions + +static inline __m128i libdivide_u64_to_m128(uint64_t x) { + return _mm_set1_epi64x(x); +} + +static inline __m128i libdivide_get_FFFFFFFF00000000(void) { + // returns the same as _mm_set1_epi64(0xFFFFFFFF00000000ULL) + // without touching memory. + // optimizes to pcmpeqd on OS X + __m128i result = _mm_set1_epi8(-1); + return _mm_slli_epi64(result, 32); +} + +static inline __m128i libdivide_get_00000000FFFFFFFF(void) { + // returns the same as _mm_set1_epi64(0x00000000FFFFFFFFULL) + // without touching memory. + // optimizes to pcmpeqd on OS X + __m128i result = _mm_set1_epi8(-1); + result = _mm_srli_epi64(result, 32); + return result; +} + +static inline __m128i libdivide_s64_signbits(__m128i v) { + // we want to compute v >> 63, that is, _mm_srai_epi64(v, 63). But there + // is no 64 bit shift right arithmetic instruction in SSE2. So we have to + // fake it by first duplicating the high 32 bit values, and then using a 32 + // bit shift. Another option would be to use _mm_srli_epi64(v, 63) and + // then subtract that from 0, but that approach appears to be substantially + // slower for unknown reasons + __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); + __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); + return signBits; +} + +// Returns an __m128i whose low 32 bits are equal to amt and has zero elsewhere. +static inline __m128i libdivide_u32_to_m128i(uint32_t amt) { + return _mm_set_epi32(0, 0, 0, amt); +} + +static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) { + // implementation of _mm_sra_epi64. Here we have two 64 bit values which + // are shifted right to logically become (64 - amt) values, and are then + // sign extended from a (64 - amt) bit number. + const int b = 64 - amt; + __m128i m = libdivide_u64_to_m128(1ULL << (b - 1)); + __m128i x = _mm_srl_epi64(v, libdivide_u32_to_m128i(amt)); + __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); + return result; +} + +// Here, b is assumed to contain one 32 bit value repeated four times. +// If it did not, the function would not work. 
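// Why the splat matters: _mm_mul_epu32 multiplies only lanes 0 and 2, so the
// function below shifts a's odd lanes down and multiplies them against b's
// lanes 0 and 2; that pairing is only correct when every lane of b holds the
// same value. Per lane the result equals the scalar
// (uint32_t)(((uint64_t)a[i] * b[0]) >> 32).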
+static inline __m128i libdivide_mullhi_u32_flat_vector(__m128i a, __m128i b) { + __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); + __m128i a1X3X = _mm_srli_epi64(a, 32); + __m128i mask = libdivide_get_FFFFFFFF00000000(); + __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask); + // return hi_product_0123 + return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); +} + +// Here, y is assumed to contain one 64 bit value repeated twice. +static inline __m128i libdivide_mullhi_u64_flat_vector(__m128i x, __m128i y) { + // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) + __m128i mask = libdivide_get_00000000FFFFFFFF(); + // x0 is low half of 2 64 bit values, x1 is high half in low slots + __m128i x0 = _mm_and_si128(x, mask); + __m128i x1 = _mm_srli_epi64(x, 32); + __m128i y0 = _mm_and_si128(y, mask); + __m128i y1 = _mm_srli_epi64(y, 32); + // x0 happens to have the low half of the two 64 bit values in 32 bit slots + // 0 and 2, so _mm_mul_epu32 computes their full product, and then we shift + // right by 32 to get just the high values + __m128i x0y0_hi = _mm_srli_epi64(_mm_mul_epu32(x0, y0), 32); + __m128i x0y1 = _mm_mul_epu32(x0, y1); + __m128i x1y0 = _mm_mul_epu32(x1, y0); + __m128i x1y1 = _mm_mul_epu32(x1, y1); + __m128i temp = _mm_add_epi64(x1y0, x0y0_hi); + __m128i temp_lo = _mm_and_si128(temp, mask); + __m128i temp_hi = _mm_srli_epi64(temp, 32); + temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32); + temp_hi = _mm_add_epi64(x1y1, temp_hi); + + return _mm_add_epi64(temp_lo, temp_hi); +} + +// y is one 64 bit value repeated twice +static inline __m128i libdivide_mullhi_s64_flat_vector(__m128i x, __m128i y) { + __m128i p = libdivide_mullhi_u64_flat_vector(x, y); + __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y); + p = _mm_sub_epi64(p, t1); + __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x); + p = _mm_sub_epi64(p, t2); + return p; +} + +#ifdef LIBDIVIDE_USE_SSE4_1 + +// b is one 32 bit value repeated four times. +static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) { + __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epi32(a, b), 32); + __m128i a1X3X = _mm_srli_epi64(a, 32); + __m128i mask = libdivide_get_FFFFFFFF00000000(); + __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epi32(a1X3X, b), mask); + // return hi_product_0123 + return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); +} + +#else + +// SSE2 does not have a signed multiplication instruction, but we can convert +// unsigned to signed pretty efficiently. Again, b is just a 32 bit value +// repeated four times. 
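// The identity behind the SSE2 fallback below, in scalar form: the signed
// high word equals the unsigned high word minus each operand wherever the
// other operand is negative. Standalone sketch, for illustration only:
#include <assert.h>
#include <stdint.h>

// mulhi_s(x, y) = mulhi_u(x, y) - (x < 0 ? y : 0) - (y < 0 ? x : 0),
// computed modulo 2^32 -- exactly what the vector code does with
// (a >> 31) & b and (b >> 31) & a.
static int32_t mulhi_s32_via_u32(int32_t x, int32_t y) {
    uint32_t hi = (uint32_t)(((uint64_t)(uint32_t)x * (uint32_t)y) >> 32);
    if (x < 0) hi -= (uint32_t)y;
    if (y < 0) hi -= (uint32_t)x;
    return (int32_t)hi;
}

int main(void) {
    int32_t x = -50000, y = 70000;
    assert(mulhi_s32_via_u32(x, y) == (int32_t)(((int64_t)x * y) >> 32));
    return 0;
}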
+static inline __m128i libdivide_mullhi_s32_flat_vector(__m128i a, __m128i b) { + __m128i p = libdivide_mullhi_u32_flat_vector(a, b); + // t1 = (a >> 31) & y, arithmetic shift + __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); + __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a); + p = _mm_sub_epi32(p, t1); + p = _mm_sub_epi32(p, t2); + return p; +} + +#endif + +////////// UINT32 + +__m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U32_SHIFT_PATH) { + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + return _mm_srl_epi32(numers, libdivide_u32_to_m128i(shift)); + } + else { + __m128i q = libdivide_mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srl_epi32(t, libdivide_u32_to_m128i(shift)); + } + else { + // q >> denom->shift + return _mm_srl_epi32(q, libdivide_u32_to_m128i(more)); + } + } +} + +__m128i libdivide_u32_do_vector_alg0(__m128i numers, const struct libdivide_u32_t *denom) { + return _mm_srl_epi32(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); +} + +__m128i libdivide_u32_do_vector_alg1(__m128i numers, const struct libdivide_u32_t *denom) { + __m128i q = libdivide_mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + return _mm_srl_epi32(q, libdivide_u32_to_m128i(denom->more)); +} + +__m128i libdivide_u32_do_vector_alg2(__m128i numers, const struct libdivide_u32_t *denom) { + __m128i q = libdivide_mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srl_epi32(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); +} + +LIBDIVIDE_API __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); + return _mm_srl_epi32(t, libdivide_u32_to_m128i(denom->more)); +} + +////////// UINT64 + +__m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_U64_SHIFT_PATH) { + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + return _mm_srl_epi64(numers, libdivide_u32_to_m128i(shift)); + } + else { + __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide_u64_to_m128(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // uint32_t t = ((numer - q) >> 1) + q; + // return t >> denom->shift; + uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srl_epi64(t, libdivide_u32_to_m128i(shift)); + } + else { + // q >> denom->shift + return _mm_srl_epi64(q, libdivide_u32_to_m128i(more)); + } + } +} + +__m128i libdivide_u64_do_vector_alg0(__m128i numers, const struct libdivide_u64_t *denom) { + return _mm_srl_epi64(numers, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK)); +} + +__m128i libdivide_u64_do_vector_alg1(__m128i numers, const struct libdivide_u64_t *denom) { + __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide_u64_to_m128(denom->magic)); + return _mm_srl_epi64(q, 
libdivide_u32_to_m128i(denom->more)); +} + +__m128i libdivide_u64_do_vector_alg2(__m128i numers, const struct libdivide_u64_t *denom) { + __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide_u64_to_m128(denom->magic)); + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srl_epi64(t, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_64_SHIFT_MASK)); +} + +__m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom) { + __m128i q = libdivide_mullhi_u64_flat_vector(numers, libdivide_u64_to_m128(denom->magic)); + __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); + return _mm_srl_epi64(t, libdivide_u32_to_m128i(denom->more)); +} + +////////// SINT32 + +__m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom) { + uint8_t more = denom->more; + if (more & LIBDIVIDE_S32_SHIFT_PATH) { + uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + // could use _mm_srli_epi32 with an all -1 register + __m128i roundToZeroTweak = _mm_set1_epi32(mask); + // q = numer + ((numer >> 31) & roundToZeroTweak); + __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(shift)); + // set all bits of shift mask = to the sign bit of more + __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); + // q = (q ^ shiftMask) - shiftMask; + q = _mm_sub_epi32(_mm_xor_si128(q, shiftMask), shiftMask); + return q; + } + else { + __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m128i sign = _mm_set1_epi32((int32_t)(int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); + } + // q >>= shift + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK)); + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) + return q; + } +} + +__m128i libdivide_s32_do_vector_alg0(__m128i numers, const struct libdivide_s32_t *denom) { + uint8_t shift = denom->more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + __m128i roundToZeroTweak = _mm_set1_epi32(mask); + __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + return _mm_sra_epi32(q, libdivide_u32_to_m128i(shift)); +} + +__m128i libdivide_s32_do_vector_alg1(__m128i numers, const struct libdivide_s32_t *denom) { + uint8_t shift = denom->more & LIBDIVIDE_32_SHIFT_MASK; + uint32_t mask = (1U << shift) - 1; + __m128i roundToZeroTweak = _mm_set1_epi32(mask); + __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); + return _mm_sub_epi32(_mm_setzero_si128(), _mm_sra_epi32(q, libdivide_u32_to_m128i(shift))); +} + +__m128i libdivide_s32_do_vector_alg2(__m128i numers, const struct libdivide_s32_t *denom) { + __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + q = _mm_add_epi32(q, numers); + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & LIBDIVIDE_32_SHIFT_MASK)); + q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); + return q; +} + +__m128i libdivide_s32_do_vector_alg3(__m128i numers, const struct libdivide_s32_t *denom) { + __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic)); + q = _mm_sub_epi32(q, numers); + q = _mm_sra_epi32(q, libdivide_u32_to_m128i(denom->more & 
+    q = _mm_add_epi32(q, _mm_srli_epi32(q, 31));
+    return q;
+}
+
+__m128i libdivide_s32_do_vector_alg4(__m128i numers, const struct libdivide_s32_t *denom) {
+    uint8_t more = denom->more;
+    __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(denom->magic));
+    q = _mm_sra_epi32(q, libdivide_u32_to_m128i(more & LIBDIVIDE_32_SHIFT_MASK));
+    q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0)
+    return q;
+}
+
+__m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom) {
+    int32_t magic = denom->magic;
+    uint8_t more = denom->more;
+    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
+    // must be arithmetic shift
+    __m128i sign = _mm_set1_epi32((int32_t)(int8_t)more >> 7);
+
+    // libdivide_mullhi_s32(numers, magic);
+    __m128i q = libdivide_mullhi_s32_flat_vector(numers, _mm_set1_epi32(magic));
+    q = _mm_add_epi32(q, numers); // q += numers
+
+    // If q is non-negative, we have nothing to do
+    // If q is negative, we want to add either (2**shift)-1 if d is
+    // a power of 2, or (2**shift) if it is not a power of 2
+    uint32_t is_power_of_2 = (magic == 0);
+    __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31
+    __m128i mask = _mm_set1_epi32((1 << shift) - is_power_of_2);
+    q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
+    q = _mm_srai_epi32(q, shift); // q >>= shift
+    q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
+    return q;
+}
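// A scalar sketch of the rounding step above (illustrative only, not part of
// this patch; the variable names are ours). For a power-of-2 divisor the magic
// number is 0, so q equals the numerator, and a bare arithmetic shift would
// round toward negative infinity rather than toward zero. Adding (2**shift)-1
// to negative numerators first makes the shift truncate:
//
//     int32_t n = -9, shift = 3;               // dividing by 8
//     int32_t q = n;                           // magic == 0, so q = 0 + n
//     q += (n >> 31) & ((1 << shift) - 1);     // add 7 only when n < 0 -> -2
//     q >>= shift;                             // -2 >> 3 == -1 == trunc(-9/8)
//
// The non-power-of-2 case conditions its correction (the full 2**shift, as the
// comment above notes) on the sign bit of q in exactly the same way.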
+
+////////// SINT64
+
 __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom) {
     uint8_t more = denom->more;
     int64_t magic = denom->magic;
     if (magic == 0) { // shift path
-        uint32_t shifter = more & LIBDIVIDE_64_SHIFT_MASK;
-        __m128i roundToZeroTweak = libdivide__u64_to_m128((1ULL << shifter) - 1);
-        __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); // q = numer + ((numer >> 63) & roundToZeroTweak);
-        q = libdivide_s64_shift_right_vector(q, shifter); // q = q >> shifter
+        uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
+        uint64_t mask = (1ULL << shift) - 1;
+        __m128i roundToZeroTweak = libdivide_u64_to_m128(mask);
+        // q = numer + ((numer >> 63) & roundToZeroTweak);
+        __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
+        q = libdivide_s64_shift_right_vector(q, shift);
         __m128i shiftMask = _mm_set1_epi32((int32_t)((int8_t)more >> 7));
-        q = _mm_sub_epi64(_mm_xor_si128(q, shiftMask), shiftMask); // q = (q ^ shiftMask) - shiftMask;
+        // q = (q ^ shiftMask) - shiftMask;
+        q = _mm_sub_epi64(_mm_xor_si128(q, shiftMask), shiftMask);
         return q;
     }
     else {
-        __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(magic));
+        __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide_u64_to_m128(magic));
         if (more & LIBDIVIDE_ADD_MARKER) {
-            __m128i sign = _mm_set1_epi32((int32_t)((int8_t)more >> 7)); // must be arithmetic shift
-            q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); // q += ((numer ^ sign) - sign);
+            // must be arithmetic shift
+            __m128i sign = _mm_set1_epi32((int32_t)((int8_t)more >> 7));
+            // q += ((numer ^ sign) - sign);
+            q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign));
         }
         // q >>= denom->mult_path.shift
         q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK);
@@ -1715,23 +1748,25 @@ __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *de
 }
 
 __m128i libdivide_s64_do_vector_alg0(__m128i numers, const struct libdivide_s64_t *denom) {
-    uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
-    __m128i roundToZeroTweak = libdivide__u64_to_m128((1ULL << shifter) - 1);
+    uint32_t shift = denom->more & LIBDIVIDE_64_SHIFT_MASK;
+    uint64_t mask = (1ULL << shift) - 1;
+    __m128i roundToZeroTweak = libdivide_u64_to_m128(mask);
     __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
-    q = libdivide_s64_shift_right_vector(q, shifter);
+    q = libdivide_s64_shift_right_vector(q, shift);
     return q;
 }
 
 __m128i libdivide_s64_do_vector_alg1(__m128i numers, const struct libdivide_s64_t *denom) {
-    uint32_t shifter = denom->more & LIBDIVIDE_64_SHIFT_MASK;
-    __m128i roundToZeroTweak = libdivide__u64_to_m128((1ULL << shifter) - 1);
+    uint32_t shift = denom->more & LIBDIVIDE_64_SHIFT_MASK;
+    uint64_t mask = (1ULL << shift) - 1;
+    __m128i roundToZeroTweak = libdivide_u64_to_m128(mask);
     __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak));
-    q = libdivide_s64_shift_right_vector(q, shifter);
+    q = libdivide_s64_shift_right_vector(q, shift);
     return _mm_sub_epi64(_mm_setzero_si128(), q);
 }
 
 __m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_t *denom) {
-    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
+    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide_u64_to_m128(denom->magic));
     q = _mm_add_epi64(q, numers);
     q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK);
     q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0)
@@ -1739,7 +1774,7 @@ __m128i libdivide_s64_do_vector_alg2(__m128i numers, const struct libdivide_s64_
 }
 
 __m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_t *denom) {
-    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
+    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide_u64_to_m128(denom->magic));
     q = _mm_sub_epi64(q, numers);
     q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK);
     q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0)
@@ -1747,7 +1782,7 @@ __m128i libdivide_s64_do_vector_alg3(__m128i numers, const struct libdivide_s64_
 }
 
 __m128i libdivide_s64_do_vector_alg4(__m128i numers, const struct libdivide_s64_t *denom) {
-    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(denom->magic));
+    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide_u64_to_m128(denom->magic));
     q = libdivide_s64_shift_right_vector(q, denom->more & LIBDIVIDE_64_SHIFT_MASK);
     q = _mm_add_epi64(q, _mm_srli_epi64(q, 63));
     return q;
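// Note on the recurring "q += (q < 0)" step above (illustrative comment, not
// part of this patch): _mm_srli_epi64(q, 63) is a logical shift, so it yields
// 1 in every lane whose value is negative and 0 otherwise. Adding it bumps
// only negative quotients by one, compensating for the arithmetic right shift
// rounding toward negative infinity where C division truncates toward zero;
// e.g. -3 >> 1 is -2, and the +1 restores -1 == trunc(-3/2).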
@@ -1760,16 +1795,16 @@ __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivid
     // must be arithmetic shift
     __m128i sign = _mm_set1_epi32((int32_t)(int8_t)more >> 7);
 
-    // libdivide__mullhi_s64(numers, magic);
-    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide__u64_to_m128(magic));
+    // libdivide_mullhi_s64(numers, magic);
+    __m128i q = libdivide_mullhi_s64_flat_vector(numers, libdivide_u64_to_m128(magic));
     q = _mm_add_epi64(q, numers); // q += numers
 
     // If q is non-negative, we have nothing to do.
-    // If q is negative, we want to add either (2**shift)-1 if d is a power of
-    // 2, or (2**shift) if it is not a power of 2.
+    // If q is negative, we want to add either (2**shift)-1 if d is
+    // a power of 2, or (2**shift) if it is not a power of 2.
     uint32_t is_power_of_2 = (magic == 0);
     __m128i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63
-    __m128i mask = libdivide__u64_to_m128((1ULL << shift) - is_power_of_2);
+    __m128i mask = libdivide_u64_to_m128((1ULL << shift) - is_power_of_2);
     q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
     q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift
     q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
@@ -1797,12 +1832,22 @@ enum {
 
 namespace libdivide_internal {
 
+#if defined(LIBDIVIDE_USE_SSE2) && \
+    defined(__GNUC__) && \
+    __GNUC__ >= 6
+    // Using vector functions as template arguments causes many
+    // -Wignored-attributes compiler warnings with GCC 9.
+    // These warnings can safely be turned off.
+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+
 #if defined(LIBDIVIDE_USE_SSE2)
-#define MAYBE_VECTOR(X) X
-#define MAYBE_VECTOR_PARAM(X) __m128i vector_func(__m128i, const X *)
+    #define MAYBE_VECTOR(X) X
+    #define MAYBE_VECTOR_PARAM(X) __m128i vector_func(__m128i, const X *)
 #else
-#define MAYBE_VECTOR(X) 0
-#define MAYBE_VECTOR_PARAM(X) int unused
+    #define MAYBE_VECTOR(X) 0
+    #define MAYBE_VECTOR_PARAM(X) int unused
 #endif
 
 // The following convenience macros are used to build a type of the base
@@ -1837,115 +1882,121 @@ namespace libdivide_internal {
          libdivide_##TYPE##_crash, \
          MAYBE_VECTOR(libdivide_##TYPE##_crash_vector)>
 
-    // Base divider, provides storage for the actual divider.
-    // @IntType: e.g. uint32_t
-    // @DenomType: e.g. libdivide_u32_t
-    // @gen_func(): e.g. libdivide_u32_gen
-    // @do_func(): e.g. libdivide_u32_do
-    // @MAYBE_VECTOR_PARAM: e.g. libdivide_u32_do_vector
-    template
-    struct base {
-        // Storage for the actual divider
-        DenomType denom;
+// Base divider, provides storage for the actual divider.
+// @T: e.g. uint32_t
+// @DenomType: e.g. libdivide_u32_t
+// @gen_func(): e.g. libdivide_u32_gen
+// @do_func(): e.g. libdivide_u32_do
+// @MAYBE_VECTOR_PARAM: e.g. libdivide_u32_do_vector
+template
+struct base {
+    // Storage for the actual divider
+    DenomType denom;
 
-        // Constructor that takes a divisor value, and applies the gen function
-        base(IntType d) : denom(gen_func(d)) { }
+    // Constructor that takes a divisor value, and applies the gen function
+    base(T d) : denom(gen_func(d)) { }
 
-        // Default constructor to allow uninitialized uses in e.g. arrays
-        base() {}
+    // Default constructor to allow uninitialized uses in e.g. arrays
+    base() {}
 
-        // Needed for unswitch
-        base(const DenomType& d) : denom(d) { }
+    // Needed for unswitch
+    base(const DenomType& d) : denom(d) { }
 
-        IntType perform_divide(IntType val) const {
-            return do_func(val, &denom);
-        }
+    T perform_divide(T val) const {
+        return do_func(val, &denom);
+    }
 
 #if defined(LIBDIVIDE_USE_SSE2)
-        __m128i perform_divide_vector(__m128i val) const {
-            return vector_func(val, &denom);
-        }
+    __m128i perform_divide_vector(__m128i val) const {
+        return vector_func(val, &denom);
+    }
 #endif
-    };
+};
 
-    // Functions that will never be called but are required to be able
-    // to use unswitch in C++ template code. Unsigned has fewer algorithms
-    // than signed i.e. alg3 and alg4 are not defined for unsigned. In
-    // order to make templates compile we need to define unsigned alg3 and
-    // alg4 as crash functions.
-    uint32_t libdivide_u32_crash(uint32_t, const libdivide_u32_t *) { exit(-1); }
-    uint64_t libdivide_u64_crash(uint64_t, const libdivide_u64_t *) { exit(-1); }
+// Functions that will never be called but are required to be able
+// to use unswitch in C++ template code. Unsigned has fewer algorithms
+// than signed i.e. alg3 and alg4 are not defined for unsigned. In
+// order to make templates compile we need to define unsigned alg3 and
+// alg4 as crash functions.
+uint32_t libdivide_u32_crash(uint32_t, const libdivide_u32_t *) { exit(-1); }
+uint64_t libdivide_u64_crash(uint64_t, const libdivide_u64_t *) { exit(-1); }
 
 #if defined(LIBDIVIDE_USE_SSE2)
 __m128i libdivide_u32_crash_vector(__m128i, const libdivide_u32_t *) { exit(-1); }
 __m128i libdivide_u64_crash_vector(__m128i, const libdivide_u64_t *) { exit(-1); }
 #endif
 
-    template struct dispatcher { };
+template struct dispatcher { };
 
-    // Templated dispatch using partial specialization
-    template<> struct dispatcher { BRANCHFULL_DIVIDER(int32_t, s32) divider; };
-    template<> struct dispatcher { BRANCHFREE_DIVIDER(int32_t, s32) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg0) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg1) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg2) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg3) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg4) divider; };
+// Templated dispatch using partial specialization
+template<> struct dispatcher { BRANCHFULL_DIVIDER(int32_t, s32) divider; };
+template<> struct dispatcher { BRANCHFREE_DIVIDER(int32_t, s32) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg0) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg1) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg2) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg3) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER(int32_t, s32, alg4) divider; };
 
-    template<> struct dispatcher { BRANCHFULL_DIVIDER(uint32_t, u32) divider; };
-    template<> struct dispatcher { BRANCHFREE_DIVIDER(uint32_t, u32) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER(uint32_t, u32, alg0) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER(uint32_t, u32, alg1) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER(uint32_t, u32, alg2) divider; };
-    template<> struct dispatcher { CRASH_DIVIDER(uint32_t, u32) divider; };
-    template<> struct dispatcher { CRASH_DIVIDER(uint32_t, u32) divider; };
+template<> struct dispatcher { BRANCHFULL_DIVIDER(uint32_t, u32) divider; };
+template<> struct dispatcher { BRANCHFREE_DIVIDER(uint32_t, u32) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER(uint32_t, u32, alg0) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER(uint32_t, u32, alg1) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER(uint32_t, u32, alg2) divider; };
+template<> struct dispatcher { CRASH_DIVIDER(uint32_t, u32) divider; };
+template<> struct dispatcher { CRASH_DIVIDER(uint32_t, u32) divider; };
 
-    template<> struct dispatcher { BRANCHFULL_DIVIDER(int64_t, s64) divider; };
-    template<> struct dispatcher { BRANCHFREE_DIVIDER(int64_t, s64) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg0) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg1) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg2) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg3) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg4) divider; };
+template<> struct dispatcher { BRANCHFULL_DIVIDER(int64_t, s64) divider; };
+template<> struct dispatcher { BRANCHFREE_DIVIDER(int64_t, s64) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg0) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg1) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg2) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg3) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER (int64_t, s64, alg4) divider; };
 
-    template<> struct dispatcher { BRANCHFULL_DIVIDER(uint64_t, u64) divider; };
-    template<> struct dispatcher { BRANCHFREE_DIVIDER(uint64_t, u64) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER(uint64_t, u64, alg0) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER(uint64_t, u64, alg1) divider; };
-    template<> struct dispatcher { ALGORITHM_DIVIDER(uint64_t, u64, alg2) divider; };
-    template<> struct dispatcher { CRASH_DIVIDER(uint64_t, u64) divider; };
-    template<> struct dispatcher { CRASH_DIVIDER(uint64_t, u64) divider; };
+template<> struct dispatcher { BRANCHFULL_DIVIDER(uint64_t, u64) divider; };
+template<> struct dispatcher { BRANCHFREE_DIVIDER(uint64_t, u64) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER(uint64_t, u64, alg0) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER(uint64_t, u64, alg1) divider; };
+template<> struct dispatcher { ALGORITHM_DIVIDER(uint64_t, u64, alg2) divider; };
+template<> struct dispatcher { CRASH_DIVIDER(uint64_t, u64) divider; };
+template<> struct dispatcher { CRASH_DIVIDER(uint64_t, u64) divider; };
 
-    // Overloads that don't depend on the algorithm
-    inline int32_t recover(const libdivide_s32_t *s) { return libdivide_s32_recover(s); }
-    inline uint32_t recover(const libdivide_u32_t *s) { return libdivide_u32_recover(s); }
-    inline int64_t recover(const libdivide_s64_t *s) { return libdivide_s64_recover(s); }
-    inline uint64_t recover(const libdivide_u64_t *s) { return libdivide_u64_recover(s); }
+#if defined(LIBDIVIDE_USE_SSE2) && \
+    defined(__GNUC__) && \
+    __GNUC__ >= 6
+    #pragma GCC diagnostic pop
+#endif
 
-    inline int32_t recover(const libdivide_s32_branchfree_t *s) { return libdivide_s32_branchfree_recover(s); }
-    inline uint32_t recover(const libdivide_u32_branchfree_t *s) { return libdivide_u32_branchfree_recover(s); }
-    inline int64_t recover(const libdivide_s64_branchfree_t *s) { return libdivide_s64_branchfree_recover(s); }
-    inline uint64_t recover(const libdivide_u64_branchfree_t *s) { return libdivide_u64_branchfree_recover(s); }
+// Overloads that don't depend on the algorithm
+inline int32_t recover(const libdivide_s32_t *s) { return libdivide_s32_recover(s); }
+inline uint32_t recover(const libdivide_u32_t *s) { return libdivide_u32_recover(s); }
+inline int64_t recover(const libdivide_s64_t *s) { return libdivide_s64_recover(s); }
+inline uint64_t recover(const libdivide_u64_t *s) { return libdivide_u64_recover(s); }
 
-    inline int get_algorithm(const libdivide_s32_t *s) { return libdivide_s32_get_algorithm(s); }
-    inline int get_algorithm(const libdivide_u32_t *s) { return libdivide_u32_get_algorithm(s); }
-    inline int get_algorithm(const libdivide_s64_t *s) { return libdivide_s64_get_algorithm(s); }
-    inline int get_algorithm(const libdivide_u64_t *s) { return libdivide_u64_get_algorithm(s); }
+inline int32_t recover(const libdivide_s32_branchfree_t *s) { return libdivide_s32_branchfree_recover(s); }
+inline uint32_t recover(const libdivide_u32_branchfree_t *s) { return libdivide_u32_branchfree_recover(s); }
+inline int64_t recover(const libdivide_s64_branchfree_t *s) { return libdivide_s64_branchfree_recover(s); }
+inline uint64_t recover(const libdivide_u64_branchfree_t *s) { return libdivide_u64_branchfree_recover(s); }
 
-    // Fallback for branchfree variants, which do not support unswitching
-    template int get_algorithm(const T *) { return -1; }
-}
+inline int get_algorithm(const libdivide_s32_t *s) { return libdivide_s32_get_algorithm(s); }
+inline int get_algorithm(const libdivide_u32_t *s) { return libdivide_u32_get_algorithm(s); }
+inline int get_algorithm(const libdivide_s64_t *s) { return libdivide_s64_get_algorithm(s); }
+inline int get_algorithm(const libdivide_u64_t *s) { return libdivide_u64_get_algorithm(s); }
+
+// Fallback for branchfree variants, which do not support unswitching
+template int get_algorithm(const T *) { return -1; }
+
+} // namespace libdivide_internal
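// To make the template machinery above concrete (a sketch only -- the real
// BRANCHFULL_DIVIDER/ALGORITHM_DIVIDER macros supply the exact template
// arguments): each dispatcher specialization holds a 'divider' member that is
// a 'base' instantiation wired to the matching C functions, roughly
//
//     base<uint32_t,                              // @T
//          libdivide_u32_t,                       // @DenomType
//          libdivide_u32_gen,                     // @gen_func
//          libdivide_u32_do,                      // @do_func
//          MAYBE_VECTOR(libdivide_u32_do_vector)> // vector path (SSE2 only)
//
// so perform_divide() forwards to libdivide_u32_do() and, when
// LIBDIVIDE_USE_SSE2 is defined, perform_divide_vector() forwards to
// libdivide_u32_do_vector().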
 
 // This is the main divider class for use by the user (C++ API).
 // The divider itself is stored in the div variable whose
 // type is chosen by the dispatcher based on the template parameters.
 template
-class divider
-{
+class divider {
 private:
     // Here's the actual divider
     typedef typename libdivide_internal::dispatcher::divider div_t;
@@ -2019,14 +2070,14 @@ divider unswitch(const divider& d) {
 }
 
 // Overload of the / operator for scalar division
-template
-int_type operator/(int_type numer, const divider& denom) {
+template
+T operator/(T numer, const divider& denom) {
     return denom.perform_divide(numer);
 }
 
 // Overload of the /= operator for scalar division
-template
-int_type operator/=(int_type& numer, const divider& denom) {
+template
+T operator/=(T& numer, const divider& denom) {
     numer = denom.perform_divide(numer);
     return numer;
 }
@@ -2034,14 +2085,14 @@ int_type operator/=(int_type& numer, const divider& denom) {
 #if defined(LIBDIVIDE_USE_SSE2)
 
 // Overload of the / operator for vector division
-template
-__m128i operator/(__m128i numer, const divider& denom) {
+template
+__m128i operator/(__m128i numer, const divider& denom) {
     return denom.perform_divide_vector(numer);
 }
 
 // Overload of the /= operator for vector division
-template
-__m128i operator/=(__m128i& numer, const divider& denom) {
+template
+__m128i operator/=(__m128i& numer, const divider& denom) {
     numer = denom.perform_divide_vector(numer);
     return numer;
 }
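// Illustrative usage of the C++ API above (a sketch, not part of this patch).
// It assumes the usual libdivide conventions: the divider class takes the
// integer type as its first template parameter with a defaulted algorithm,
// and lives in namespace libdivide; the divisor must be nonzero.
//
//     #include "libdivide.h"
//
//     void divide_all(int64_t *data, size_t n, int64_t d) {
//         libdivide::divider<int64_t> fast_d(d);   // compute magic/shift once
//         for (size_t i = 0; i < n; i++) {
//             data[i] = data[i] / fast_d;          // scalar operator/ above
//         }
//     }
//
// With LIBDIVIDE_USE_SSE2 defined before inclusion, a whole __m128i of
// numerators can be divided at once via the vector overloads:
//
//     __m128i q = numers / fast_d;                 // vector operator/ above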