diff --git a/third_party/libdivide.h b/third_party/libdivide.h index 562e49f..e9a31d1 100644 --- a/third_party/libdivide.h +++ b/third_party/libdivide.h @@ -1,8 +1,8 @@ // libdivide.h - Optimized integer division // https://libdivide.com // -// Copyright (C) 2010 - 2019 ridiculous_fish, -// Copyright (C) 2016 - 2019 Kim Walisch, +// Copyright (C) 2010 - 2021 ridiculous_fish, +// Copyright (C) 2016 - 2021 Kim Walisch, // // libdivide is dual-licensed under the Boost or zlib licenses. // You may use libdivide under the terms of either of these. @@ -11,17 +11,12 @@ #ifndef LIBDIVIDE_H #define LIBDIVIDE_H -#define LIBDIVIDE_VERSION "3.0" -#define LIBDIVIDE_VERSION_MAJOR 3 +#define LIBDIVIDE_VERSION "5.0" +#define LIBDIVIDE_VERSION_MAJOR 5 #define LIBDIVIDE_VERSION_MINOR 0 #include - -#if defined(__cplusplus) -#include -#include -#include -#else +#if !defined(__AVR__) #include #include #endif @@ -38,9 +33,15 @@ #if defined(_MSC_VER) #include +#pragma warning(push) // disable warning C4146: unary minus operator applied // to unsigned type, result still unsigned #pragma warning(disable : 4146) +// disable warning C4204: nonstandard extension used : non-constant aggregate +// initializer +// +// It's valid C99 +#pragma warning(disable : 4204) #define LIBDIVIDE_VC #endif @@ -74,13 +75,28 @@ #define LIBDIVIDE_FUNCTION __func__ #endif +// Set up forced inlining if possible. +// We need both the attribute and keyword to avoid "might not be inlineable" warnings. +#ifdef __has_attribute +#if __has_attribute(always_inline) +#define LIBDIVIDE_INLINE __attribute__((always_inline)) inline +#endif +#endif +#ifndef LIBDIVIDE_INLINE +#define LIBDIVIDE_INLINE inline +#endif + +#if defined(__AVR__) +#define LIBDIVIDE_ERROR(msg) +#else #define LIBDIVIDE_ERROR(msg) \ do { \ fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", __LINE__, LIBDIVIDE_FUNCTION, msg); \ abort(); \ } while (0) +#endif -#if defined(LIBDIVIDE_ASSERTIONS_ON) +#if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__) #define LIBDIVIDE_ASSERT(x) \ do { \ if (!(x)) { \ @@ -103,6 +119,16 @@ namespace libdivide { // by up to 10% because of reduced memory bandwidth. #pragma pack(push, 1) +struct libdivide_u16_t { + uint16_t magic; + uint8_t more; +}; + +struct libdivide_s16_t { + int16_t magic; + uint8_t more; +}; + struct libdivide_u32_t { uint32_t magic; uint8_t more; @@ -123,6 +149,16 @@ struct libdivide_s64_t { uint8_t more; }; +struct libdivide_u16_branchfree_t { + uint16_t magic; + uint8_t more; +}; + +struct libdivide_s16_branchfree_t { + int16_t magic; + uint8_t more; +}; + struct libdivide_u32_branchfree_t { uint32_t magic; uint8_t more; @@ -178,66 +214,106 @@ struct libdivide_s64_branchfree_t { // whether the divisor is negated. In branchfree strategy, it is not negated. 
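
For orientation before the encoding details: a divider struct is generated once per divisor and then reused for every division, which is the whole point of the library. A minimal scalar usage sketch (illustrative only; the include path depends on how the header is vendored):

    #include <stddef.h>
    #include <stdint.h>
    #include "libdivide.h"

    // Generate one divider for d, then replace every hardware divide in the
    // loop with a multiply-and-shift through that divider.
    static void divide_all(uint32_t *buf, size_t n, uint32_t d) {
        struct libdivide_u32_t divider = libdivide_u32_gen(d);  // one real division here
        for (size_t i = 0; i < n; ++i) {
            buf[i] = libdivide_u32_do(buf[i], &divider);        // no division instruction
        }
    }

The branchfree variants follow the same pattern through libdivide_u32_branchfree_gen / libdivide_u32_branchfree_do.
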
enum { + LIBDIVIDE_16_SHIFT_MASK = 0x1F, LIBDIVIDE_32_SHIFT_MASK = 0x1F, LIBDIVIDE_64_SHIFT_MASK = 0x3F, LIBDIVIDE_ADD_MARKER = 0x40, LIBDIVIDE_NEGATIVE_DIVISOR = 0x80 }; -static inline struct libdivide_s32_t libdivide_s32_gen(int32_t d); -static inline struct libdivide_u32_t libdivide_u32_gen(uint32_t d); -static inline struct libdivide_s64_t libdivide_s64_gen(int64_t d); -static inline struct libdivide_u64_t libdivide_u64_gen(uint64_t d); +static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d); +static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d); +static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d); +static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d); +static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d); +static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d); -static inline struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d); -static inline struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d); -static inline struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); -static inline struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); +static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d); +static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d); +static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d); +static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d); +static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); +static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); -static inline int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom); -static inline uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom); -static inline int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom); -static inline uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw( + int16_t numer, int16_t magic, uint8_t more); +static LIBDIVIDE_INLINE int16_t libdivide_s16_do( + int16_t numer, const struct libdivide_s16_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw( + uint16_t numer, uint16_t magic, uint8_t more); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_do( + uint16_t numer, const struct libdivide_u16_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_do( + int32_t numer, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_do( + uint32_t numer, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_do( + int64_t numer, const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_do( + uint64_t numer, const struct libdivide_u64_t *denom); -static inline int32_t libdivide_s32_branchfree_do( +static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do( + int16_t numer, const struct libdivide_s16_branchfree_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do( + uint16_t numer, const struct libdivide_u16_branchfree_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do( int32_t numer, const struct libdivide_s32_branchfree_t *denom); -static 
inline uint32_t libdivide_u32_branchfree_do( +static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do( uint32_t numer, const struct libdivide_u32_branchfree_t *denom); -static inline int64_t libdivide_s64_branchfree_do( +static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do( int64_t numer, const struct libdivide_s64_branchfree_t *denom); -static inline uint64_t libdivide_u64_branchfree_do( +static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do( uint64_t numer, const struct libdivide_u64_branchfree_t *denom); -static inline int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); -static inline uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); -static inline int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); -static inline uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom); -static inline int32_t libdivide_s32_branchfree_recover( +static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover( + const struct libdivide_s16_branchfree_t* denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover( + const struct libdivide_u16_branchfree_t* denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover( const struct libdivide_s32_branchfree_t *denom); -static inline uint32_t libdivide_u32_branchfree_recover( +static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover( const struct libdivide_u32_branchfree_t *denom); -static inline int64_t libdivide_s64_branchfree_recover( +static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover( const struct libdivide_s64_branchfree_t *denom); -static inline uint64_t libdivide_u64_branchfree_recover( +static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover( const struct libdivide_u64_branchfree_t *denom); //////// Internal Utility Functions -static inline uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { +static LIBDIVIDE_INLINE uint16_t libdivide_mullhi_u16(uint16_t x, uint16_t y) { + uint32_t xl = x, yl = y; + uint32_t rl = xl * yl; + return (uint16_t)(rl >> 16); +} + +static LIBDIVIDE_INLINE int16_t libdivide_mullhi_s16(int16_t x, int16_t y) { + int32_t xl = x, yl = y; + int32_t rl = xl * yl; + // needs to be arithmetic shift + return (int16_t)(rl >> 16); +} + +static LIBDIVIDE_INLINE uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { uint64_t xl = x, yl = y; uint64_t rl = xl * yl; return (uint32_t)(rl >> 32); } -static inline int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { +static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { int64_t xl = x, yl = y; int64_t rl = xl * yl; // needs to be arithmetic shift return (int32_t)(rl >> 32); } -static inline uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { +static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { #if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) return __umulh(x, y); #elif defined(HAS_INT128_T) @@ -263,7 +339,7 @@ static inline uint64_t 
libdivide_mullhi_u64(uint64_t x, uint64_t y) { #endif } -static inline int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { +static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { #if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) return __mulh(x, y); #elif defined(HAS_INT128_T) @@ -285,8 +361,41 @@ static inline int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { #endif } -static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { -#if defined(__GNUC__) || __has_builtin(__builtin_clz) +static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) { +#if defined(__AVR__) + // Fast way to count leading zeros + // On the AVR 8-bit architecture __builtin_clz() works on a int16_t. + return __builtin_clz(val); +#elif defined(__GNUC__) || __has_builtin(__builtin_clz) + // Fast way to count leading zeros + return __builtin_clz(val) - 16; +#elif defined(LIBDIVIDE_VC) + unsigned long result; + if (_BitScanReverse(&result, (unsigned long)val)) { + return (int16_t)(15 - result); + } + return 0; +#else + if (val == 0) return 16; + int16_t result = 4; + uint16_t hi = 0xFU << 12; + while ((val & hi) == 0) { + hi >>= 4; + result += 4; + } + while (val & hi) { + result -= 1; + hi <<= 1; + } + return result; +#endif +} + +static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) { +#if defined(__AVR__) + // Fast way to count leading zeros + return __builtin_clzl(val); +#elif defined(__GNUC__) || __has_builtin(__builtin_clz) // Fast way to count leading zeros return __builtin_clz(val); #elif defined(LIBDIVIDE_VC) @@ -311,7 +420,7 @@ static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { #endif } -static inline int32_t libdivide_count_leading_zeros64(uint64_t val) { +static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) { #if defined(__GNUC__) || __has_builtin(__builtin_clzll) // Fast way to count leading zeros return __builtin_clzll(val); @@ -329,10 +438,21 @@ static inline int32_t libdivide_count_leading_zeros64(uint64_t val) { #endif } +// libdivide_32_div_16_to_16: divides a 32-bit uint {u1, u0} by a 16-bit +// uint {v}. The result must fit in 16 bits. +// Returns the quotient directly and the remainder in *r +static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16( + uint16_t u1, uint16_t u0, uint16_t v, uint16_t* r) { + uint32_t n = ((uint32_t)u1 << 16) | u0; + uint16_t result = (uint16_t)(n / v); + *r = (uint16_t)(n - result * (uint32_t)v); + return result; +} + // libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit // uint {v}. The result must fit in 32 bits. // Returns the quotient directly and the remainder in *r -static inline uint32_t libdivide_64_div_32_to_32( +static LIBDIVIDE_INLINE uint32_t libdivide_64_div_32_to_32( uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { #if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM) uint32_t result; @@ -346,93 +466,106 @@ static inline uint32_t libdivide_64_div_32_to_32( #endif } -// libdivide_128_div_64_to_64: divides a 128-bit uint {u1, u0} by a 64-bit -// uint {v}. The result must fit in 64 bits. -// Returns the quotient directly and the remainder in *r -static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) { +// libdivide_128_div_64_to_64: divides a 128-bit uint {numhi, numlo} by a 64-bit uint {den}. The +// result must fit in 64 bits. 
Returns the quotient directly and the remainder in *r +static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64( + uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) { // N.B. resist the temptation to use __uint128_t here. // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because - // it's not inlined. + // it's not LIBDIVIDE_INLINEd. #if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM) uint64_t result; - __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1)); + __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi)); return result; #else - // Code taken from Hacker's Delight: - // http://www.hackersdelight.org/HDcode/divlu.c. - // License permits inclusion here per: - // http://www.hackersdelight.org/permissions.htm + // We work in base 2**32. + // A uint32 holds a single digit. A uint64 holds two digits. + // Our numerator is conceptually [num3, num2, num1, num0]. + // Our denominator is [den1, den0]. + const uint64_t b = ((uint64_t)1 << 32); - const uint64_t b = (1ULL << 32); // Number base (32 bits) - uint64_t un1, un0; // Norm. dividend LSD's - uint64_t vn1, vn0; // Norm. divisor digits - uint64_t q1, q0; // Quotient digits - uint64_t un64, un21, un10; // Dividend digit pairs - uint64_t rhat; // A remainder - int32_t s; // Shift amount for norm + // The high and low digits of our computed quotient. + uint32_t q1; + uint32_t q0; - // If overflow, set rem. to an impossible value, - // and return the largest possible quotient - if (u1 >= v) { - *r = (uint64_t)-1; - return (uint64_t)-1; + // The normalization shift factor. + int shift; + + // The high and low digits of our denominator (after normalizing). + // Also the low 2 digits of our numerator (after normalizing). + uint32_t den1; + uint32_t den0; + uint32_t num1; + uint32_t num0; + + // A partial remainder. + uint64_t rem; + + // The estimated quotient, and its corresponding remainder (unrelated to true remainder). + uint64_t qhat; + uint64_t rhat; + + // Variables used to correct the estimated quotient. + uint64_t c1; + uint64_t c2; + + // Check for overflow and divide by 0. + if (numhi >= den) { + if (r != NULL) *r = ~0ull; + return ~0ull; } - // count leading zeros - s = libdivide_count_leading_zeros64(v); - if (s > 0) { - // Normalize divisor - v = v << s; - un64 = (u1 << s) | (u0 >> (64 - s)); - un10 = u0 << s; // Shift dividend left - } else { - // Avoid undefined behavior of (u0 >> 64). - // The behavior is undefined if the right operand is - // negative, or greater than or equal to the length - // in bits of the promoted left operand. - un64 = u1; - un10 = u0; - } + // Determine the normalization factor. We multiply den by this, so that its leading digit is at + // least half b. In binary this means just shifting left by the number of leading zeros, so that + // there's a 1 in the MSB. + // We also shift numer by the same amount. This cannot overflow because numhi < den. + // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting + // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is + // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it. 
+ shift = libdivide_count_leading_zeros64(den); + den <<= shift; + numhi <<= shift; + numhi |= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63); + numlo <<= shift; - // Break divisor up into two 32-bit digits - vn1 = v >> 32; - vn0 = v & 0xFFFFFFFF; + // Extract the low digits of the numerator and both digits of the denominator. + num1 = (uint32_t)(numlo >> 32); + num0 = (uint32_t)(numlo & 0xFFFFFFFFu); + den1 = (uint32_t)(den >> 32); + den0 = (uint32_t)(den & 0xFFFFFFFFu); - // Break right half of dividend into two digits - un1 = un10 >> 32; - un0 = un10 & 0xFFFFFFFF; + // We wish to compute q1 = [n3 n2 n1] / [d1 d0]. + // Estimate q1 as [n3 n2] / [d1], and then correct it. + // Note while qhat may be 2 digits, q1 is always 1 digit. + qhat = numhi / den1; + rhat = numhi % den1; + c1 = qhat * den0; + c2 = rhat * b + num1; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; + q1 = (uint32_t)qhat; - // Compute the first quotient digit, q1 - q1 = un64 / vn1; - rhat = un64 - q1 * vn1; + // Compute the true (partial) remainder. + rem = numhi * b + num1 - q1 * den; - while (q1 >= b || q1 * vn0 > b * rhat + un1) { - q1 = q1 - 1; - rhat = rhat + vn1; - if (rhat >= b) break; - } + // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0]. + // Estimate q0 as [rem1 rem0] / [d1] and correct it. + qhat = rem / den1; + rhat = rem % den1; + c1 = qhat * den0; + c2 = rhat * b + num0; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; + q0 = (uint32_t)qhat; - // Multiply and subtract - un21 = un64 * b + un1 - q1 * v; - - // Compute the second quotient digit - q0 = un21 / vn1; - rhat = un21 - q0 * vn1; - - while (q0 >= b || q0 * vn0 > b * rhat + un0) { - q0 = q0 - 1; - rhat = rhat + vn1; - if (rhat >= b) break; - } - - *r = (un21 * b + un0 - q0 * v) >> s; - return q1 * b + q0; + // Return remainder if requested. + if (r != NULL) *r = (rem * b + num0 - q0 * den) >> shift; + return ((uint64_t)q1 << 32) | q0; #endif } // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0) -static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t signed_shift) { +static LIBDIVIDE_INLINE void libdivide_u128_shift( + uint64_t *u1, uint64_t *u0, int32_t signed_shift) { if (signed_shift > 0) { uint32_t shift = signed_shift; *u1 <<= shift; @@ -447,7 +580,7 @@ static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t sign } // Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder. -static uint64_t libdivide_128_div_128_to_64( +static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64( uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) { #if defined(HAS_INT128_T) && defined(HAS_INT128_DIV) __uint128_t ufull = u_hi; @@ -544,9 +677,178 @@ static uint64_t libdivide_128_div_128_to_64( #endif } +////////// UINT16 + +static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen( + uint16_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_u16_t result; + uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d)); + + // Power of 2 + if ((d & (d - 1)) == 0) { + // We need to subtract 1 from the shift value in case of an unsigned + // branchfree divider because there is a hardcoded right shift by 1 + // in its division algorithm. Because of this we also need to add back + // 1 in its recovery algorithm. 
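
Stepping back from the power-of-2 special case for a moment: for general divisors the generator relies on the fixed-point reciprocal identity. With k = ceil(log2(d)) and m = ceil(2^(16+k) / d), the quotient (n * m) >> (16 + k) equals n / d for every 16-bit n. The catch is that m may need 17 bits; storing only a 16-bit magic and compensating at runtime is what the ADD_MARKER path below does. A brute-force illustration of the identity (a sketch only, not the packed encoding libdivide actually stores; the helper name is made up):

    #include <assert.h>
    #include <stdint.h>

    // Illustration of the identity, brute-forced for d = 7:
    // k = ceil(log2(7)) = 3, m = ceil(2^19 / 7) = 74899 (a 17-bit value).
    static void check_u16_reciprocal_identity(void) {
        const uint32_t d = 7, k = 3;
        const uint32_t m = (((uint32_t)1 << (16 + k)) + d - 1) / d;  // round-up magic
        for (uint32_t n = 0; n <= 0xFFFF; ++n) {
            uint32_t q = (uint32_t)(((uint64_t)n * m) >> (16 + k));
            assert(q == n / d);
        }
    }
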
+ result.magic = 0; + result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); + } + else { + uint8_t more; + uint16_t rem, proposed_m; + proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem); + + LIBDIVIDE_ASSERT(rem > 0 && rem < d); + const uint16_t e = d - rem; + + // This power works if e < 2**floor_log_2_d. + if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) { + // This power works + more = floor_log_2_d; + } + else { + // We have to use the general 17-bit algorithm. We need to compute + // (2**power) / d. However, we already have (2**(power-1))/d and + // its remainder. By doubling both, and then correcting the + // remainder, we can compute the larger division. + // don't care about overflow here - in fact, we expect it + proposed_m += proposed_m; + const uint16_t twice_rem = rem + rem; + if (twice_rem >= d || twice_rem < rem) proposed_m += 1; + more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + } + result.magic = 1 + proposed_m; + result.more = more; + // result.more's shift should in general be ceil_log_2_d. But if we + // used the smaller power, we subtract one from the shift because we're + // using the smaller power. If we're using the larger power, we + // subtract one from the shift because it's taken care of by the add + // indicator. So floor_log_2_d happens to be correct in both cases. + } + return result; +} + +struct libdivide_u16_t libdivide_u16_gen(uint16_t d) { + return libdivide_internal_u16_gen(d, 0); +} + +struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) { + if (d == 1) { + LIBDIVIDE_ERROR("branchfree divider must be != 1"); + } + struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1); + struct libdivide_u16_branchfree_t ret = { + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK) }; + return ret; +} + +// The original libdivide_u16_do takes a const pointer. However, this cannot be used +// with a compile time constant libdivide_u16_t: it will generate a warning about +// taking the address of a temporary. Hence this overload. +uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) { + if (!magic) { + return numer >> more; + } + else { + uint16_t q = libdivide_mullhi_u16(magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + uint16_t t = ((numer - q) >> 1) + q; + return t >> (more & LIBDIVIDE_16_SHIFT_MASK); + } + else { + // All upper bits are 0, + // don't need to mask them off. + return q >> more; + } + } +} + +uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) { + return libdivide_u16_do_raw(numer, denom->magic, denom->more); +} + +uint16_t libdivide_u16_branchfree_do( + uint16_t numer, const struct libdivide_u16_branchfree_t* denom) { + uint16_t q = libdivide_mullhi_u16(denom->magic, numer); + uint16_t t = ((numer - q) >> 1) + q; + return t >> denom->more; +} + +uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!denom->magic) { + return (uint16_t)1 << shift; + } else if (!(more & LIBDIVIDE_ADD_MARKER)) { + // We compute q = n/d = n*m / 2^(16 + shift) + // Therefore we have d = 2^(16 + shift) / m + // We need to ceil it. 
+ // We know d is not a power of 2, so m is not a power of 2, + // so we can just add 1 to the floor + uint16_t hi_dividend = (uint16_t)1 << shift; + uint16_t rem_ignored; + return 1 + libdivide_32_div_16_to_16(hi_dividend, 0, denom->magic, &rem_ignored); + } else { + // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). + // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now + // Also note that shift may be as high as 15, so shift + 1 will + // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and + // then double the quotient and remainder. + uint32_t half_n = (uint32_t)1 << (16 + shift); + uint32_t d = ( (uint32_t)1 << 16) | denom->magic; + // Note that the quotient is guaranteed <= 16 bits, but the remainder + // may need 17! + uint16_t half_q = (uint16_t)(half_n / d); + uint32_t rem = half_n % d; + // We computed 2^(16+shift)/(m+2^16) + // Need to double it, and then add 1 to the quotient if doubling the + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits + uint16_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + +uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!denom->magic) { + return (uint16_t)1 << (shift + 1); + } else { + // Here we wish to compute d = 2^(16+shift+1)/(m+2^16). + // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now + // Also note that shift may be as high as 15, so shift + 1 will + // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and + // then double the quotient and remainder. + uint32_t half_n = (uint32_t)1 << (16 + shift); + uint32_t d = ((uint32_t)1 << 16) | denom->magic; + // Note that the quotient is guaranteed <= 16 bits, but the remainder + // may need 17! + uint16_t half_q = (uint16_t)(half_n / d); + uint32_t rem = half_n % d; + // We computed 2^(16+shift)/(m+2^16) + // Need to double it, and then add 1 to the quotient if doubling the + // remainder would increase the quotient. + // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits + uint16_t full_q = half_q + half_q + ((rem << 1) >= d); + + // We rounded down in gen (hence +1) + return full_q + 1; + } +} + ////////// UINT32 -static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int branchfree) { +static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen( + uint32_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } @@ -565,15 +867,15 @@ static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int } else { uint8_t more; uint32_t rem, proposed_m; - proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem); + proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << floor_log_2_d, 0, d, &rem); LIBDIVIDE_ASSERT(rem > 0 && rem < d); const uint32_t e = d - rem; // This power works if e < 2**floor_log_2_d. - if (!branchfree && (e < (1U << floor_log_2_d))) { + if (!branchfree && (e < ((uint32_t)1 << floor_log_2_d))) { // This power works - more = floor_log_2_d; + more = (uint8_t)floor_log_2_d; } else { // We have to use the general 33-bit algorithm. We need to compute // (2**power) / d.
However, we already have (2**(power-1))/d and @@ -583,7 +885,7 @@ static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int proposed_m += proposed_m; const uint32_t twice_rem = rem + rem; if (twice_rem >= d || twice_rem < rem) proposed_m += 1; - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } result.magic = 1 + proposed_m; result.more = more; @@ -639,14 +941,14 @@ uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; if (!denom->magic) { - return 1U << shift; + return (uint32_t)1 << shift; } else if (!(more & LIBDIVIDE_ADD_MARKER)) { // We compute q = n/d = n*m / 2^(32 + shift) // Therefore we have d = 2^(32 + shift) / m // We need to ceil it. // We know d is not a power of 2, so m is not a power of 2, // so we can just add 1 to the floor - uint32_t hi_dividend = 1U << shift; + uint32_t hi_dividend = (uint32_t)1 << shift; uint32_t rem_ignored; return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored); } else { @@ -655,8 +957,8 @@ uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { // Also note that shift may be as high as 31, so shift + 1 will // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and // then double the quotient and remainder. - uint64_t half_n = 1ULL << (32 + shift); - uint64_t d = (1ULL << 32) | denom->magic; + uint64_t half_n = (uint64_t)1 << (32 + shift); + uint64_t d = ((uint64_t)1 << 32) | denom->magic; // Note that the quotient is guaranteed <= 32 bits, but the remainder // may need 33! uint32_t half_q = (uint32_t)(half_n / d); @@ -677,15 +979,15 @@ uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_ uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; if (!denom->magic) { - return 1U << (shift + 1); + return (uint32_t)1 << (shift + 1); } else { // Here we wish to compute d = 2^(32+shift+1)/(m+2^32). // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now // Also note that shift may be as high as 31, so shift + 1 will // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and // then double the quotient and remainder. - uint64_t half_n = 1ULL << (32 + shift); - uint64_t d = (1ULL << 32) | denom->magic; + uint64_t half_n = (uint64_t)1 << (32 + shift); + uint64_t d = ((uint64_t)1 << 32) | denom->magic; // Note that the quotient is guaranteed <= 32 bits, but the remainder // may need 33! uint32_t half_q = (uint32_t)(half_n / d); @@ -703,7 +1005,8 @@ uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_ /////////// UINT64 -static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int branchfree) { +static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen( + uint64_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } @@ -723,15 +1026,15 @@ static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int uint64_t proposed_m, rem; uint8_t more; // (1 << (64 + floor_log_2_d)) / d - proposed_m = libdivide_128_div_64_to_64(1ULL << floor_log_2_d, 0, d, &rem); + proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << floor_log_2_d, 0, d, &rem); LIBDIVIDE_ASSERT(rem > 0 && rem < d); const uint64_t e = d - rem; // This power works if e < 2**floor_log_2_d. 
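
As an aside on the recover functions above: they are exact inverses of the generators, reconstructing the original divisor from the stored {magic, more} pair, so a cheap smoke test is to round-trip a range of divisors. A minimal sketch using only the public u32 API (illustrative; branchfree generators reject d == 1, hence the guard):

    #include <assert.h>
    #include <stdint.h>
    #include "libdivide.h"

    // recover() reconstructs the divisor that gen() encoded.
    static void check_recover_roundtrip(void) {
        for (uint32_t d = 1; d < 1000; ++d) {
            struct libdivide_u32_t div = libdivide_u32_gen(d);
            assert(libdivide_u32_recover(&div) == d);
            if (d != 1) {  // branchfree dividers must not be 1
                struct libdivide_u32_branchfree_t bf = libdivide_u32_branchfree_gen(d);
                assert(libdivide_u32_branchfree_recover(&bf) == d);
            }
        }
    }
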
- if (!branchfree && e < (1ULL << floor_log_2_d)) { + if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) { // This power works - more = floor_log_2_d; + more = (uint8_t)floor_log_2_d; } else { // We have to use the general 65-bit algorithm. We need to compute // (2**power) / d. However, we already have (2**(power-1))/d and @@ -741,7 +1044,7 @@ static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int proposed_m += proposed_m; const uint64_t twice_rem = rem + rem; if (twice_rem >= d || twice_rem < rem) proposed_m += 1; - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } result.magic = 1 + proposed_m; result.more = more; @@ -798,14 +1101,14 @@ uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (!denom->magic) { - return 1ULL << shift; + return (uint64_t)1 << shift; } else if (!(more & LIBDIVIDE_ADD_MARKER)) { // We compute q = n/d = n*m / 2^(64 + shift) // Therefore we have d = 2^(64 + shift) / m // We need to ceil it. // We know d is not a power of 2, so m is not a power of 2, // so we can just add 1 to the floor - uint64_t hi_dividend = 1ULL << shift; + uint64_t hi_dividend = (uint64_t)1 << shift; uint64_t rem_ignored; return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored); } else { @@ -817,7 +1120,7 @@ uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { // Full n is a (potentially) 129 bit value // half_n is a 128 bit value // Compute the hi half of half_n. Low half is 0. - uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0; + uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; // d is a 65 bit value. The high bit is always set to 1. const uint64_t d_hi = 1, d_lo = denom->magic; // Note that the quotient is guaranteed <= 64 bits, @@ -842,7 +1145,7 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (!denom->magic) { - return 1ULL << (shift + 1); + return (uint64_t)1 << (shift + 1); } else { // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). // Notice (m + 2^64) is a 65 bit number. This gets hairy. See @@ -852,7 +1155,7 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ // Full n is a (potentially) 129 bit value // half_n is a 128 bit value // Compute the hi half of half_n. Low half is 0. - uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0; + uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0; // d is a 65 bit value. The high bit is always set to 1. const uint64_t d_hi = 1, d_lo = denom->magic; // Note that the quotient is guaranteed <= 64 bits, @@ -872,9 +1175,185 @@ uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_ } } +/////////// SINT16 + +static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen( + int16_t d, int branchfree) { + if (d == 0) { + LIBDIVIDE_ERROR("divider must be != 0"); + } + + struct libdivide_s16_t result; + + // If d is a power of 2, or negative a power of 2, we have to use a shift. + // This is especially important because the magic algorithm fails for -1. + // To check if d is a power of 2 or its inverse, it suffices to check + // whether its absolute value has exactly one bit set. This works even for + // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set + // and is a power of 2. + uint16_t ud = (uint16_t)d; + uint16_t absD = (d < 0) ? 
-ud : ud; + uint16_t floor_log_2_d = 15 - libdivide_count_leading_zeros16(absD); + // check if exactly one bit is set, + // don't care if absD is 0 since that's divide by zero + if ((absD & (absD - 1)) == 0) { + // Branchfree and normal paths are exactly the same + result.magic = 0; + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); + } else { + LIBDIVIDE_ASSERT(floor_log_2_d >= 1); + + uint8_t more; + // the dividend here is 2**(floor_log_2_d + 15), so the low 16 bit word + // is 0 and the high word is floor_log_2_d - 1 + uint16_t rem, proposed_m; + proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << (floor_log_2_d - 1), 0, absD, &rem); + const uint16_t e = absD - rem; + + // We are going to start with a power of floor_log_2_d - 1. + // This works if e < 2**floor_log_2_d. + if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) { + // This power works + more = (uint8_t)(floor_log_2_d - 1); + } else { + // We need to go one higher. This should not make proposed_m + // overflow, but it will make it negative when interpreted as an + // int16_t. + proposed_m += proposed_m; + const uint16_t twice_rem = rem + rem; + if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); + } + + proposed_m += 1; + int16_t magic = (int16_t)proposed_m; + + // Mark if we are negative. Note we only negate the magic number in the + // branchfull case. + if (d < 0) { + more |= LIBDIVIDE_NEGATIVE_DIVISOR; + if (!branchfree) { + magic = -magic; + } + } + + result.more = more; + result.magic = magic; + } + return result; +} + +struct libdivide_s16_t libdivide_s16_gen(int16_t d) { + return libdivide_internal_s16_gen(d, 0); +} + +struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) { + struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1); + struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more}; + return result; +} + +// The original libdivide_s16_do takes a const pointer. However, this cannot be used +// with a compile time constant libdivide_s16_t: it will generate a warning about +// taking the address of a temporary. Hence this overload. +int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) { + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + + if (!magic) { + uint16_t sign = (int8_t)more >> 7; + uint16_t mask = ((uint16_t)1 << shift) - 1; + uint16_t uq = numer + ((numer >> 15) & mask); + int16_t q = (int16_t)uq; + q >>= shift; + q = (q ^ sign) - sign; + return q; + } else { + uint16_t uq = (uint16_t)libdivide_mullhi_s16(magic, numer); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift and then sign extend + int16_t sign = (int8_t)more >> 7; + // q += (more < 0 ?
-numer : numer) + // cast required to avoid UB + uq += ((uint16_t)numer ^ sign) - sign; + } + int16_t q = (int16_t)uq; + q >>= shift; + q += (q < 0); + return q; + } +} + +int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) { + return libdivide_s16_do_raw(numer, denom->magic, denom->more); +} + +int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + // must be arithmetic shift and then sign extend + int16_t sign = (int8_t)more >> 7; + int16_t magic = denom->magic; + int16_t q = libdivide_mullhi_s16(magic, numer); + q += numer; + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is a power of + // 2, or (2**shift) if it is not a power of 2 + uint16_t is_power_of_2 = (magic == 0); + uint16_t q_sign = (uint16_t)(q >> 15); + q += q_sign & (((uint16_t)1 << shift) - is_power_of_2); + + // Now arithmetic right shift + q >>= shift; + // Negate if needed + q = (q ^ sign) - sign; + + return q; +} + +int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) { + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + if (!denom->magic) { + uint16_t absD = (uint16_t)1 << shift; + if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { + absD = -absD; + } + return (int16_t)absD; + } else { + // Unsigned math is much easier + // We negate the magic number only in the branchfull case, and we don't + // know which case we're in. However we have enough information to + // determine the correct sign of the magic number. The divisor was + // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set, + // the magic number's sign is opposite that of the divisor. + // We want to compute the positive magic number. + int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); + int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; + + // Handle the power of 2 case (including branchfree) + if (denom->magic == 0) { + int16_t result = (uint16_t)1 << shift; + return negative_divisor ? -result : result; + } + + uint16_t d = (uint16_t)(magic_was_negated ? -denom->magic : denom->magic); + uint32_t n = (uint32_t)1 << (16 + shift); // this shift cannot exceed 30 + uint16_t q = (uint16_t)(n / d); + int16_t result = (int16_t)q; + result += 1; + return negative_divisor ? -result : result; + } +} + +int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) { + return libdivide_s16_recover((const struct libdivide_s16_t *)denom); +} + /////////// SINT32 -static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int branchfree) { +static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen( + int32_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } @@ -895,7 +1374,7 @@ static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int b if ((absD & (absD - 1)) == 0) { // Branchfree and normal paths are exactly the same result.magic = 0; - result.more = floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0); + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? 
LIBDIVIDE_NEGATIVE_DIVISOR : 0)); } else { LIBDIVIDE_ASSERT(floor_log_2_d >= 1); @@ -903,14 +1382,14 @@ static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int b // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word // is 0 and the high word is floor_log_2_d - 1 uint32_t rem, proposed_m; - proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem); + proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << (floor_log_2_d - 1), 0, absD, &rem); const uint32_t e = absD - rem; // We are going to start with a power of floor_log_2_d - 1. // This works if works if e < 2**floor_log_2_d. - if (!branchfree && e < (1U << floor_log_2_d)) { + if (!branchfree && e < ((uint32_t)1 << floor_log_2_d)) { // This power works - more = floor_log_2_d - 1; + more = (uint8_t)(floor_log_2_d - 1); } else { // We need to go one higher. This should not make proposed_m // overflow, but it will make it negative when interpreted as an @@ -918,7 +1397,7 @@ static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int b proposed_m += proposed_m; const uint32_t twice_rem = rem + rem; if (twice_rem >= absD || twice_rem < rem) proposed_m += 1; - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } proposed_m += 1; @@ -955,7 +1434,7 @@ int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { if (!denom->magic) { uint32_t sign = (int8_t)more >> 7; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; uint32_t uq = numer + ((numer >> 31) & mask); int32_t q = (int32_t)uq; q >>= shift; @@ -991,7 +1470,7 @@ int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_br // 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); uint32_t q_sign = (uint32_t)(q >> 31); - q += q_sign & ((1U << shift) - is_power_of_2); + q += q_sign & (((uint32_t)1 << shift) - is_power_of_2); // Now arithmetic right shift q >>= shift; @@ -1005,7 +1484,7 @@ int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; if (!denom->magic) { - uint32_t absD = 1U << shift; + uint32_t absD = (uint32_t)1 << shift; if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { absD = -absD; } @@ -1023,12 +1502,12 @@ int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { // Handle the power of 2 case (including branchfree) if (denom->magic == 0) { - int32_t result = 1U << shift; + int32_t result = (uint32_t)1 << shift; return negative_divisor ? -result : result; } uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic); - uint64_t n = 1ULL << (32 + shift); // this shift cannot exceed 30 + uint64_t n = (uint64_t)1 << (32 + shift); // this shift cannot exceed 30 uint32_t q = (uint32_t)(n / d); int32_t result = (int32_t)q; result += 1; @@ -1042,7 +1521,8 @@ int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t ///////////// SINT64 -static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int branchfree) { +static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen( + int64_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } @@ -1063,20 +1543,20 @@ static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int b if ((absD & (absD - 1)) == 0) { // Branchfree and non-branchfree cases are the same result.magic = 0; - result.more = floor_log_2_d | (d < 0 ? 
LIBDIVIDE_NEGATIVE_DIVISOR : 0); + result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0)); } else { // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word // is 0 and the high word is floor_log_2_d - 1 uint8_t more; uint64_t rem, proposed_m; - proposed_m = libdivide_128_div_64_to_64(1ULL << (floor_log_2_d - 1), 0, absD, &rem); + proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << (floor_log_2_d - 1), 0, absD, &rem); const uint64_t e = absD - rem; // We are going to start with a power of floor_log_2_d - 1. // This works if works if e < 2**floor_log_2_d. - if (!branchfree && e < (1ULL << floor_log_2_d)) { + if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) { // This power works - more = floor_log_2_d - 1; + more = (uint8_t)(floor_log_2_d - 1); } else { // We need to go one higher. This should not make proposed_m // overflow, but it will make it negative when interpreted as an @@ -1088,7 +1568,7 @@ static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int b // also set ADD_MARKER this is an annoying optimization that // enables algorithm #4 to avoid the mask. However we always set it // in the branchfree case - more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; + more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER); } proposed_m += 1; int64_t magic = (int64_t)proposed_m; @@ -1122,7 +1602,7 @@ int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (!denom->magic) { // shift path - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = ((uint64_t)1 << shift) - 1; uint64_t uq = numer + ((numer >> 63) & mask); int64_t q = (int64_t)uq; q >>= shift; @@ -1160,7 +1640,7 @@ int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_br // 2, or (2**shift) if it is not a power of 2. uint64_t is_power_of_2 = (magic == 0); uint64_t q_sign = (uint64_t)(q >> 63); - q += q_sign & ((1ULL << shift) - is_power_of_2); + q += q_sign & (((uint64_t)1 << shift) - is_power_of_2); // Arithmetic right shift q >>= shift; @@ -1174,7 +1654,7 @@ int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (denom->magic == 0) { // shift path - uint64_t absD = 1ULL << shift; + uint64_t absD = (uint64_t)1 << shift; if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { absD = -absD; } @@ -1185,7 +1665,7 @@ int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic); - uint64_t n_hi = 1ULL << shift, n_lo = 0; + uint64_t n_hi = (uint64_t)1 << shift, n_lo = 0; uint64_t rem_ignored; uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored); int64_t result = (int64_t)(q + 1); @@ -1200,67 +1680,87 @@ int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t return libdivide_s64_recover((const struct libdivide_s64_t *)denom); } +// Simplest possible vector type division: treat the vector type as an array +// of underlying native type. 
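
In other words, the new 16-bit vector paths do not get dedicated SIMD kernels; the macro below simply reinterprets the vector as an array of scalars and runs the scalar routine on each lane. Roughly what that amounts to for a NEON uint16x8_t (an illustrative hand expansion under the assumption of a NEON target; the function name is made up here):

    #include <arm_neon.h>
    #include <stddef.h>
    #include "libdivide.h"

    // Hypothetical hand-expansion of the lane-by-lane fallback for uint16x8_t.
    static inline uint16x8_t u16_divide_lanewise(
        uint16x8_t numers, const struct libdivide_u16_t *denom) {
        uint16x8_t result;
        uint16_t *pSource = (uint16_t *)&numers;
        uint16_t *pTarget = (uint16_t *)&result;
        for (size_t lane = 0; lane < sizeof(result) / sizeof(uint16_t); ++lane) {
            pTarget[lane] = libdivide_u16_do(pSource[lane], denom);  // scalar path per lane
        }
        return result;
    }

This trades speed for simplicity; the 32- and 64-bit paths further down keep their hand-written NEON/AVX kernels.
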
+#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \ + const size_t count = sizeof(VecT) / sizeof(IntT); \ + VecT result; \ + IntT *pSource = (IntT *)&numers; \ + IntT *pTarget = (IntT *)&result; \ + for (size_t loop=0; loop(amt); +static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) { + int32_t wamt = (int32_t)(amt); return vshlq_u32(v, vdupq_n_s32(-wamt)); } -static inline uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) { - int64_t wamt = static_cast(amt); +static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) { + int64_t wamt = (int64_t)(amt); return vshlq_u64(v, vdupq_n_s64(-wamt)); } // Arithmetic right shift by runtime value. -static inline int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) { - int32_t wamt = static_cast(amt); +static LIBDIVIDE_INLINE int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) { + int32_t wamt = (int32_t)(amt); return vshlq_s32(v, vdupq_n_s32(-wamt)); } -static inline int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) { - int64_t wamt = static_cast(amt); +static LIBDIVIDE_INLINE int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) { + int64_t wamt = (int64_t)(amt); return vshlq_s64(v, vdupq_n_s64(-wamt)); } -static inline int64x2_t libdivide_s64_signbits(int64x2_t v) { return vshrq_n_s64(v, 63); } +static LIBDIVIDE_INLINE int64x2_t libdivide_s64_signbits(int64x2_t v) { return vshrq_n_s64(v, 63); } -static inline uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) { +static LIBDIVIDE_INLINE uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) { // Desire is [x0, x1, x2, x3] uint32x4_t w1 = vreinterpretq_u32_u64(vmull_n_u32(vget_low_u32(a), b)); // [_, x0, _, x1] uint32x4_t w2 = vreinterpretq_u32_u64(vmull_high_n_u32(a, b)); //[_, x2, _, x3] return vuzp2q_u32(w1, w2); // [x0, x1, x2, x3] } -static inline int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) { +static LIBDIVIDE_INLINE int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) { int32x4_t w1 = vreinterpretq_s32_s64(vmull_n_s32(vget_low_s32(a), b)); // [_, x0, _, x1] int32x4_t w2 = vreinterpretq_s32_s64(vmull_high_n_s32(a, b)); //[_, x2, _, x3] return vuzp2q_s32(w1, w2); // [x0, x1, x2, x3] } -static inline uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) { +static LIBDIVIDE_INLINE uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) { // full 128 bits product is: // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64) // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64. 
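
The same four-partial-product decomposition can be written as a plain scalar helper, essentially the portable strategy a 64-bit high multiply falls back to when no 64x64->128 primitive such as __umulh or a 128-bit integer type is available (a sketch for illustration; the helper name is not part of the header):

    #include <stdint.h>

    // High 64 bits of a 64x64 multiply, built from 32x32->64 partial products.
    static inline uint64_t mullhi_u64_portable(uint64_t x, uint64_t y) {
        uint64_t x0 = (uint32_t)x, x1 = x >> 32;
        uint64_t y0 = (uint32_t)y, y1 = y >> 32;
        uint64_t lo = x0 * y0;
        uint64_t mid1 = x1 * y0 + (lo >> 32);      // cannot overflow 64 bits
        uint64_t mid2 = x0 * y1 + (uint32_t)mid1;  // low half of mid1 folded in
        return x1 * y1 + (mid1 >> 32) + (mid2 >> 32);
    }
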
@@ -1291,9 +1791,9 @@ static inline uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) return result; } -static inline int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) { +static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) { int64x2_t p = vreinterpretq_s64_u64( - libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), static_cast(sy))); + libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy))); int64x2_t y = vdupq_n_s64(sy); int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y); int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x); @@ -1302,6 +1802,16 @@ static inline int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) { return p; } +////////// UINT16 + +uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16) +} + +uint16x8_t libdivide_u16_branchfree_do_vec128(uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree) +} + ////////// UINT32 uint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, const struct libdivide_u32_t *denom) { @@ -1358,13 +1868,23 @@ uint64x2_t libdivide_u64_branchfree_do_vec128( return libdivide_u64_neon_srl(t, denom->more); } +////////// SINT16 + +int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16) +} + +int16x8_t libdivide_s16_branchfree_do_vec128(int16x8_t numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree) +} + ////////// SINT32 int32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; int32x4_t roundToZeroTweak = vdupq_n_s32((int)mask); // q = numer + ((numer >> 31) & roundToZeroTweak); int32x4_t q = vaddq_s32(numers, vandq_s32(vshrq_n_s32(numers, 31), roundToZeroTweak)); @@ -1404,7 +1924,7 @@ int32x4_t libdivide_s32_branchfree_do_vec128( // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); int32x4_t q_sign = vshrq_n_s32(q, 31); // q_sign = q >> 31 - int32x4_t mask = vdupq_n_s32((1U << shift) - is_power_of_2); + int32x4_t mask = vdupq_n_s32(((uint32_t)1 << shift) - is_power_of_2); q = vaddq_s32(q, vandq_s32(q_sign, mask)); // q = q + (q_sign & mask) q = libdivide_s32_neon_sra(q, shift); // q >>= shift q = vsubq_s32(veorq_s32(q, sign), sign); // q = (q ^ sign) - sign @@ -1418,7 +1938,7 @@ int64x2_t libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t int64_t magic = denom->magic; if (magic == 0) { // shift path uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = ((uint64_t)1 << shift) - 1; int64x2_t roundToZeroTweak = vdupq_n_s64(mask); // TODO: no need to sign extend // q = numer + ((numer >> 63) & roundToZeroTweak); int64x2_t q = @@ -1461,7 +1981,7 @@ int64x2_t libdivide_s64_branchfree_do_vec128( // a power of 2, or (2**shift) if it is not a power of 2. 
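
Usage of these vector entry points mirrors the scalar API: generate the divider once, then feed whole registers through it. A minimal sketch (assumes a NEON build where LIBDIVIDE_NEON is defined; the loop ignores any tail that is not a multiple of 4):

    #include <arm_neon.h>
    #include <stddef.h>
    #include "libdivide.h"

    // Divide four uint32 lanes at a time with one precomputed divider.
    static void divide_all_neon(uint32_t *buf, size_t n, uint32_t d) {
        struct libdivide_u32_t div = libdivide_u32_gen(d);
        for (size_t i = 0; i + 4 <= n; i += 4) {
            uint32x4_t v = vld1q_u32(&buf[i]);
            v = libdivide_u32_do_vec128(v, &div);  // four quotients per iteration
            vst1q_u32(&buf[i], v);
        }
    }
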
uint32_t is_power_of_2 = (magic == 0); int64x2_t q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 - int64x2_t mask = vdupq_n_s64((1ULL << shift) - is_power_of_2); + int64x2_t mask = vdupq_n_s64(((uint64_t)1 << shift) - is_power_of_2); q = vaddq_s64(q, vandq_s64(q_sign, mask)); // q = q + (q_sign & mask) q = libdivide_s64_neon_sra(q, shift); // q >>= shift q = vsubq_s64(veorq_s64(q, sign), sign); // q = (q ^ sign) - sign @@ -1472,33 +1992,45 @@ int64x2_t libdivide_s64_branchfree_do_vec128( #if defined(LIBDIVIDE_AVX512) -static inline __m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom); -static inline __m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom); -static inline __m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom); -static inline __m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u16_do_vec512( + __m512i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s16_do_vec512( + __m512i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u32_do_vec512( + __m512i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s32_do_vec512( + __m512i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u64_do_vec512( + __m512i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s64_do_vec512( + __m512i numers, const struct libdivide_s64_t *denom); -static inline __m512i libdivide_u32_branchfree_do_vec512( +static LIBDIVIDE_INLINE __m512i libdivide_u16_branchfree_do_vec512( + __m512i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_s16_branchfree_do_vec512( + __m512i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m512i libdivide_u32_branchfree_do_vec512( __m512i numers, const struct libdivide_u32_branchfree_t *denom); -static inline __m512i libdivide_s32_branchfree_do_vec512( +static LIBDIVIDE_INLINE __m512i libdivide_s32_branchfree_do_vec512( __m512i numers, const struct libdivide_s32_branchfree_t *denom); -static inline __m512i libdivide_u64_branchfree_do_vec512( +static LIBDIVIDE_INLINE __m512i libdivide_u64_branchfree_do_vec512( __m512i numers, const struct libdivide_u64_branchfree_t *denom); -static inline __m512i libdivide_s64_branchfree_do_vec512( +static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512( __m512i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions -static inline __m512i libdivide_s64_signbits(__m512i v) { +static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) { ; return _mm512_srai_epi64(v, 63); } -static inline __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) { +static LIBDIVIDE_INLINE __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) { return _mm512_srai_epi64(v, amt); } // Here, b is assumed to contain one 32-bit value repeated. 
-static inline __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) { +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) { __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32); __m512i a1X3X = _mm512_srli_epi64(a, 32); __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); @@ -1507,7 +2039,7 @@ static inline __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) { } // b is one 32-bit value repeated. -static inline __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) { +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) { __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32); __m512i a1X3X = _mm512_srli_epi64(a, 32); __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); @@ -1516,7 +2048,7 @@ static inline __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) { } // Here, y is assumed to contain one 64-bit value repeated. -static inline __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) { +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) { // see m128i variant for comments. __m512i x0y0 = _mm512_mul_epu32(x, y); __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32); @@ -1539,15 +2071,25 @@ static inline __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) { } // y is one 64-bit value repeated. -static inline __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) { +static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) { __m512i p = libdivide_mullhi_u64_vec512(x, y); - __m512i t1 = _mm512_and_si512(libdivide_s64_signbits(x), y); - __m512i t2 = _mm512_and_si512(libdivide_s64_signbits(y), x); + __m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y); + __m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x); p = _mm512_sub_epi64(p, t1); p = _mm512_sub_epi64(p, t2); return p; } +////////// UINT16 + +__m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16) +} + +__m512i libdivide_u16_branchfree_do_vec512(__m512i numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree) +} + ////////// UINT32 __m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) { @@ -1602,13 +2144,23 @@ __m512i libdivide_u64_branchfree_do_vec512( return _mm512_srli_epi64(t, denom->more); } +////////// SINT16 + +__m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16) +} + +__m512i libdivide_s16_branchfree_do_vec512(__m512i numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree) +} + ////////// SINT32 __m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; __m512i roundToZeroTweak = _mm512_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); __m512i q = _mm512_add_epi32( @@ -1648,7 +2200,7 @@ __m512i libdivide_s32_branchfree_do_vec512( // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31 - __m512i mask = _mm512_set1_epi32((1U << 
shift) - is_power_of_2); + __m512i mask = _mm512_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) q = _mm512_srai_epi32(q, shift); // q >>= shift q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign @@ -1662,11 +2214,11 @@ __m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *de int64_t magic = denom->magic; if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = ((uint64_t)1 << shift) - 1; __m512i roundToZeroTweak = _mm512_set1_epi64(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); __m512i q = _mm512_add_epi64( - numers, _mm512_and_si512(libdivide_s64_signbits(numers), roundToZeroTweak)); + numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), roundToZeroTweak)); q = libdivide_s64_shift_right_vec512(q, shift); __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; @@ -1703,8 +2255,8 @@ __m512i libdivide_s64_branchfree_do_vec512( // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. uint32_t is_power_of_2 = (magic == 0); - __m512i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 - __m512i mask = _mm512_set1_epi64((1ULL << shift) - is_power_of_2); + __m512i q_sign = libdivide_s64_signbits_vec512(q); // q_sign = q >> 63 + __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2); q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) q = libdivide_s64_shift_right_vec512(q, shift); // q >>= shift q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign @@ -1715,40 +2267,52 @@ __m512i libdivide_s64_branchfree_do_vec512( #if defined(LIBDIVIDE_AVX2) -static inline __m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom); -static inline __m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom); -static inline __m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom); -static inline __m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u16_do_vec256( + __m256i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s16_do_vec256( + __m256i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u32_do_vec256( + __m256i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s32_do_vec256( + __m256i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u64_do_vec256( + __m256i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s64_do_vec256( + __m256i numers, const struct libdivide_s64_t *denom); -static inline __m256i libdivide_u32_branchfree_do_vec256( +static LIBDIVIDE_INLINE __m256i libdivide_u16_branchfree_do_vec256( + __m256i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_s16_branchfree_do_vec256( + __m256i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m256i libdivide_u32_branchfree_do_vec256( __m256i numers, const struct libdivide_u32_branchfree_t *denom); -static inline __m256i libdivide_s32_branchfree_do_vec256( +static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256( 
+static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256( __m256i numers, const struct libdivide_s32_branchfree_t *denom); -static inline __m256i libdivide_u64_branchfree_do_vec256( +static LIBDIVIDE_INLINE __m256i libdivide_u64_branchfree_do_vec256( __m256i numers, const struct libdivide_u64_branchfree_t *denom); -static inline __m256i libdivide_s64_branchfree_do_vec256( +static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256( __m256i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions // Implementation of _mm256_srai_epi64(v, 63) (from AVX512). -static inline __m256i libdivide_s64_signbits(__m256i v) { +static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) { __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31); return signBits; } // Implementation of _mm256_srai_epi64 (from AVX512). -static inline __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) { +static LIBDIVIDE_INLINE __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) { const int b = 64 - amt; - __m256i m = _mm256_set1_epi64x(1ULL << (b - 1)); + __m256i m = _mm256_set1_epi64x((uint64_t)1 << (b - 1)); __m256i x = _mm256_srli_epi64(v, amt); __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m); return result; } // Here, b is assumed to contain one 32-bit value repeated. -static inline __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) { +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) { __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32); __m256i a1X3X = _mm256_srli_epi64(a, 32); __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); @@ -1757,7 +2321,7 @@ static inline __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) { } // b is one 32-bit value repeated. -static inline __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) { +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) { __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32); __m256i a1X3X = _mm256_srli_epi64(a, 32); __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); @@ -1766,7 +2330,7 @@ static inline __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) { } // Here, y is assumed to contain one 64-bit value repeated. -static inline __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) { +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) { // see m128i variant for comments. __m256i x0y0 = _mm256_mul_epu32(x, y); __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32); @@ -1789,15 +2353,25 @@ static inline __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) { } // y is one 64-bit value repeated. 
-static inline __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) { +static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) { __m256i p = libdivide_mullhi_u64_vec256(x, y); - __m256i t1 = _mm256_and_si256(libdivide_s64_signbits(x), y); - __m256i t2 = _mm256_and_si256(libdivide_s64_signbits(y), x); + __m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y); + __m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x); p = _mm256_sub_epi64(p, t1); p = _mm256_sub_epi64(p, t2); return p; } +////////// UINT16 + +__m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16) +} + +__m256i libdivide_u16_branchfree_do_vec256(__m256i numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16_branchfree) +} + ////////// UINT32 __m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) { @@ -1852,13 +2426,23 @@ __m256i libdivide_u64_branchfree_do_vec256( return _mm256_srli_epi64(t, denom->more); } +////////// SINT16 + +__m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16) +} + +__m256i libdivide_s16_branchfree_do_vec256(__m256i numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16_branchfree) +} + ////////// SINT32 __m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; __m256i roundToZeroTweak = _mm256_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); __m256i q = _mm256_add_epi32( @@ -1898,7 +2482,7 @@ __m256i libdivide_s32_branchfree_do_vec256( // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31 - __m256i mask = _mm256_set1_epi32((1U << shift) - is_power_of_2); + __m256i mask = _mm256_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) q = _mm256_srai_epi32(q, shift); // q >>= shift q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign @@ -1912,11 +2496,11 @@ __m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *de int64_t magic = denom->magic; if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = ((uint64_t)1 << shift) - 1; __m256i roundToZeroTweak = _mm256_set1_epi64x(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); __m256i q = _mm256_add_epi64( - numers, _mm256_and_si256(libdivide_s64_signbits(numers), roundToZeroTweak)); + numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak)); q = libdivide_s64_shift_right_vec256(q, shift); __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; @@ -1953,8 +2537,8 @@ __m256i libdivide_s64_branchfree_do_vec256( // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. 
uint32_t is_power_of_2 = (magic == 0); - __m256i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 - __m256i mask = _mm256_set1_epi64x((1ULL << shift) - is_power_of_2); + __m256i q_sign = libdivide_s64_signbits_vec256(q); // q_sign = q >> 63 + __m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) q = libdivide_s64_shift_right_vec256(q, shift); // q >>= shift q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign @@ -1965,40 +2549,52 @@ __m256i libdivide_s64_branchfree_do_vec256( #if defined(LIBDIVIDE_SSE2) -static inline __m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom); -static inline __m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom); -static inline __m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom); -static inline __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u16_do_vec128( + __m128i numers, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s16_do_vec128( + __m128i numers, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u32_do_vec128( + __m128i numers, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s32_do_vec128( + __m128i numers, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u64_do_vec128( + __m128i numers, const struct libdivide_u64_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s64_do_vec128( + __m128i numers, const struct libdivide_s64_t *denom); -static inline __m128i libdivide_u32_branchfree_do_vec128( +static LIBDIVIDE_INLINE __m128i libdivide_u16_branchfree_do_vec128( + __m128i numers, const struct libdivide_u16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_s16_branchfree_do_vec128( + __m128i numers, const struct libdivide_s16_branchfree_t *denom); +static LIBDIVIDE_INLINE __m128i libdivide_u32_branchfree_do_vec128( __m128i numers, const struct libdivide_u32_branchfree_t *denom); -static inline __m128i libdivide_s32_branchfree_do_vec128( +static LIBDIVIDE_INLINE __m128i libdivide_s32_branchfree_do_vec128( __m128i numers, const struct libdivide_s32_branchfree_t *denom); -static inline __m128i libdivide_u64_branchfree_do_vec128( +static LIBDIVIDE_INLINE __m128i libdivide_u64_branchfree_do_vec128( __m128i numers, const struct libdivide_u64_branchfree_t *denom); -static inline __m128i libdivide_s64_branchfree_do_vec128( +static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128( __m128i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions // Implementation of _mm_srai_epi64(v, 63) (from AVX512). -static inline __m128i libdivide_s64_signbits(__m128i v) { +static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) { __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); return signBits; } // Implementation of _mm_srai_epi64 (from AVX512). 
-static inline __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) { +static LIBDIVIDE_INLINE __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) { const int b = 64 - amt; - __m128i m = _mm_set1_epi64x(1ULL << (b - 1)); + __m128i m = _mm_set1_epi64x((uint64_t)1 << (b - 1)); __m128i x = _mm_srli_epi64(v, amt); __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); return result; } // Here, b is assumed to contain one 32-bit value repeated. -static inline __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) { +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) { __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); __m128i a1X3X = _mm_srli_epi64(a, 32); __m128i mask = _mm_set_epi32(-1, 0, -1, 0); @@ -2009,7 +2605,7 @@ static inline __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) { // SSE2 does not have a signed multiplication instruction, but we can convert // unsigned to signed pretty efficiently. Again, b is just a 32 bit value // repeated four times. -static inline __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) { +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) { __m128i p = libdivide_mullhi_u32_vec128(a, b); // t1 = (a >> 31) & y, arithmetic shift __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); @@ -2020,7 +2616,7 @@ static inline __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) { } // Here, y is assumed to contain one 64-bit value repeated. -static inline __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) { +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) { // full 128 bits product is: // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64) // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64. @@ -2053,15 +2649,25 @@ static inline __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) { } // y is one 64-bit value repeated. 
-static inline __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) { +static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) { __m128i p = libdivide_mullhi_u64_vec128(x, y); - __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y); - __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x); + __m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y); + __m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x); p = _mm_sub_epi64(p, t1); p = _mm_sub_epi64(p, t2); return p; } +////////// UINT16 + +__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16) +} + +__m128i libdivide_u16_branchfree_do_vec128(__m128i numers, const struct libdivide_u16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16_branchfree) +} + ////////// UINT32 __m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom) { @@ -2116,13 +2722,23 @@ __m128i libdivide_u64_branchfree_do_vec128( return _mm_srli_epi64(t, denom->more); } +////////// SINT16 + +__m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16) +} + +__m128i libdivide_s16_branchfree_do_vec128(__m128i numers, const struct libdivide_s16_branchfree_t *denom) { + SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16_branchfree) +} + ////////// SINT32 __m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - uint32_t mask = (1U << shift) - 1; + uint32_t mask = ((uint32_t)1 << shift) - 1; __m128i roundToZeroTweak = _mm_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); __m128i q = @@ -2162,7 +2778,7 @@ __m128i libdivide_s32_branchfree_do_vec128( // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 - __m128i mask = _mm_set1_epi32((1U << shift) - is_power_of_2); + __m128i mask = _mm_set1_epi32(((uint32_t)1 << shift) - is_power_of_2); q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) q = _mm_srai_epi32(q, shift); // q >>= shift q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign @@ -2176,11 +2792,11 @@ __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *de int64_t magic = denom->magic; if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - uint64_t mask = (1ULL << shift) - 1; + uint64_t mask = ((uint64_t)1 << shift) - 1; __m128i roundToZeroTweak = _mm_set1_epi64x(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); __m128i q = - _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); + _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak)); q = libdivide_s64_shift_right_vec128(q, shift); __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; @@ -2217,8 +2833,8 @@ __m128i libdivide_s64_branchfree_do_vec128( // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2.
uint32_t is_power_of_2 = (magic == 0); - __m128i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 - __m128i mask = _mm_set1_epi64x((1ULL << shift) - is_power_of_2); + __m128i q_sign = libdivide_s64_signbits_vec128(q); // q_sign = q >> 63 + __m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2); q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) q = libdivide_s64_shift_right_vec128(q, shift); // q >>= shift q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign @@ -2241,6 +2857,16 @@ enum Branching { template <typename T> struct NeonVecFor {}; +template <> +struct NeonVecFor<uint16_t> { + typedef uint16x8_t type; +}; + +template <> +struct NeonVecFor<int16_t> { + typedef int16x8_t type; +}; + template <> struct NeonVecFor<uint32_t> { typedef uint32x4_t type; @@ -2264,82 +2890,105 @@ struct NeonVecFor { // Versions of our algorithms for SIMD. #if defined(LIBDIVIDE_NEON) -#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \ - typename NeonVecFor<INT_TYPE>::type divide(typename NeonVecFor<INT_TYPE>::type n) const { \ - return libdivide_##ALGO##_do_vec128(n, &denom); \ +#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \ + LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \ + typename NeonVecFor<INT_TYPE>::type n) const { \ + return libdivide_##ALGO##_do_vec128(n, &denom); \ } #else #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) #endif #if defined(LIBDIVIDE_SSE2) -#define LIBDIVIDE_DIVIDE_SSE2(ALGO) \ - __m128i divide(__m128i n) const { return libdivide_##ALGO##_do_vec128(n, &denom); } +#define LIBDIVIDE_DIVIDE_SSE2(ALGO) \ + LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \ + return libdivide_##ALGO##_do_vec128(n, &denom); \ + } #else #define LIBDIVIDE_DIVIDE_SSE2(ALGO) #endif #if defined(LIBDIVIDE_AVX2) -#define LIBDIVIDE_DIVIDE_AVX2(ALGO) \ - __m256i divide(__m256i n) const { return libdivide_##ALGO##_do_vec256(n, &denom); } +#define LIBDIVIDE_DIVIDE_AVX2(ALGO) \ + LIBDIVIDE_INLINE __m256i divide(__m256i n) const { \ + return libdivide_##ALGO##_do_vec256(n, &denom); \ + } #else #define LIBDIVIDE_DIVIDE_AVX2(ALGO) #endif #if defined(LIBDIVIDE_AVX512) -#define LIBDIVIDE_DIVIDE_AVX512(ALGO) \ - __m512i divide(__m512i n) const { return libdivide_##ALGO##_do_vec512(n, &denom); } +#define LIBDIVIDE_DIVIDE_AVX512(ALGO) \ + LIBDIVIDE_INLINE __m512i divide(__m512i n) const { \ + return libdivide_##ALGO##_do_vec512(n, &denom); \ + } #else #define LIBDIVIDE_DIVIDE_AVX512(ALGO) #endif // The DISPATCHER_GEN() macro generates C++ methods (for the given integer // and algorithm types) that redirect to libdivide's C API. -#define DISPATCHER_GEN(T, ALGO) \ - libdivide_##ALGO##_t denom; \ - dispatcher() {} \ - dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \ - T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \ - T recover() const { return libdivide_##ALGO##_recover(&denom); } \ - LIBDIVIDE_DIVIDE_NEON(ALGO, T) \ - LIBDIVIDE_DIVIDE_SSE2(ALGO) \ - LIBDIVIDE_DIVIDE_AVX2(ALGO) \ +#define DISPATCHER_GEN(T, ALGO) \ + libdivide_##ALGO##_t denom; \ + LIBDIVIDE_INLINE dispatcher() {} \ + LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \ + LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \ + LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \ + LIBDIVIDE_DIVIDE_NEON(ALGO, T) \ + LIBDIVIDE_DIVIDE_SSE2(ALGO) \ + LIBDIVIDE_DIVIDE_AVX2(ALGO) \ LIBDIVIDE_DIVIDE_AVX512(ALGO) // The dispatcher selects a specific division algorithm for a given // type and ALGO using partial template specialization.
-template <bool IS_INTEGRAL, bool IS_SIGNED, int SIZEOF, Branching ALGO> +template <typename T, Branching ALGO> struct dispatcher {}; template <> -struct dispatcher<true, true, sizeof(int32_t), BRANCHFULL> { +struct dispatcher<int16_t, BRANCHFULL> { + DISPATCHER_GEN(int16_t, s16) +}; +template <> +struct dispatcher<int16_t, BRANCHFREE> { + DISPATCHER_GEN(int16_t, s16_branchfree) +}; +template <> +struct dispatcher<uint16_t, BRANCHFULL> { + DISPATCHER_GEN(uint16_t, u16) +}; +template <> +struct dispatcher<uint16_t, BRANCHFREE> { + DISPATCHER_GEN(uint16_t, u16_branchfree) +}; +template <> +struct dispatcher<int32_t, BRANCHFULL> { DISPATCHER_GEN(int32_t, s32) }; template <> -struct dispatcher<true, true, sizeof(int32_t), BRANCHFREE> { +struct dispatcher<int32_t, BRANCHFREE> { DISPATCHER_GEN(int32_t, s32_branchfree) }; template <> -struct dispatcher<true, false, sizeof(uint32_t), BRANCHFULL> { +struct dispatcher<uint32_t, BRANCHFULL> { DISPATCHER_GEN(uint32_t, u32) }; template <> -struct dispatcher<true, false, sizeof(uint32_t), BRANCHFREE> { +struct dispatcher<uint32_t, BRANCHFREE> { DISPATCHER_GEN(uint32_t, u32_branchfree) }; template <> -struct dispatcher<true, true, sizeof(int64_t), BRANCHFULL> { +struct dispatcher<int64_t, BRANCHFULL> { DISPATCHER_GEN(int64_t, s64) }; template <> -struct dispatcher<true, true, sizeof(int64_t), BRANCHFREE> { +struct dispatcher<int64_t, BRANCHFREE> { DISPATCHER_GEN(int64_t, s64_branchfree) }; template <> -struct dispatcher<true, false, sizeof(uint64_t), BRANCHFULL> { +struct dispatcher<uint64_t, BRANCHFULL> { DISPATCHER_GEN(uint64_t, u64) }; template <> -struct dispatcher<true, false, sizeof(uint64_t), BRANCHFREE> { +struct dispatcher<uint64_t, BRANCHFREE> { DISPATCHER_GEN(uint64_t, u64_branchfree) }; @@ -2348,6 +2997,9 @@ struct dispatcher { // based on the integer and algorithm template parameters. template <typename T, Branching ALGO = BRANCHFULL> class divider { + private: + typedef dispatcher<T, ALGO> dispatcher_t; + public: // We leave the default constructor empty so that creating // an array of dividers and then initializing them @@ -2355,10 +3007,10 @@ class divider { divider() {} // Constructor that takes the divisor as a parameter - divider(T d) : div(d) {} + LIBDIVIDE_INLINE divider(T d) : div(d) {} // Divides n by the divisor - T divide(T n) const { return div.divide(n); } + LIBDIVIDE_INLINE T divide(T n) const { return div.divide(n); } // Recovers the divisor, returns the value that was // used to initialize this divider object. @@ -2374,34 +3026,34 @@ class divider { // (e.g. s32, u32, s64, u64) and divides each of them by the divider, returning the packed // quotients. #if defined(LIBDIVIDE_SSE2) - __m128i divide(__m128i n) const { return div.divide(n); } + LIBDIVIDE_INLINE __m128i divide(__m128i n) const { return div.divide(n); } #endif #if defined(LIBDIVIDE_AVX2) - __m256i divide(__m256i n) const { return div.divide(n); } + LIBDIVIDE_INLINE __m256i divide(__m256i n) const { return div.divide(n); } #endif #if defined(LIBDIVIDE_AVX512) - __m512i divide(__m512i n) const { return div.divide(n); } + LIBDIVIDE_INLINE __m512i divide(__m512i n) const { return div.divide(n); } #endif #if defined(LIBDIVIDE_NEON) - typename NeonVecFor<T>::type divide(typename NeonVecFor<T>::type n) const { + LIBDIVIDE_INLINE typename NeonVecFor<T>::type divide(typename NeonVecFor<T>::type n) const { return div.divide(n); } #endif private: // Storage for the actual divisor - dispatcher<std::is_integral<T>::value, std::is_signed<T>::value, sizeof(T), ALGO> div; + dispatcher_t div; }; // Overload of operator / for scalar division template <typename T, Branching ALGO> -T operator/(T n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE T operator/(T n, const divider<T, ALGO> &div) { return div.divide(n); } // Overload of operator /= for scalar division template <typename T, Branching ALGO> -T &operator/=(T &n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE T &operator/=(T &n, const divider<T, ALGO> &div) { n = div.divide(n); return n; } @@ -2409,88 +3061,55 @@ T &operator/=(T &n, const divider<T, ALGO> &div) { // Overloads for vector types. 
#if defined(LIBDIVIDE_SSE2) template <typename T, Branching ALGO> -__m128i operator/(__m128i n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE __m128i operator/(__m128i n, const divider<T, ALGO> &div) { return div.divide(n); } template <typename T, Branching ALGO> -__m128i operator/=(__m128i &n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE __m128i operator/=(__m128i &n, const divider<T, ALGO> &div) { n = div.divide(n); return n; } #endif #if defined(LIBDIVIDE_AVX2) template <typename T, Branching ALGO> -__m256i operator/(__m256i n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE __m256i operator/(__m256i n, const divider<T, ALGO> &div) { return div.divide(n); } template <typename T, Branching ALGO> -__m256i operator/=(__m256i &n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE __m256i operator/=(__m256i &n, const divider<T, ALGO> &div) { n = div.divide(n); return n; } #endif #if defined(LIBDIVIDE_AVX512) template <typename T, Branching ALGO> -__m512i operator/(__m512i n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE __m512i operator/(__m512i n, const divider<T, ALGO> &div) { return div.divide(n); } template <typename T, Branching ALGO> -__m512i operator/=(__m512i &n, const divider<T, ALGO> &div) { +LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) { n = div.divide(n); return n; } #endif #if defined(LIBDIVIDE_NEON) -template <Branching ALGO> -uint32x4_t operator/(uint32x4_t n, const divider<uint32_t, ALGO> &div) { +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) { return div.divide(n); } -template <Branching ALGO> -int32x4_t operator/(int32x4_t n, const divider<int32_t, ALGO> &div) { - return div.divide(n); -} - -template <Branching ALGO> -uint64x2_t operator/(uint64x2_t n, const divider<uint64_t, ALGO> &div) { - return div.divide(n); -} - -template <Branching ALGO> -int64x2_t operator/(int64x2_t n, const divider<int64_t, ALGO> &div) { - return div.divide(n); -} - -template <Branching ALGO> -uint32x4_t operator/=(uint32x4_t &n, const divider<uint32_t, ALGO> &div) { - n = div.divide(n); - return n; -} - -template <Branching ALGO> -int32x4_t operator/=(int32x4_t &n, const divider<int32_t, ALGO> &div) { - n = div.divide(n); - return n; -} - -template <Branching ALGO> -uint64x2_t operator/=(uint64x2_t &n, const divider<uint64_t, ALGO> &div) { - n = div.divide(n); - return n; -} - -template <Branching ALGO> -int64x2_t operator/=(int64x2_t &n, const divider<int64_t, ALGO> &div) { +template <typename T, Branching ALGO> +LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) { n = div.divide(n); return n; } #endif -#if __cplusplus >= 201103L +#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900) // libdivide::branchfree_divider<T> template <typename T> using branchfree_divider = divider<T, BRANCHFREE>; @@ -2500,4 +3119,8 @@ using branchfree_divider = divider<T, BRANCHFREE>; #endif // __cplusplus +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + #endif // LIBDIVIDE_H
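
Below is a minimal usage sketch (not part of the patch) that exercises the C++ API the hunks above touch: divider<T>, the new 16-bit branchfree_divider<T>, the scalar operator/ overload, and the optional SSE2 vector overload. The divisor values, the main() scaffolding, and the choice to define LIBDIVIDE_SSE2 before including the header are illustrative assumptions, not something this diff prescribes.

// example_libdivide_usage.cpp -- illustrative only, not shipped with the patch.
#include <cstdint>
#include <cstdio>
// Assumption: defining LIBDIVIDE_SSE2 before the include enables the __m128i overloads.
// #define LIBDIVIDE_SSE2
#include "libdivide.h"

int main() {
    // Precompute the magic number and shift once, then reuse them for many divisions.
    libdivide::divider<uint32_t> fast_d(7);
    // 16-bit branchfree variant added by this change (a divisor of 1 is not supported branchfree).
    libdivide::branchfree_divider<uint16_t> bf_d(9);

    uint32_t q = uint32_t(1000) / fast_d;   // scalar operator/ overload
    uint16_t r = bf_d.divide(uint16_t(12345));  // explicit divide() call
    std::printf("%u %u %u\n", (unsigned)q, (unsigned)r, (unsigned)fast_d.recover());

#if defined(LIBDIVIDE_SSE2)
    // Packed 32-bit quotients via the vector operator/ overload.
    __m128i n = _mm_set_epi32(40, 30, 20, 10);
    __m128i packed_q = n / fast_d;
    (void)packed_q;
#endif
    return 0;
}

The branchfree dividers generate straight-line code with no power-of-two special case, which is why they pair naturally with the SIMD paths declared above; the trade-off is the unsupported divisor of 1 noted in the comment.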