diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index e77baba..8470947 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -9,30 +9,14 @@ on: jobs: build-ubuntu-gcc: runs-on: ubuntu-latest - strategy: - matrix: - version: [12, 13, 14] steps: - uses: actions/checkout@v4 - - name: Setting up gcc version - run: | - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${{ matrix.version }} 100 - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${{ matrix.version }} 100 - name: Build run: make test build-ubuntu-clang: runs-on: ubuntu-latest - strategy: - matrix: - version: [14, 15, 16, 17, 18] steps: - uses: actions/checkout@v4 - - name: Install dependencies - run: sudo apt-get update && sudo apt-get install -y --no-install-recommends clang-14 clang-15 - - name: Setting up clang version - run: | - sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{ matrix.version }} 100 - sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-${{ matrix.version }} 100 - name: Build run: CC=clang CXX=clang++ make test build-musl: diff --git a/Android.bp b/Android.bp index f6a7a9c..11725a6 100644 --- a/Android.bp +++ b/Android.bp @@ -5,6 +5,8 @@ common_cflags = [ "-fPIC", "-fvisibility=hidden", //"-fno-plt", + "-Wall", + "-Wextra", "-Wcast-align", "-Wcast-qual", "-Wwrite-strings", @@ -71,9 +73,6 @@ cc_library { debuggable: { cflags: ["-DLABEL_MEMORY"], }, - device_has_arm_mte: { - cflags: ["-DHAS_ARM_MTE", "-march=armv8-a+dotprod+memtag"] - }, }, apex_available: [ "com.android.runtime", diff --git a/CREDITS b/CREDITS index 31b6875..3ad8617 100644 --- a/CREDITS +++ b/CREDITS @@ -54,230 +54,3 @@ libdivide: random.c get_random_{type}_uniform functions are based on Fast Random Integer Generation in an Interval by Daniel Lemire - -arm_mte.h arm_mte_tag_and_clear_mem function contents were copied from storeTags function in scudo: - - ============================================================================== - The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: - ============================================================================== - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. 
- - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - - ---- LLVM Exceptions to the Apache 2.0 License ---- - - As an exception, if, as a result of your compiling your source code, portions - of this Software are embedded into an Object form of such source code, you - may redistribute such embedded portions in such Object form without complying - with the conditions of Sections 4(a), 4(b) and 4(d) of the License. - - In addition, if you combine or link compiled forms of this Software with - software that is licensed under the GPLv2 ("Combined Software") and if a - court of competent jurisdiction determines that the patent provision (Section - 3), the indemnity provision (Section 9) or other Section of the License - conflicts with the conditions of the GPLv2, you may retroactively and - prospectively choose to deem waived or otherwise exclude such Section(s) of - the License, but only in their entirety and only with respect to the Combined - Software. 
- - ============================================================================== diff --git a/LICENSE b/LICENSE index af4b965..5311a0f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright © 2018-2024 GrapheneOS +Copyright © 2018-2023 GrapheneOS Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 6a1a91b..b3f820f 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ there will be custom integration offering better performance in the future along with other hardening for the C standard library implementation. For Android, only the current generation, actively developed maintenance branch of the Android -Open Source Project will be supported, which currently means `android15-release`. +Open Source Project will be supported, which currently means `android13-qpr2-release`. ## Testing @@ -159,9 +159,6 @@ line to the `/etc/ld.so.preload` configuration file: The format of this configuration file is a whitespace-separated list, so it's good practice to put each library on a separate line. -On Debian systems `libhardened_malloc.so` should be installed into `/usr/lib/` -to avoid preload failures caused by AppArmor profile restrictions. - Using the `LD_PRELOAD` environment variable to load it on a case-by-case basis will not work when `AT_SECURE` is set such as with setuid binaries. It's also generally not a recommended approach for production usage. The recommendation @@ -473,16 +470,16 @@ was a bit less important and if a core goal was finding latent bugs. * Errors other than ENOMEM from mmap, munmap, mprotect and mremap treated as fatal, which can help to detect memory management gone wrong elsewhere in the process. -* Memory tagging for slab allocations via MTE on ARMv8.5+ +* [future] Memory tagging for slab allocations via MTE on ARMv8.5+ * random memory tags as the baseline, providing probabilistic protection against various forms of memory corruption * dedicated tag for free slots, set on free, for deterministic protection against accessing freed memory + * store previous random tag within freed slab allocations, and increment it + to get the next tag for that slot to provide deterministic use-after-free + detection through multiple cycles of memory reuse * guarantee distinct tags for adjacent memory allocations by incrementing past matching values for deterministic detection of linear overflows - * [future] store previous random tag and increment it to get the next tag - for that slot to provide deterministic use-after-free detection through - multiple cycles of memory reuse ## Randomness @@ -724,46 +721,77 @@ freeing as there would be if the kernel supported these features directly. ## Memory tagging -Random tags are set for all slab allocations when allocated, with 4 excluded values: +Integrating extensive support for ARMv8.5 memory tagging is planned and this +section will be expanded to cover the details on the chosen design. The approach +for slab allocations is currently covered, but it can also be used for the +allocator metadata region and large allocations. -1. the reserved `0` tag -2. the previous tag used for the slot -3. the current (or previous) tag used for the slot to the left -4. 
the current (or previous) tag used for the slot to the right

+Memory allocations are already always multiples of naturally aligned 16 byte
+units, so memory tags (which cover 16 byte granules) are a natural fit for a
+malloc implementation. The only extra memory consumption will come from the
+hardware-supported storage for the tag values (4 bits per 16 bytes).

-When a slab allocation is freed, the reserved `0` tag is set for the slot.
-Slab allocation slots are cleared before reuse when memory tagging is enabled.

+The baseline policy will be to generate random tags for each slab allocation
+slot on first use. The highest value will be reserved for marking freed memory
+allocations to detect any accesses to freed memory, so it won't be part of the
+generated range. Adjacent slots will be guaranteed to have distinct memory tags
+in order to guarantee that linear overflows are detected. There are a few ways
+of implementing this and it will end up depending on the performance costs of
+different approaches. If there's an efficient way to fetch the adjacent tag
+values without wasting extra memory, it will be possible to check for them and
+skip them, either by generating a new random value in a loop or by incrementing
+past them, since the tiny bit of bias wouldn't matter. Another approach would
+be alternating odd and even tag values, but that would substantially reduce the
+overall randomness of the tags, and there's very little entropy from the start.

-This ensures the following properties:

+Once a slab allocation has been freed, the tag will be set to the reserved
+value for free memory and the previous tag value will be stored inside the
+allocation itself. The next time the slot is allocated, the chosen tag value
+will be the previous value incremented by one to provide use-after-free
+detection between generations of allocations. The stored tag will be wiped
+before retagging the memory, to avoid leaking it and as part of preserving the
+security property of newly allocated memory being zeroed due to zero-on-free.
+It will eventually wrap all the way around, but this ends up providing a strong
+guarantee for many allocation cycles due to the combination of 4 bit tags with
+the FIFO quarantine feature providing delayed free. It also benefits from
+random slot allocation and the randomized portion of delayed free, which result
+in a further delay along with preventing a deterministic bypass by forcing a
+reuse after a certain number of allocation cycles. Similarly to the initial tag
+generation, tag values for adjacent allocations will be skipped by incrementing
+past them.

-- Linear overflows are deterministically detected.
-- Use-after-free is deterministically detected until the freed slot goes
-  through both the random and FIFO quarantines, gets allocated again, goes
-  through both quarantines again and then finally gets allocated a 2nd time.
-- Since the default `0` tag is reserved, untagged pointers can't access slab
-  allocations and vice versa.

+For example, consider this slab of allocations that are not yet used, with 15
+representing the tag for free memory. For the sake of simplicity, there will
+be no quarantine or other slabs in this example.

-Slab allocations are done in a statically reserved region for each size class
-and all metadata is in a statically reserved region, so interactions between
-different uses of the same address space are not applicable.
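+Before walking through it, here is a rough C sketch of the selection policy
+just described. This is only an illustration of the rules above, not the
+planned implementation: the names, the `rand()` stand-in for a real entropy
+source and the free-tag value 15 are assumptions made for the example.
+
+```c
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#define TAG_FREE 15 // reserved for freed memory in this design
+
+// Pick the next 4-bit tag for a slot: a previously used slot continues
+// from its stored tag + 1, a fresh slot starts from a random non-reserved
+// tag. Either way, increment past the reserved free tag and the current
+// tags of both neighboring slots.
+static uint8_t choose_tag(bool used_before, uint8_t prev,
+                          uint8_t left, uint8_t right) {
+    uint8_t tag = used_before ? (uint8_t) ((prev + 1) & 0xf)
+                              : (uint8_t) (rand() % TAG_FREE); // 0..14
+    while (tag == TAG_FREE || tag == left || tag == right) {
+        tag = (uint8_t) ((tag + 1) & 0xf);
+    }
+    return tag;
+}
+```
+
+The walkthrough below starts with every slot holding the free tag: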
+ | 15 | 15 | 15 | 15 | 15 | 15 |

-Large allocations beyond the largest slab allocation size class (128k by
-default) are guaranteed to have randomly sized guard regions to the left and
-right. Random and FIFO address space quarantines provide use-after-free
-detection. We need to test whether the cost of random tags is acceptable to
-enable them by default, since they would be useful for:
+Three slots are randomly chosen for allocations, with random tags assigned (2,
+7, 14) since these slots haven't ever been used and don't have saved values:

-- probabilistic detection of overflows
-- probabilistic detection of use-after-free once the address space is
-  out of the quarantine and reused for another allocation
-- deterministic detection of use-after-free for reuse by another allocator.
+ | 15 | 2 | 15 | 7 | 14 | 15 |

-When memory tagging is enabled, checking for write-after-free at allocation
-time and checking canaries are both disabled. Canaries will be more thoroughly
-disabled when using memory tagging in the future, but Android currently has
-[very dynamic memory tagging support](https://source.android.com/docs/security/test/memory-safety/arm-mte)
-where it can be disabled at any time, which creates a barrier to optimizing
-by disabling redundant features.
+The 2nd allocation slot is freed, and is set back to the tag for free memory
+(15), but with the previous tag value stored in the freed space:
+
+ | 15 | 15 | 15 | 7 | 14 | 15 |
+
+The first slot is allocated for the first time, receiving the random value 3:
+
+ | 3 | 15 | 15 | 7 | 14 | 15 |
+
+The 2nd slot is randomly chosen again, so the previous tag (2) is retrieved and
+incremented to 3 as part of the use-after-free mitigation. An adjacent
+allocation already uses the tag 3, so the tag is further incremented to 4 (it
+would be incremented to 5 if one of the adjacent tags was 4):
+
+ | 3 | 4 | 15 | 7 | 14 | 15 |
+
+The last slot is randomly chosen for the next allocation, and is assigned
+the random value 14. However, it's placed next to an allocation with the tag
+14, so the tag is incremented past it and past the reserved value 15, wrapping
+around to 0:
+
+ | 3 | 4 | 15 | 7 | 14 | 0 |

 ## API extensions
diff --git a/androidtest/Android.bp b/androidtest/Android.bp
deleted file mode 100644
index ae0aa49..0000000
--- a/androidtest/Android.bp
+++ /dev/null
@@ -1,25 +0,0 @@
-java_test_host {
-    name: "HMallocTest",
-    srcs: [
-        "src/**/*.java",
-    ],
-
-    libs: [
-        "tradefed",
-        "compatibility-tradefed",
-        "compatibility-host-util",
-    ],
-
-    static_libs: [
-        "cts-host-utils",
-        "frameworks-base-hostutils",
-    ],
-
-    test_suites: [
-        "general-tests",
-    ],
-
-    data_device_bins_64: [
-        "memtag_test",
-    ],
-}
diff --git a/androidtest/AndroidTest.xml b/androidtest/AndroidTest.xml
deleted file mode 100644
index 333f1dd..0000000
--- a/androidtest/AndroidTest.xml
+++ /dev/null
@@ -1,13 +0,0 @@
-
-
-
-
-
-
-
-
-
diff --git a/androidtest/memtag/Android.bp b/androidtest/memtag/Android.bp
deleted file mode 100644
index 75287f6..0000000
--- a/androidtest/memtag/Android.bp
+++ /dev/null
@@ -1,17 +0,0 @@
-cc_test {
-    name: "memtag_test",
-    srcs: ["memtag_test.cc"],
-    cflags: [
-        "-Wall",
-        "-Werror",
-        "-Wextra",
-        "-O0",
-        "-march=armv9-a+memtag",
-    ],
-
-    compile_multilib: "64",
-
-    sanitize: {
-        memtag_heap: true,
-    },
-}
diff --git a/androidtest/memtag/memtag_test.cc b/androidtest/memtag/memtag_test.cc
deleted file mode 100644
index f858292..0000000
--- a/androidtest/memtag/memtag_test.cc
+++ /dev/null
@@ -1,351 +0,0 @@
-// needed to unconditionally enable assertions
-#undef NDEBUG
-#include <assert.h>
-#include <limits.h>
-#include <malloc.h>
-#include <math.h>
-#include <signal.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/utsname.h>
-
-#include <algorithm>
-#include <map>
-#include <set>
-#include <string>
-#include <unordered_map>
-
-#include "../../arm_mte.h"
-
-using namespace std;
-
-using u8 = uint8_t;
-using uptr = uintptr_t;
-using u64 = uint64_t;
-
-const size_t DEFAULT_ALLOC_SIZE = 8;
-const size_t CANARY_SIZE = 8;
-
-void do_context_switch() {
-    utsname s;
-    uname(&s);
-}
-
-u8 get_pointer_tag(void *ptr) {
-    return (((uptr) ptr) >> 56) & 0xf;
-}
-
-void *untag_pointer(void *ptr) {
-    const uintptr_t mask = UINTPTR_MAX >> 8;
-    return (void *) ((uintptr_t) ptr & mask);
-}
-
-void *set_pointer_tag(void *ptr, u8 tag) {
-    return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr));
-}
-
-// This test checks that slab slot allocation uses a tag that is distinct from the tags of its
-// neighbors and from the tag of the previous allocation that used the same slot
-void tag_distinctness() {
-    // tag 0 is reserved
-    const int min_tag = 1;
-    const int max_tag = 0xf;
-
-    struct SizeClass {
-        int size;
-        int slot_cnt;
-    };
-
-    // values from size_classes[] and size_class_slots[] in h_malloc.c
-    SizeClass size_classes[] = {
-        { .size = 16, .slot_cnt = 256, },
-        { .size = 32, .slot_cnt = 128, },
-        // this size class is used by allocations that are made by the addr_tag_map, which breaks
-        // tag distinctness checks
-        // { .size = 48, .slot_cnt = 85, },
-        { .size = 64, .slot_cnt = 64, },
-        { .size = 80, .slot_cnt = 51, },
-        { .size = 96, .slot_cnt = 42, },
-        { .size = 112, .slot_cnt = 36, },
-        { .size = 128, .slot_cnt = 64, },
-        { .size = 160, .slot_cnt = 51, },
-        { .size = 192, .slot_cnt = 64, },
-        { .size = 224, .slot_cnt = 54, },
-        { .size = 10240, .slot_cnt = 6, },
-        { .size = 20480, .slot_cnt = 1, },
-    };
-
-    int tag_usage[max_tag + 1] = {};
-
-    for (size_t sc_idx = 0; sc_idx < sizeof(size_classes) / sizeof(SizeClass); ++sc_idx) {
-        SizeClass &sc = size_classes[sc_idx];
-
-        const size_t full_alloc_size = sc.size;
-        const size_t alloc_size = full_alloc_size - CANARY_SIZE;
-
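-        // the request is sized so that, once the allocator appends the 8-byte
-        // canary, the allocation fills its slot exactly; slots within a slab
-        // are therefore full_alloc_size bytes apart, which the neighbor
-        // lookups below rely on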
-        // "tdc" is short for "tag distinctness check"
-        int left_neighbor_tdc_cnt = 0;
-        int right_neighbor_tdc_cnt = 0;
-        int prev_alloc_tdc_cnt = 0;
-
-        int iter_cnt = 600;
-
-        unordered_map<uptr, u8> addr_tag_map;
-        addr_tag_map.reserve(iter_cnt * sc.slot_cnt);
-
-        u64 seen_tags = 0;
-
-        for (int iter = 0; iter < iter_cnt; ++iter) {
-            uptr allocations[256]; // 256 is max slot count
-
-            for (int i = 0; i < sc.slot_cnt; ++i) {
-                u8 *p = (u8 *) malloc(alloc_size);
-                assert(p);
-                uptr addr = (uptr) untag_pointer(p);
-                u8 tag = get_pointer_tag(p);
-
-                assert(tag >= min_tag && tag <= max_tag);
-                seen_tags |= 1 << tag;
-                ++tag_usage[tag];
-
-                // check most recent tags of left and right neighbors
-
-                auto left = addr_tag_map.find(addr - full_alloc_size);
-                if (left != addr_tag_map.end()) {
-                    assert(left->second != tag);
-                    ++left_neighbor_tdc_cnt;
-                }
-
-                auto right = addr_tag_map.find(addr + full_alloc_size);
-                if (right != addr_tag_map.end()) {
-                    assert(right->second != tag);
-                    ++right_neighbor_tdc_cnt;
-                }
-
-                // check previous tag of this slot
-                auto prev = addr_tag_map.find(addr);
-                if (prev != addr_tag_map.end()) {
-                    assert(prev->second != tag);
-                    ++prev_alloc_tdc_cnt;
-                    addr_tag_map.erase(addr);
-                }
-
-                addr_tag_map.emplace(addr, tag);
-
-                for (size_t j = 0; j < alloc_size; ++j) {
-                    // check that slot is zeroed
-                    assert(p[j] == 0);
-                    // check that slot is readable and writable
-                    p[j]++;
-                }
-
-                allocations[i] = addr;
-            }
-
-            // free some of the allocations to allow their slots to be reused
-            for (int i = sc.slot_cnt - 1; i >= 0; i -= 2) {
-                free((void *) allocations[i]);
-            }
-        }
-
-        // check that all of the tags were used, except for the reserved tag 0
-        assert(seen_tags == (0xffff & ~(1 << 0)));
-
-        printf("size_class\t%i\t" "tdc_left %i\t" "tdc_right %i\t" "tdc_prev_alloc %i\n",
-               sc.size, left_neighbor_tdc_cnt, right_neighbor_tdc_cnt, prev_alloc_tdc_cnt);
-
-        // make sure tag distinctness checks were actually performed
-        int min_tdc_cnt = sc.slot_cnt * iter_cnt / 5;
-
-        assert(prev_alloc_tdc_cnt > min_tdc_cnt);
-
-        if (sc.slot_cnt > 1) {
-            assert(left_neighbor_tdc_cnt > min_tdc_cnt);
-            assert(right_neighbor_tdc_cnt > min_tdc_cnt);
-        }
-
-        // async tag check failures are reported on context switch
-        do_context_switch();
-    }
-
-    printf("\nTag use counters:\n");
-
-    int min = INT_MAX;
-    int max = 0;
-    double geomean = 0.0;
-    for (int i = min_tag; i <= max_tag; ++i) {
-        int v = tag_usage[i];
-        geomean += log(v);
-        min = std::min(min, v);
-        max = std::max(max, v);
-        printf("%i\t%i\n", i, tag_usage[i]);
-    }
-    int tag_cnt = 1 + max_tag - min_tag;
-    geomean = exp(geomean / tag_cnt);
-
-    double max_deviation = std::max((double) max - geomean, geomean - min);
-
-    printf("geomean: %.2f, max deviation from geomean: %.2f%%\n", geomean, (100.0 * max_deviation) / geomean);
-}
-
-u8* alloc_default() {
-    const size_t full_alloc_size = DEFAULT_ALLOC_SIZE + CANARY_SIZE;
-    set<uptr> addrs;
-
-    // make sure allocation has both left and right neighbors, otherwise overflow/underflow tests
-    // will fail when allocation is at the end/beginning of slab
-    for (;;) {
-        u8 *p = (u8 *) malloc(DEFAULT_ALLOC_SIZE);
-        assert(p);
-
-        uptr addr = (uptr) untag_pointer(p);
-        uptr left = addr - full_alloc_size;
-        if (addrs.find(left) != addrs.end()) {
-            uptr right = addr + full_alloc_size;
-            if (addrs.find(right) != addrs.end()) {
-                return p;
-            }
-        }
-
-        addrs.emplace(addr);
-    }
-}
-
-int expected_segv_code;
-
-#define expect_segv(exp, segv_code) ({\
-    expected_segv_code = segv_code; \
-    volatile auto val = exp; \
-    (void) val; \
-    do_context_switch(); \
-    fprintf(stderr, "didn't receive SEGV code %i", segv_code); \
-    exit(1); })
-
-// it's expected that the device is configured to use asymm MTE tag checking mode (sync read checks,
-// async write checks)
-#define expect_read_segv(exp) expect_segv(exp, SEGV_MTESERR)
-#define expect_write_segv(exp) expect_segv(exp, SEGV_MTEAERR)
-
-void read_after_free() {
-    u8 *p = alloc_default();
-    free(p);
-    expect_read_segv(p[0]);
-}
-
-void write_after_free() {
-    u8 *p = alloc_default();
-    free(p);
-    expect_write_segv(p[0] = 1);
-}
-
-void underflow_read() {
-    u8 *p = alloc_default();
-    expect_read_segv(p[-1]);
-}
-
-void underflow_write() {
-    u8 *p = alloc_default();
-    expect_write_segv(p[-1] = 1);
-}
-
-void overflow_read() {
-    u8 *p = alloc_default();
-    expect_read_segv(p[DEFAULT_ALLOC_SIZE + CANARY_SIZE]);
-}
-
-void overflow_write() {
-    u8 *p = alloc_default();
-    expect_write_segv(p[DEFAULT_ALLOC_SIZE + CANARY_SIZE] = 1);
-}
-
-void untagged_read() {
-    u8 *p = alloc_default();
-    p = (u8 *) untag_pointer(p);
-    expect_read_segv(p[0]);
-}
-
-void untagged_write() {
-    u8 *p = alloc_default();
-    p = (u8 *) untag_pointer(p);
-    expect_write_segv(p[0] = 1);
-}
-
-// checks that each of the memory locations inside the buffer is tagged with expected_tag
-void check_tag(void *buf, size_t len, u8 expected_tag) {
-    for (size_t i = 0; i < len; ++i) {
-        assert(get_pointer_tag(__arm_mte_get_tag((void *) ((uintptr_t) buf + i))) == expected_tag);
-    }
-}
-
-void madvise_dontneed() {
-    const size_t len = 100'000;
-    void *ptr = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_MTE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-    assert(ptr != MAP_FAILED);
-
-    // check that 0 is the initial tag
-    check_tag(ptr, len, 0);
-
-    arm_mte_tag_and_clear_mem(set_pointer_tag(ptr, 1), len);
-    check_tag(ptr, len, 1);
-
-    memset(set_pointer_tag(ptr, 1), 1, len);
-
-    assert(madvise(ptr, len, MADV_DONTNEED) == 0);
-    // check that MADV_DONTNEED resets the tag
-    check_tag(ptr, len, 0);
-
-    // check that MADV_DONTNEED clears the memory
-    for (size_t i = 0; i < len; ++i) {
-        assert(((u8 *) ptr)[i] == 0);
-    }
-
-    // check that mistagged read after MADV_DONTNEED fails
-    expect_read_segv(*((u8 *) set_pointer_tag(ptr, 1)));
-}
-
-map<string, void (*)()> tests = {
-#define TEST(s) { #s, s }
-    TEST(tag_distinctness),
-    TEST(read_after_free),
-    TEST(write_after_free),
-    TEST(overflow_read),
-    TEST(overflow_write),
-    TEST(underflow_read),
-    TEST(underflow_write),
-    TEST(untagged_read),
-    TEST(untagged_write),
-    TEST(madvise_dontneed),
-#undef TEST
-};
-
-void segv_handler(int, siginfo_t *si, void *) {
-    if (expected_segv_code == 0 || expected_segv_code != si->si_code) {
-        fprintf(stderr, "received unexpected SEGV_CODE %i", si->si_code);
-        exit(139); // standard exit code for SIGSEGV
-    }
-
-    exit(0);
-}
-
-int main(int argc, char **argv) {
-    setbuf(stdout, NULL);
-    assert(argc == 2);
-
-    auto test_name = string(argv[1]);
-    auto test_fn = tests[test_name];
-    assert(test_fn != nullptr);
-
-    assert(mallopt(M_BIONIC_SET_HEAP_TAGGING_LEVEL, M_HEAP_TAGGING_LEVEL_ASYNC) == 1);
-
-    struct sigaction sa = {
-        .sa_sigaction = segv_handler,
-        .sa_flags = SA_SIGINFO,
-    };
-
-    assert(sigaction(SIGSEGV, &sa, nullptr) == 0);
-
-    test_fn();
-    do_context_switch();
-
-    return 0;
-}
diff --git a/androidtest/src/grapheneos/hmalloc/MemtagTest.java b/androidtest/src/grapheneos/hmalloc/MemtagTest.java
deleted file mode 100644
index be04bd9..0000000
--- a/androidtest/src/grapheneos/hmalloc/MemtagTest.java
+++ /dev/null
@@ -1,79 +0,0 @@
-package grapheneos.hmalloc;
-
-import com.android.tradefed.device.DeviceNotAvailableException;
-import com.android.tradefed.testtype.DeviceJUnit4ClassRunner;
-import com.android.tradefed.testtype.junit4.BaseHostJUnit4Test;
-
-import org.junit.Test;
-import org.junit.runner.RunWith;
-
-import java.util.ArrayList;
-
-import static org.junit.Assert.assertEquals;
-
-@RunWith(DeviceJUnit4ClassRunner.class)
-public class MemtagTest extends BaseHostJUnit4Test {
-    private static final String TEST_BINARY = "/data/local/tmp/memtag_test";
-
-    private void runTest(String name) throws DeviceNotAvailableException {
-        var args = new ArrayList<String>();
-        args.add(TEST_BINARY);
-        args.add(name);
-        String cmdLine = String.join(" ", args);
-
-        var result = getDevice().executeShellV2Command(cmdLine);
-
-        assertEquals("stderr", "", result.getStderr());
-        assertEquals("process exit code", 0, result.getExitCode().intValue());
-    }
-
-    @Test
-    public void tag_distinctness() throws DeviceNotAvailableException {
-        runTest("tag_distinctness");
-    }
-
-    @Test
-    public void read_after_free() throws DeviceNotAvailableException {
-        runTest("read_after_free");
-    }
-
-    @Test
-    public void write_after_free() throws DeviceNotAvailableException {
-        runTest("write_after_free");
-    }
-
-    @Test
-    public void underflow_read() throws DeviceNotAvailableException {
-        runTest("underflow_read");
-    }
-
-    @Test
-    public void underflow_write() throws DeviceNotAvailableException {
-        runTest("underflow_write");
-    }
-
-    @Test
-    public void overflow_read() throws DeviceNotAvailableException {
-        runTest("overflow_read");
-    }
-
-    @Test
-    public void overflow_write() throws DeviceNotAvailableException {
-        runTest("overflow_write");
-    }
-
-    @Test
-    public void untagged_read() throws DeviceNotAvailableException {
-        runTest("untagged_read");
-    }
-
-    @Test
-    public void untagged_write() throws DeviceNotAvailableException {
-        runTest("untagged_write");
-    }
-
-    @Test
-    public void madvise_dontneed() throws DeviceNotAvailableException {
-        runTest("madvise_dontneed");
-    }
-}
diff --git a/arm_mte.h b/arm_mte.h
deleted file mode 100644
index 5ed900d..0000000
--- a/arm_mte.h
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef ARM_MTE_H
-#define ARM_MTE_H
-
-#include <arm_acle.h>
-#include <stdint.h>
-
-// Returns a tagged pointer.
-// See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions/IRG--Insert-Random-Tag-
-static inline void *arm_mte_create_random_tag(void *p, uint64_t exclusion_mask) {
-    return __arm_mte_create_random_tag(p, exclusion_mask);
-}
-
-// Tag the memory region with the tag specified in the tag bits of tagged_ptr. The memory region
-// itself is zeroed.
-// tagged_ptr has to be aligned by 16, and len has to be a multiple of 16 (tag granule size).
-//
-// Arm's software optimization guide says:
-// "it is recommended to use STZGM (or DCZGVA) to set tag if data is not a concern." (STZGM and
-// DC GZVA are zeroing variants of tagging instructions).
-// -// Contents of this function were copied from scudo: -// https://android.googlesource.com/platform/external/scudo/+/refs/tags/android-14.0.0_r1/standalone/memtag.h#167 -// -// scudo is licensed under the Apache License v2.0 with LLVM Exceptions, which is compatible with -// the hardened_malloc's MIT license -static inline void arm_mte_tag_and_clear_mem(void *tagged_ptr, size_t len) { - uintptr_t Begin = (uintptr_t) tagged_ptr; - uintptr_t End = Begin + len; - uintptr_t LineSize, Next, Tmp; - __asm__ __volatile__( - ".arch_extension memtag \n\t" - - // Compute the cache line size in bytes (DCZID_EL0 stores it as the log2 - // of the number of 4-byte words) and bail out to the slow path if DCZID_EL0 - // indicates that the DC instructions are unavailable. - "DCZID .req %[Tmp] \n\t" - "mrs DCZID, dczid_el0 \n\t" - "tbnz DCZID, #4, 3f \n\t" - "and DCZID, DCZID, #15 \n\t" - "mov %[LineSize], #4 \n\t" - "lsl %[LineSize], %[LineSize], DCZID \n\t" - ".unreq DCZID \n\t" - - // Our main loop doesn't handle the case where we don't need to perform any - // DC GZVA operations. If the size of our tagged region is less than - // twice the cache line size, bail out to the slow path since it's not - // guaranteed that we'll be able to do a DC GZVA. - "Size .req %[Tmp] \n\t" - "sub Size, %[End], %[Cur] \n\t" - "cmp Size, %[LineSize], lsl #1 \n\t" - "b.lt 3f \n\t" - ".unreq Size \n\t" - - "LineMask .req %[Tmp] \n\t" - "sub LineMask, %[LineSize], #1 \n\t" - - // STZG until the start of the next cache line. - "orr %[Next], %[Cur], LineMask \n\t" - - "1:\n\t" - "stzg %[Cur], [%[Cur]], #16 \n\t" - "cmp %[Cur], %[Next] \n\t" - "b.lt 1b \n\t" - - // DC GZVA cache lines until we have no more full cache lines. - "bic %[Next], %[End], LineMask \n\t" - ".unreq LineMask \n\t" - - "2: \n\t" - "dc gzva, %[Cur] \n\t" - "add %[Cur], %[Cur], %[LineSize] \n\t" - "cmp %[Cur], %[Next] \n\t" - "b.lt 2b \n\t" - - // STZG until the end of the tagged region. This loop is also used to handle - // slow path cases. 
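-        // (regions smaller than twice the cache line size branch directly to
-        // this loop from the size check above)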
- - "3: \n\t" - "cmp %[Cur], %[End] \n\t" - "b.ge 4f \n\t" - "stzg %[Cur], [%[Cur]], #16 \n\t" - "b 3b \n\t" - - "4: \n\t" - - : [Cur] "+&r"(Begin), [LineSize] "=&r"(LineSize), [Next] "=&r"(Next), [Tmp] "=&r"(Tmp) - : [End] "r"(End) - : "memory" - ); -} -#endif diff --git a/calculate-waste b/calculate_waste.py similarity index 100% rename from calculate-waste rename to calculate_waste.py diff --git a/h_malloc.c b/h_malloc.c index 6221d0b..2dc0bde 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -14,7 +14,6 @@ #include "h_malloc.h" #include "memory.h" -#include "memtag.h" #include "mutex.h" #include "pages.h" #include "random.h" @@ -76,9 +75,6 @@ static union { struct region_metadata *regions[2]; #ifdef USE_PKEY int metadata_pkey; -#endif -#ifdef MEMTAG - bool is_memtag_disabled; #endif }; char padding[PAGE_SIZE]; @@ -88,30 +84,6 @@ static inline void *get_slab_region_end(void) { return atomic_load_explicit(&ro.slab_region_end, memory_order_acquire); } -#ifdef MEMTAG -static inline bool is_memtag_enabled(void) { - return !ro.is_memtag_disabled; -} -#endif - -static void *memory_map_tagged(size_t size) { -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - return memory_map_mte(size); - } -#endif - return memory_map(size); -} - -static bool memory_map_fixed_tagged(void *ptr, size_t size) { -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - return memory_map_fixed_mte(ptr, size); - } -#endif - return memory_map_fixed(ptr, size); -} - #define SLAB_METADATA_COUNT struct slab_metadata { @@ -127,18 +99,6 @@ struct slab_metadata { #if SLAB_QUARANTINE u64 quarantine_bitmap[4]; #endif -#ifdef HAS_ARM_MTE - // arm_mte_tags is used as a u4 array (MTE tags are 4-bit wide) - // - // Its size is calculated by the following formula: - // (MAX_SLAB_SLOT_COUNT + 2) / 2 - // MAX_SLAB_SLOT_COUNT is currently 256, 2 extra slots are needed for branchless handling of - // edge slots in tag_and_clear_slab_slot() - // - // It's intentionally placed at the end of struct to improve locality: for most size classes, - // slot count is far lower than MAX_SLAB_SLOT_COUNT. - u8 arm_mte_tags[129]; -#endif }; static const size_t min_align = 16; @@ -487,12 +447,6 @@ static void write_after_free_check(const char *p, size_t size) { return; } -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - return; - } -#endif - for (size_t i = 0; i < size; i += sizeof(u64)) { if (unlikely(*(const u64 *)(const void *)(p + i))) { fatal_error("detected write after free"); @@ -507,48 +461,19 @@ static void set_slab_canary_value(UNUSED struct slab_metadata *metadata, UNUSED 0x00ffffffffffffffUL; metadata->canary_value = get_random_u64(rng) & canary_mask; -#ifdef HAS_ARM_MTE - if (unlikely(metadata->canary_value == 0)) { - // 0 is reserved to support disabling MTE at runtime (this is required on Android). - // When MTE is enabled, writing and reading of canaries is disabled, i.e. canary remains zeroed. - // After MTE is disabled, canaries that are set to 0 are ignored, since they wouldn't match - // slab's metadata->canary_value. - // 0x100 was chosen arbitrarily, and can be encoded as an immediate value on ARM by the compiler. 
-        metadata->canary_value = 0x100;
-    }
-#endif
 #endif
 }
 
 static void set_canary(UNUSED const struct slab_metadata *metadata, UNUSED void *p, UNUSED size_t size) {
 #if SLAB_CANARY
-#ifdef HAS_ARM_MTE
-    if (likely51(is_memtag_enabled())) {
-        return;
-    }
-#endif
-
     memcpy((char *)p + size - canary_size, &metadata->canary_value, canary_size);
 #endif
 }
 
 static void check_canary(UNUSED const struct slab_metadata *metadata, UNUSED const void *p, UNUSED size_t size) {
 #if SLAB_CANARY
-#ifdef HAS_ARM_MTE
-    if (likely51(is_memtag_enabled())) {
-        return;
-    }
-#endif
-
     u64 canary_value;
     memcpy(&canary_value, (const char *)p + size - canary_size, canary_size);
-
-#ifdef HAS_ARM_MTE
-    if (unlikely(canary_value == 0)) {
-        return;
-    }
-#endif
-
     if (unlikely(canary_value != metadata->canary_value)) {
         fatal_error("canary corrupted");
     }
@@ -581,38 +506,6 @@ static inline void stats_slab_deallocate(UNUSED struct size_class *c, UNUSED siz
 #endif
 }
 
-#ifdef HAS_ARM_MTE
-static void *tag_and_clear_slab_slot(struct slab_metadata *metadata, void *slot_ptr, size_t slot_idx, size_t slot_size) {
-    // arm_mte_tags is an array of 4-bit unsigned integers stored as a u8 array (MTE tags are 4-bit wide)
-    //
-    // It stores the most recent tag for each slab slot, or 0 if the slot was never used.
-    // Slab indices in the arm_mte_tags array are shifted to the right by 1, and the size of this
-    // array is (MAX_SLAB_SLOT_COUNT + 2). This means that the first and last values of the
-    // arm_mte_tags array are always 0, which allows edge slots to be handled in a branchless way
-    // when the tag exclusion mask is constructed.
-    u8 *slot_tags = metadata->arm_mte_tags;
-
-    // tag exclusion mask
-    u64 tem = (1 << RESERVED_TAG);
-
-    // current or previous tag of left neighbor or 0 if there's no left neighbor or if it was never used
-    tem |= (1 << u4_arr_get(slot_tags, slot_idx));
-    // previous tag of this slot or 0 if it was never used
-    tem |= (1 << u4_arr_get(slot_tags, slot_idx + 1));
-    // current or previous tag of right neighbor or 0 if there's no right neighbor or if it was never used
-    tem |= (1 << u4_arr_get(slot_tags, slot_idx + 2));
-
-    void *tagged_ptr = arm_mte_create_random_tag(slot_ptr, tem);
-    // slot addresses and sizes are always aligned by 16
-    arm_mte_tag_and_clear_mem(tagged_ptr, slot_size);
-
-    // store new tag of this slot
-    u4_arr_set(slot_tags, slot_idx + 1, get_pointer_tag(tagged_ptr));
-
-    return tagged_ptr;
-}
-#endif
-
 static inline void *allocate_small(unsigned arena, size_t requested_size) {
     struct size_info info = get_size_info(requested_size);
     size_t size = likely(info.size) ?
info.size : 16; @@ -641,11 +534,6 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { if (requested_size) { write_after_free_check(p, size - canary_size); set_canary(metadata, p, size); -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - p = tag_and_clear_slab_slot(metadata, p, slot, size); - } -#endif } stats_small_allocate(c, size); @@ -678,11 +566,6 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { void *p = slot_pointer(size, slab, slot); if (requested_size) { set_canary(metadata, p, size); -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - p = tag_and_clear_slab_slot(metadata, p, slot, size); - } -#endif } stats_slab_allocate(c, slab_size); stats_small_allocate(c, size); @@ -705,11 +588,6 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { void *p = slot_pointer(size, slab, slot); if (requested_size) { set_canary(metadata, p, size); -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - p = tag_and_clear_slab_slot(metadata, p, slot, size); - } -#endif } stats_slab_allocate(c, slab_size); stats_small_allocate(c, size); @@ -734,11 +612,6 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { if (requested_size) { write_after_free_check(p, size - canary_size); set_canary(metadata, p, size); -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - p = tag_and_clear_slab_slot(metadata, p, slot, size); - } -#endif } stats_small_allocate(c, size); @@ -821,16 +694,7 @@ static inline void deallocate_small(void *p, const size_t *expected_size) { if (likely(!is_zero_size)) { check_canary(metadata, p, size); - bool skip_zero = false; -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - arm_mte_tag_and_clear_mem(set_pointer_tag(p, RESERVED_TAG), size); - // metadata->arm_mte_tags is intentionally not updated, see tag_and_clear_slab_slot() - skip_zero = true; - } -#endif - - if (ZERO_ON_FREE && !skip_zero) { + if (ZERO_ON_FREE) { memset(p, 0, size - canary_size); } } @@ -908,7 +772,7 @@ static inline void deallocate_small(void *p, const size_t *expected_size) { if (c->empty_slabs_total + slab_size > max_empty_slabs_total) { int saved_errno = errno; - if (!memory_map_fixed_tagged(slab, slab_size)) { + if (!memory_map_fixed(slab, slab_size)) { label_slab(slab, slab_size, class); stats_slab_deallocate(c, slab_size); enqueue_free_slab(c, metadata); @@ -1210,14 +1074,13 @@ static inline void enforce_init(void) { } } -static struct mutex init_lock = MUTEX_INITIALIZER; - COLD static void init_slow_path(void) { + static struct mutex lock = MUTEX_INITIALIZER; - mutex_lock(&init_lock); + mutex_lock(&lock); if (unlikely(is_init())) { - mutex_unlock(&init_lock); + mutex_unlock(&lock); return; } @@ -1260,7 +1123,8 @@ COLD static void init_slow_path(void) { if (unlikely(memory_protect_rw_metadata(ra->regions, ra->total * sizeof(struct region_metadata)))) { fatal_error("failed to unprotect memory for regions table"); } - ro.slab_region_start = memory_map_tagged(slab_region_size); + + ro.slab_region_start = memory_map(slab_region_size); if (unlikely(ro.slab_region_start == NULL)) { fatal_error("failed to allocate slab region"); } @@ -1300,7 +1164,7 @@ COLD static void init_slow_path(void) { } memory_set_name(&ro, sizeof(ro), "malloc read-only after init"); - mutex_unlock(&init_lock); + mutex_unlock(&lock); // may allocate, so wait until the allocator is initialized to avoid deadlocking if (unlikely(pthread_atfork(full_lock, full_unlock, post_fork_child))) { @@ -1504,11 
+1368,6 @@ EXPORT void *h_calloc(size_t nmemb, size_t size) { if (!ZERO_ON_FREE && likely(p != NULL) && total_size && total_size <= max_slab_size_class) { memset(p, 0, total_size - canary_size); } -#ifdef HAS_ARM_MTE - // use an assert instead of adding a conditional to memset() above (freed memory is always - // zeroed when MTE is enabled) - static_assert(ZERO_ON_FREE, "disabling ZERO_ON_FREE reduces performance when ARM MTE is enabled"); -#endif return p; } @@ -1526,14 +1385,11 @@ EXPORT void *h_realloc(void *old, size_t size) { } } - void *old_orig = old; - old = untag_pointer(old); - size_t old_size; if (old < get_slab_region_end() && old >= ro.slab_region_start) { old_size = slab_usable_size(old); if (size <= max_slab_size_class && get_size_info(size).size == old_size) { - return old_orig; + return old; } thread_unseal_metadata(); } else { @@ -1646,7 +1502,7 @@ EXPORT void *h_realloc(void *old, size_t size) { if (copy_size > 0 && copy_size <= max_slab_size_class) { copy_size -= canary_size; } - memcpy(new, old_orig, copy_size); + memcpy(new, old, copy_size); if (old_size <= max_slab_size_class) { deallocate_small(old, NULL); } else { @@ -1687,8 +1543,6 @@ EXPORT void h_free(void *p) { return; } - p = untag_pointer(p); - if (p < get_slab_region_end() && p >= ro.slab_region_start) { thread_unseal_metadata(); deallocate_small(p, NULL); @@ -1712,8 +1566,6 @@ EXPORT void h_free_sized(void *p, size_t expected_size) { return; } - p = untag_pointer(p); - expected_size = adjust_size_for_canary(expected_size); if (p < get_slab_region_end() && p >= ro.slab_region_start) { @@ -1767,13 +1619,11 @@ static inline void memory_corruption_check_small(const void *p) { mutex_unlock(&c->lock); } -EXPORT size_t h_malloc_usable_size(H_MALLOC_USABLE_SIZE_CONST void *arg) { - if (arg == NULL) { +EXPORT size_t h_malloc_usable_size(H_MALLOC_USABLE_SIZE_CONST void *p) { + if (p == NULL) { return 0; } - const void *p = untag_const_pointer(arg); - if (p < get_slab_region_end() && p >= ro.slab_region_start) { thread_unseal_metadata(); memory_corruption_check_small(p); @@ -1905,7 +1755,7 @@ EXPORT int h_malloc_trim(UNUSED size_t pad) { struct slab_metadata *iterator = c->empty_slabs; while (iterator) { void *slab = get_slab(c, slab_size, iterator); - if (memory_map_fixed_tagged(slab, slab_size)) { + if (memory_map_fixed(slab, slab_size)) { break; } label_slab(slab, slab_size, class); @@ -2175,26 +2025,3 @@ COLD EXPORT int h_malloc_set_state(UNUSED void *state) { return -2; } #endif - -#ifdef __ANDROID__ -COLD EXPORT void h_malloc_disable_memory_tagging(void) { -#ifdef HAS_ARM_MTE - mutex_lock(&init_lock); - if (!ro.is_memtag_disabled) { - if (is_init()) { - if (unlikely(memory_protect_rw(&ro, sizeof(ro)))) { - fatal_error("failed to unprotect allocator data"); - } - ro.is_memtag_disabled = true; - if (unlikely(memory_protect_ro(&ro, sizeof(ro)))) { - fatal_error("failed to protect allocator data"); - } - } else { - // bionic calls this function very early in some cases - ro.is_memtag_disabled = true; - } - } - mutex_unlock(&init_lock); -#endif -} -#endif diff --git a/include/h_malloc.h b/include/h_malloc.h index 0eee395..5824abb 100644 --- a/include/h_malloc.h +++ b/include/h_malloc.h @@ -99,7 +99,6 @@ int h_malloc_iterate(uintptr_t base, size_t size, void (*callback)(uintptr_t ptr void *arg); void h_malloc_disable(void); void h_malloc_enable(void); -void h_malloc_disable_memory_tagging(void); #endif // hardened_malloc extensions diff --git a/memory.c b/memory.c index 2e54f6d..04afc23 100644 --- a/memory.c +++ 
b/memory.c @@ -17,8 +17,8 @@ #include "memory.h" #include "util.h" -static void *memory_map_prot(size_t size, int prot) { - void *p = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); +void *memory_map(size_t size) { + void *p = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); if (unlikely(p == MAP_FAILED)) { if (errno != ENOMEM) { fatal_error("non-ENOMEM mmap failure"); @@ -28,19 +28,8 @@ static void *memory_map_prot(size_t size, int prot) { return p; } -void *memory_map(size_t size) { - return memory_map_prot(size, PROT_NONE); -} - -#ifdef HAS_ARM_MTE -// Note that PROT_MTE can't be cleared via mprotect -void *memory_map_mte(size_t size) { - return memory_map_prot(size, PROT_MTE); -} -#endif - -static bool memory_map_fixed_prot(void *ptr, size_t size, int prot) { - void *p = mmap(ptr, size, prot, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); +bool memory_map_fixed(void *ptr, size_t size) { + void *p = mmap(ptr, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); bool ret = p == MAP_FAILED; if (unlikely(ret) && errno != ENOMEM) { fatal_error("non-ENOMEM MAP_FIXED mmap failure"); @@ -48,17 +37,6 @@ static bool memory_map_fixed_prot(void *ptr, size_t size, int prot) { return ret; } -bool memory_map_fixed(void *ptr, size_t size) { - return memory_map_fixed_prot(ptr, size, PROT_NONE); -} - -#ifdef HAS_ARM_MTE -// Note that PROT_MTE can't be cleared via mprotect -bool memory_map_fixed_mte(void *ptr, size_t size) { - return memory_map_fixed_prot(ptr, size, PROT_MTE); -} -#endif - bool memory_unmap(void *ptr, size_t size) { bool ret = munmap(ptr, size); if (unlikely(ret) && errno != ENOMEM) { diff --git a/memory.h b/memory.h index d5e336b..c04bfd9 100644 --- a/memory.h +++ b/memory.h @@ -11,13 +11,7 @@ int get_metadata_key(void); void *memory_map(size_t size); -#ifdef HAS_ARM_MTE -void *memory_map_mte(size_t size); -#endif bool memory_map_fixed(void *ptr, size_t size); -#ifdef HAS_ARM_MTE -bool memory_map_fixed_mte(void *ptr, size_t size); -#endif bool memory_unmap(void *ptr, size_t size); bool memory_protect_ro(void *ptr, size_t size); bool memory_protect_rw(void *ptr, size_t size); diff --git a/memtag.h b/memtag.h deleted file mode 100644 index e431283..0000000 --- a/memtag.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef MEMTAG_H -#define MEMTAG_H - -#include "util.h" - -#ifdef HAS_ARM_MTE -#include "arm_mte.h" -#define MEMTAG 1 -// Note that bionic libc always reserves tag 0 via PR_MTE_TAG_MASK prctl -#define RESERVED_TAG 0 -#define TAG_WIDTH 4 -#endif - -static inline void *untag_pointer(void *ptr) { -#ifdef HAS_ARM_MTE - const uintptr_t mask = UINTPTR_MAX >> 8; - return (void *) ((uintptr_t) ptr & mask); -#else - return ptr; -#endif -} - -static inline const void *untag_const_pointer(const void *ptr) { -#ifdef HAS_ARM_MTE - const uintptr_t mask = UINTPTR_MAX >> 8; - return (const void *) ((uintptr_t) ptr & mask); -#else - return ptr; -#endif -} - -static inline void *set_pointer_tag(void *ptr, u8 tag) { -#ifdef HAS_ARM_MTE - return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr)); -#else - (void) tag; - return ptr; -#endif -} - -static inline u8 get_pointer_tag(void *ptr) { -#ifdef HAS_ARM_MTE - return (((uintptr_t) ptr) >> 56) & 0xf; -#else - (void) ptr; - return 0; -#endif -} - -#endif diff --git a/test/malloc_info.c b/test/malloc_info.c index 3b99ead..50b256f 100644 --- a/test/malloc_info.c +++ b/test/malloc_info.c @@ -1,6 +1,5 @@ #include #include -#include #if defined(__GLIBC__) || defined(__ANDROID__) #include diff --git 
a/third_party/libdivide.h b/third_party/libdivide.h
index bddc763..e9a31d1 100644
--- a/third_party/libdivide.h
+++ b/third_party/libdivide.h
@@ -1,8 +1,8 @@
 // libdivide.h - Optimized integer division
 // https://libdivide.com
 //
-// Copyright (C) 2010 - 2022 ridiculous_fish,
-// Copyright (C) 2016 - 2022 Kim Walisch,
+// Copyright (C) 2010 - 2021 ridiculous_fish,
+// Copyright (C) 2016 - 2021 Kim Walisch,
 //
 // libdivide is dual-licensed under the Boost or zlib licenses.
 // You may use libdivide under the terms of either of these.
@@ -11,14 +11,11 @@
 #ifndef LIBDIVIDE_H
 #define LIBDIVIDE_H
 
-// *** Version numbers are auto generated - do not edit ***
-#define LIBDIVIDE_VERSION "5.2.0"
+#define LIBDIVIDE_VERSION "5.0"
 #define LIBDIVIDE_VERSION_MAJOR 5
-#define LIBDIVIDE_VERSION_MINOR 2
-#define LIBDIVIDE_VERSION_PATCH 0
+#define LIBDIVIDE_VERSION_MINOR 0
 
 #include <stdint.h>
-
 #if !defined(__AVR__)
 #include <stdio.h>
 #include <stdlib.h>
 #endif
@@ -27,29 +24,20 @@
 #if defined(LIBDIVIDE_SSE2)
 #include <emmintrin.h>
 #endif
-
 #if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)
 #include <immintrin.h>
 #endif
-
 #if defined(LIBDIVIDE_NEON)
 #include <arm_neon.h>
 #endif
 
-// Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics
-#if defined(_MSC_VER) && defined(LIBDIVIDE_X86_64) && (!defined(__clang__) || _MSC_VER>1930)
-#define LIBDIVIDE_X64_INTRINSICS
-#endif
-
 #if defined(_MSC_VER)
-#if defined(LIBDIVIDE_X64_INTRINSICS)
 #include <intrin.h>
-#endif
 #pragma warning(push)
 // disable warning C4146: unary minus operator applied
 // to unsigned type, result still unsigned
 #pragma warning(disable : 4146)
-// disable warning C4204: nonstandard extension used : non-constant aggregate
+// disable warning C4204: nonstandard extension used : non-constant aggregate 
 // initializer
 //
 // It's valid C99
@@ -250,32 +238,24 @@ static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfr
 static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
     int16_t numer, int16_t magic, uint8_t more);
 static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
-    int16_t numer, const struct libdivide_s16_t *denom);
+    int16_t numer, const struct libdivide_s16_t* denom);
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
-    uint16_t numer, uint16_t magic, uint8_t more);
+    uint16_t numer, uint16_t magic, uint8_t more); 
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
-    uint16_t numer, const struct libdivide_u16_t *denom);
-static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(
-    int32_t numer, int32_t magic, uint8_t more);
+    uint16_t numer, const struct libdivide_u16_t* denom);
 static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
     int32_t numer, const struct libdivide_s32_t *denom);
-static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(
-    uint32_t numer, uint32_t magic, uint8_t more);
 static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(
     uint32_t numer, const struct libdivide_u32_t *denom);
-static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(
-    int64_t numer, int64_t magic, uint8_t more);
 static LIBDIVIDE_INLINE int64_t libdivide_s64_do(
     int64_t numer, const struct libdivide_s64_t *denom);
-static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(
-    uint64_t numer, uint64_t magic, uint8_t more);
 static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(
     uint64_t numer, const struct libdivide_u64_t *denom);
 static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
-    int16_t numer, const struct libdivide_s16_branchfree_t *denom);
+    int16_t numer, const struct libdivide_s16_branchfree_t* denom);
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
-    uint16_t numer, const struct
diff --git a/test/malloc_info.c b/test/malloc_info.c
index 3b99ead..50b256f 100644
--- a/test/malloc_info.c
+++ b/test/malloc_info.c
@@ -1,6 +1,5 @@
 #include <pthread.h>
 #include <stdio.h>
-#include <stdlib.h>
 
 #if defined(__GLIBC__) || defined(__ANDROID__)
 #include <malloc.h>
diff --git a/third_party/libdivide.h b/third_party/libdivide.h
index bddc763..e9a31d1 100644
--- a/third_party/libdivide.h
+++ b/third_party/libdivide.h
@@ -1,8 +1,8 @@
 // libdivide.h - Optimized integer division
 // https://libdivide.com
 //
-// Copyright (C) 2010 - 2022 ridiculous_fish, <libdivide@ridiculousfish.com>
-// Copyright (C) 2016 - 2022 Kim Walisch, <kim.walisch@gmail.com>
+// Copyright (C) 2010 - 2021 ridiculous_fish, <libdivide@ridiculousfish.com>
+// Copyright (C) 2016 - 2021 Kim Walisch, <kim.walisch@gmail.com>
 //
 // libdivide is dual-licensed under the Boost or zlib licenses.
 // You may use libdivide under the terms of either of these.
@@ -11,14 +11,11 @@
 #ifndef LIBDIVIDE_H
 #define LIBDIVIDE_H
 
-// *** Version numbers are auto generated - do not edit ***
-#define LIBDIVIDE_VERSION "5.2.0"
+#define LIBDIVIDE_VERSION "5.0"
 #define LIBDIVIDE_VERSION_MAJOR 5
-#define LIBDIVIDE_VERSION_MINOR 2
-#define LIBDIVIDE_VERSION_PATCH 0
+#define LIBDIVIDE_VERSION_MINOR 0
 
 #include <stdint.h>
-
 #if !defined(__AVR__)
 #include <stdio.h>
 #include <stdlib.h>
@@ -27,29 +24,20 @@
 #if defined(LIBDIVIDE_SSE2)
 #include <emmintrin.h>
 #endif
-
 #if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)
 #include <immintrin.h>
 #endif
-
 #if defined(LIBDIVIDE_NEON)
 #include <arm_neon.h>
 #endif
 
-// Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics
-#if defined(_MSC_VER) && defined(LIBDIVIDE_X86_64) && (!defined(__clang__) || _MSC_VER>1930)
-#define LIBDIVIDE_X64_INTRINSICS
-#endif
-
 #if defined(_MSC_VER)
-#if defined(LIBDIVIDE_X64_INTRINSICS)
 #include <intrin.h>
-#endif
 #pragma warning(push)
 // disable warning C4146: unary minus operator applied
 // to unsigned type, result still unsigned
 #pragma warning(disable : 4146)
-// disable warning C4204: nonstandard extension used : non-constant aggregate
+// disable warning C4204: nonstandard extension used : non-constant aggregate 
 // initializer
 //
 // It's valid C99
@@ -250,32 +238,24 @@ static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfr
 static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
     int16_t numer, int16_t magic, uint8_t more);
 static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
-    int16_t numer, const struct libdivide_s16_t *denom);
+    int16_t numer, const struct libdivide_s16_t* denom);
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
-    uint16_t numer, uint16_t magic, uint8_t more);
+    uint16_t numer, uint16_t magic, uint8_t more); 
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
-    uint16_t numer, const struct libdivide_u16_t *denom);
-static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(
-    int32_t numer, int32_t magic, uint8_t more);
+    uint16_t numer, const struct libdivide_u16_t* denom);
 static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
     int32_t numer, const struct libdivide_s32_t *denom);
-static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(
-    uint32_t numer, uint32_t magic, uint8_t more);
 static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(
     uint32_t numer, const struct libdivide_u32_t *denom);
-static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(
-    int64_t numer, int64_t magic, uint8_t more);
 static LIBDIVIDE_INLINE int64_t libdivide_s64_do(
     int64_t numer, const struct libdivide_s64_t *denom);
-static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(
-    uint64_t numer, uint64_t magic, uint8_t more);
 static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(
     uint64_t numer, const struct libdivide_u64_t *denom);
 
 static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
-    int16_t numer, const struct libdivide_s16_branchfree_t *denom);
+    int16_t numer, const struct libdivide_s16_branchfree_t* denom);
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
-    uint16_t numer, const struct libdivide_u16_branchfree_t *denom);
+    uint16_t numer, const struct libdivide_u16_branchfree_t* denom);
 static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(
     int32_t numer, const struct libdivide_s32_branchfree_t *denom);
 static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
@@ -285,17 +265,17 @@ static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(
 static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
     uint64_t numer, const struct libdivide_u64_branchfree_t *denom);
 
-static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom);
-static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom);
+static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t* denom);
+static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t* denom);
 static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
 static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
 static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
 static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);
 
 static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(
-    const struct libdivide_s16_branchfree_t *denom);
+    const struct libdivide_s16_branchfree_t* denom);
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(
-    const struct libdivide_u16_branchfree_t *denom);
+    const struct libdivide_u16_branchfree_t* denom);
 static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(
     const struct libdivide_s32_branchfree_t *denom);
 static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(
@@ -334,7 +314,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) {
 }
 
 static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
-#if defined(LIBDIVIDE_X64_INTRINSICS)
+#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64)
     return __umulh(x, y);
 #elif defined(HAS_INT128_T)
     __uint128_t xl = x, yl = y;
@@ -360,7 +340,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
 }
 
 static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {
-#if defined(LIBDIVIDE_X64_INTRINSICS)
+#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64)
     return __mulh(x, y);
 #elif defined(HAS_INT128_T)
     __int128_t xl = x, yl = y;
@@ -413,7 +393,7 @@ static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) {
 
 static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) {
 #if defined(__AVR__)
-    // Fast way to count leading zeros
+    // Fast way to count leading zeros 
     return __builtin_clzl(val);
 #elif defined(__GNUC__) || __has_builtin(__builtin_clz)
     // Fast way to count leading zeros
@@ -462,7 +442,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) {
 // uint {v}. The result must fit in 16 bits.
 // Returns the quotient directly and the remainder in *r
 static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16(
-    uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) {
+    uint16_t u1, uint16_t u0, uint16_t v, uint16_t* r) {
     uint32_t n = ((uint32_t)u1 << 16) | u0;
     uint16_t result = (uint16_t)(n / v);
     *r = (uint16_t)(n - result * (uint32_t)v);
@@ -532,7 +512,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
 
     // Check for overflow and divide by 0.
     if (numhi >= den) {
-        if (r) *r = ~0ull;
+        if (r != NULL) *r = ~0ull;
         return ~0ull;
     }
 
@@ -578,14 +558,11 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
     q0 = (uint32_t)qhat;
 
     // Return remainder if requested.
-    if (r) *r = (rem * b + num0 - q0 * den) >> shift;
+    if (r != NULL) *r = (rem * b + num0 - q0 * den) >> shift;
     return ((uint64_t)q1 << 32) | q0;
 #endif
 }
 
-#if !(defined(HAS_INT128_T) && \
-    defined(HAS_INT128_DIV))
-
 // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)
 static LIBDIVIDE_INLINE void libdivide_u128_shift(
     uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
@@ -602,8 +579,6 @@ static LIBDIVIDE_INLINE void libdivide_u128_shift(
     }
 }
 
-#endif
-
 // Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
 static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64(
     uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
@@ -721,7 +696,8 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
         // 1 in its recovery algorithm.
         result.magic = 0;
         result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
-    } else {
+    }
+    else {
         uint8_t more;
         uint16_t rem, proposed_m;
         proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem);
@@ -733,7 +709,8 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
         if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) {
             // This power works
             more = floor_log_2_d;
-        } else {
+        }
+        else {
            // We have to use the general 17-bit algorithm.  We need to compute
            // (2**power) / d. However, we already have (2**(power-1))/d and
            // its remainder. By doubling both, and then correcting the
@@ -765,7 +742,7 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
     }
     struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1);
     struct libdivide_u16_branchfree_t ret = {
-        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)};
+        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK) };
     return ret;
 }
 
@@ -775,25 +752,27 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
 uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
     if (!magic) {
         return numer >> more;
-    } else {
+    }
+    else {
         uint16_t q = libdivide_mullhi_u16(magic, numer);
         if (more & LIBDIVIDE_ADD_MARKER) {
             uint16_t t = ((numer - q) >> 1) + q;
             return t >> (more & LIBDIVIDE_16_SHIFT_MASK);
-        } else {
+        }
+        else {
             // All upper bits are 0,
             // don't need to mask them off.
             return q >> more;
         }
-    }
+    } 
 }
 
-uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) {
+uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) {
     return libdivide_u16_do_raw(numer, denom->magic, denom->more);
 }
 
 uint16_t libdivide_u16_branchfree_do(
-    uint16_t numer, const struct libdivide_u16_branchfree_t *denom) {
+    uint16_t numer, const struct libdivide_u16_branchfree_t* denom) {
     uint16_t q = libdivide_mullhi_u16(denom->magic, numer);
     uint16_t t = ((numer - q) >> 1) + q;
     return t >> denom->more;
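
Note: the scalar routines above implement division by a constant as a multiply by a precomputed magic number plus shifts; when ADD_MARKER is set, t = ((numer - q) >> 1) + q averages numer and q without overflowing 16 bits before the final shift. A worked sketch for d = 7 — the magic 0x2493 and shift 2 are hand-derived from the doubling step in libdivide_internal_u16_gen above, so treat them as illustrative rather than authoritative:

#include <stdint.h>
#include <stdio.h>

static uint16_t mulhi_u16(uint16_t x, uint16_t y) {
    return (uint16_t)(((uint32_t)x * y) >> 16);
}

int main(void) {
    // For d = 7: floor(log2(7)) = 2, and the general 17-bit path yields a
    // 17-bit magic whose low 16 bits are 0x2493, shift 2, ADD_MARKER set.
    const uint16_t magic = 0x2493, shift = 2;
    for (uint16_t n = 0; n < 1000; n++) {
        uint16_t q = mulhi_u16(magic, n);
        uint16_t t = (uint16_t)(((n - q) >> 1) + q);  // ADD_MARKER correction
        if ((uint16_t)(t >> shift) != n / 7) {
            printf("mismatch at %u\n", n);
            return 1;
        }
    }
    printf("u16 magic path matches n/7\n");
    return 0;
}
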
@@ -821,7 +800,7 @@ uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) {
         // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
         // then double the quotient and remainder.
         uint32_t half_n = (uint32_t)1 << (16 + shift);
-        uint32_t d = ((uint32_t)1 << 16) | denom->magic;
+        uint32_t d = ( (uint32_t)1 << 16) | denom->magic;
         // Note that the quotient is guaranteed <= 16 bits, but the remainder
         // may need 17!
         uint16_t half_q = (uint16_t)(half_n / d);
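
Note: recover reverses gen — it reconstructs the divisor from magic and shift, doubling the quotient and remainder of 2^(16+shift)/(magic + 2^16) as the comment above describes. A small round-trip through the public scalar API touched by these hunks (a sketch, not part of the patch; the include path assumes this repository's layout):

#include <stdint.h>
#include <stdio.h>

#include "third_party/libdivide.h"

int main(void) {
    uint32_t d = 7;
    struct libdivide_u32_t denom = libdivide_u32_gen(d);
    uint32_t q = libdivide_u32_do(1000000u, &denom);  // 1000000 / 7 = 142857
    printf("q=%u recovered=%u\n", q, libdivide_u32_recover(&denom));
    return 0;
}
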
 uint16_t half_q = (uint16_t)(half_n / d);
@@ -933,11 +912,12 @@ struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
     return ret;
 }
 
-uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) {
-    if (!magic) {
+uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
+    uint8_t more = denom->more;
+    if (!denom->magic) {
         return numer >> more;
     } else {
-        uint32_t q = libdivide_mullhi_u32(magic, numer);
+        uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
         if (more & LIBDIVIDE_ADD_MARKER) {
             uint32_t t = ((numer - q) >> 1) + q;
             return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
@@ -949,10 +929,6 @@ uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) {
     }
 }
 
-uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
-    return libdivide_u32_do_raw(numer, denom->magic, denom->more);
-}
-
 uint32_t libdivide_u32_branchfree_do(
     uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
     uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
@@ -1096,11 +1072,12 @@ struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) {
     return ret;
 }
 
-uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) {
-    if (!magic) {
+uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
+    uint8_t more = denom->more;
+    if (!denom->magic) {
         return numer >> more;
     } else {
-        uint64_t q = libdivide_mullhi_u64(magic, numer);
+        uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
         if (more & LIBDIVIDE_ADD_MARKER) {
             uint64_t t = ((numer - q) >> 1) + q;
             return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
@@ -1112,10 +1089,6 @@ uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) {
     }
 }
 
-uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
-    return libdivide_u64_do_raw(numer, denom->magic, denom->more);
-}
-
 uint64_t libdivide_u64_branchfree_do(
     uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {
     uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
@@ -1455,10 +1428,11 @@ struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
     return result;
 }
 
-int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
+int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
+    uint8_t more = denom->more;
     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
 
-    if (!magic) {
+    if (!denom->magic) {
         uint32_t sign = (int8_t)more >> 7;
         uint32_t mask = ((uint32_t)1 << shift) - 1;
         uint32_t uq = numer + ((numer >> 31) & mask);
@@ -1467,7 +1441,7 @@ int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
         q = (q ^ sign) - sign;
         return q;
     } else {
-        uint32_t uq = (uint32_t)libdivide_mullhi_s32(magic, numer);
+        uint32_t uq = (uint32_t)libdivide_mullhi_s32(denom->magic, numer);
         if (more & LIBDIVIDE_ADD_MARKER) {
             // must be arithmetic shift and then sign extend
             int32_t sign = (int8_t)more >> 7;
@@ -1482,10 +1456,6 @@ int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
     }
 }
 
-int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
-    return libdivide_s32_do_raw(numer, denom->magic, denom->more);
-}
-
 int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
     uint8_t more = denom->more;
     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
@@ -1627,10 +1597,11 @@ struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
     return ret;
 }
 
-int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
+int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
+    uint8_t more = denom->more;
     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
 
-    if (!magic) { // shift path
+    if (!denom->magic) { // shift path
         uint64_t mask = ((uint64_t)1 << shift) - 1;
         uint64_t uq = numer + ((numer >> 63) & mask);
         int64_t q = (int64_t)uq;
@@ -1640,7 +1611,7 @@ int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
         q = (q ^ sign) - sign;
         return q;
     } else {
-        uint64_t uq = (uint64_t)libdivide_mullhi_s64(magic, numer);
+        uint64_t uq = (uint64_t)libdivide_mullhi_s64(denom->magic, numer);
         if (more & LIBDIVIDE_ADD_MARKER) {
             // must be arithmetic shift and then sign extend
             int64_t sign = (int8_t)more >> 7;
@@ -1655,10 +1626,6 @@ int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
     }
 }
 
-int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
-    return libdivide_s64_do_raw(numer, denom->magic, denom->more);
-}
-
 int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
     uint8_t more = denom->more;
     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
@@ -1715,22 +1682,15 @@ int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t
 
 // Simplest possible vector type division: treat the vector type as an array
 // of underlying native type.
-//
-// Use a union to read a vector via pointer-to-integer, without violating strict
-// aliasing.
-#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \
-    const size_t count = sizeof(VecT) / sizeof(IntT); \
-    union type_pun_vec { \
-        VecT vec; \
-        IntT arr[sizeof(VecT) / sizeof(IntT)]; \
-    }; \
-    union type_pun_vec result; \
-    union type_pun_vec input; \
-    input.vec = numers; \
-    for (size_t loop = 0; loop < count; ++loop) { \
-        result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \
-    } \
-    return result.vec;
+#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \
+    const size_t count = sizeof(VecT) / sizeof(IntT); \
+    VecT result; \
+    IntT *pSource = (IntT *)&numers; \
+    IntT *pTarget = (IntT *)&result; \
+    for (size_t loop=0; loop<count; ++loop) { \
+        pTarget[loop] = libdivide_##Algo##_do(pSource[loop], denom); \
+    } \
+    return result;
 
 #if defined(LIBDIVIDE_NEON)
 
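
Note: the hunk above swaps the 5.2 union-based lane loop for the 5.0 pointer-cast version. Reading vector lanes through IntT * casts violates strict aliasing, which is why upstream later moved to the union. A standalone sketch of that union trick, with a plain struct standing in for the intrinsic vector type (hypothetical vec128 type; scalar stand-in, no intrinsics):

#include <stdint.h>
#include <stdio.h>

typedef struct { uint16_t lane[8]; } vec128;  // stand-in for __m128i

int main(void) {
    union type_pun_vec {
        vec128 vec;
        uint16_t arr[8];
    } input, result;
    input.vec = (vec128){{700, 71, 7, 14, 21, 28, 35, 42}};
    for (size_t i = 0; i < 8; i++) {
        result.arr[i] = input.arr[i] / 7;  // per-lane scalar fallback
    }
    printf("lane0=%u lane1=%u\n", result.vec.lane[0], result.vec.lane[1]);
    return 0;
}
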
@@ -2405,25 +2369,11 @@ static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y
 ////////// UINT16
 
 __m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) {
-    uint8_t more = denom->more;
-    if (!denom->magic) {
-        return _mm256_srli_epi16(numers, more);
-    } else {
-        __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
-        if (more & LIBDIVIDE_ADD_MARKER) {
-            __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
-            return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
-        } else {
-            return _mm256_srli_epi16(q, more);
-        }
-    }
+    SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16)
 }
 
-__m256i libdivide_u16_branchfree_do_vec256(
-    __m256i numers, const struct libdivide_u16_branchfree_t *denom) {
-    __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
-    __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
-    return _mm256_srli_epi16(t, denom->more);
+__m256i libdivide_u16_branchfree_do_vec256(__m256i numers, const struct libdivide_u16_branchfree_t *denom) {
+    SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16_branchfree)
 }
 
 ////////// UINT32
@@ -2479,54 +2429,11 @@ __m256i libdivide_u64_branchfree_do_vec256(
 ////////// SINT16
 
 __m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) {
-    uint8_t more = denom->more;
-    if (!denom->magic) {
-        uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
-        uint16_t mask = ((uint16_t)1 << shift) - 1;
-        __m256i roundToZeroTweak = _mm256_set1_epi16(mask);
-        // q = numer + ((numer >> 15) & roundToZeroTweak);
-        __m256i q = _mm256_add_epi16(
-            numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak));
-        q = _mm256_srai_epi16(q, shift);
-        __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
-        // q = (q ^ sign) - sign;
-        q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);
-        return q;
-    } else {
-        __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic));
-        if (more & LIBDIVIDE_ADD_MARKER) {
-            // must be arithmetic shift
-            __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
-            // q += ((numer ^ sign) - sign);
-            q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign));
-        }
-        // q >>= shift
-        q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
-        q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15));  // q += (q < 0)
-        return q;
-    }
+    SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16)
 }
 
-__m256i libdivide_s16_branchfree_do_vec256(
-    __m256i numers, const struct libdivide_s16_branchfree_t *denom) {
-    int16_t magic = denom->magic;
-    uint8_t more = denom->more;
-    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
-    // must be arithmetic shift
-    __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
-    __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic));
-    q = _mm256_add_epi16(q, numers);  // q += numers
-
-    // If q is non-negative, we have nothing to do
-    // If q is negative, we want to add either (2**shift)-1 if d is
-    // a power of 2, or (2**shift) if it is not a power of 2
-    uint16_t is_power_of_2 = (magic == 0);
-    __m256i q_sign = _mm256_srai_epi16(q, 15);  // q_sign = q >> 15
-    __m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
-    q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask));  // q = q + (q_sign & mask)
-    q = _mm256_srai_epi16(q, shift);  // q >>= shift
-    q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);  // q = (q ^ sign) - sign
-    return q;
+__m256i libdivide_s16_branchfree_do_vec256(__m256i numers, const struct libdivide_s16_branchfree_t *denom) {
+    SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16_branchfree)
 }
 
 ////////// SINT32
@@ -2754,25 +2661,11 @@ static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y
 ////////// UINT16
 
 __m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {
-    uint8_t more = denom->more;
-    if (!denom->magic) {
-        return _mm_srli_epi16(numers, more);
-    } else {
-        __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
-        if (more & LIBDIVIDE_ADD_MARKER) {
-            __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
-            return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
-        } else {
-            return _mm_srli_epi16(q, more);
-        }
-    }
+    SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16)
 }
 
-__m128i libdivide_u16_branchfree_do_vec128(
-    __m128i numers, const struct libdivide_u16_branchfree_t *denom) {
-    __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
-    __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
-    return _mm_srli_epi16(t, denom->more);
+__m128i libdivide_u16_branchfree_do_vec128(__m128i numers, const struct libdivide_u16_branchfree_t *denom) {
+    SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16_branchfree)
 }
 
 ////////// UINT32
@@ -2832,54 +2725,11 @@ __m128i libdivide_u64_branchfree_do_vec128(
 ////////// SINT16
 
 __m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) {
-    uint8_t more = denom->more;
-    if (!denom->magic) {
-        uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
-        uint16_t mask = ((uint16_t)1 << shift) - 1;
-        __m128i roundToZeroTweak = _mm_set1_epi16(mask);
-        // q = numer + ((numer >> 15) & roundToZeroTweak);
-        __m128i q =
-            _mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak));
-        q = _mm_srai_epi16(q, shift);
-        __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
-        // q = (q ^ sign) - sign;
-        q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);
-        return q;
-    } else {
-        __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic));
-        if (more & LIBDIVIDE_ADD_MARKER) {
-            // must be arithmetic shift
-            __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
-            // q += ((numer ^ sign) - sign);
-            q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign));
-        }
-        // q >>= shift
-        q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
-        q = _mm_add_epi16(q, _mm_srli_epi16(q, 15));  // q += (q < 0)
-        return q;
-    }
+    SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16)
 }
 
-__m128i libdivide_s16_branchfree_do_vec128(
-    __m128i numers, const struct libdivide_s16_branchfree_t *denom) {
-    int16_t magic = denom->magic;
-    uint8_t more = denom->more;
-    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
-    // must be arithmetic shift
-    __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
-    __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic));
-    q = _mm_add_epi16(q, numers);  // q += numers
-
-    // If q is non-negative, we have nothing to do
-    // If q is negative, we want to add either (2**shift)-1 if d is
-    // a power of 2, or (2**shift) if it is not a power of 2
-    uint16_t is_power_of_2 = (magic == 0);
-    __m128i q_sign = _mm_srai_epi16(q, 15);  // q_sign = q >> 15
-    __m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
-    q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)
-    q = _mm_srai_epi16(q, shift);  // q >>= shift
-    q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);  // q = (q ^ sign) - sign
-    return q;
+__m128i libdivide_s16_branchfree_do_vec128(__m128i numers, const struct libdivide_s16_branchfree_t *denom) {
+    SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16_branchfree)
 }
 
 ////////// SINT32
@@ -2945,8 +2795,8 @@ __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *de
         uint64_t mask = ((uint64_t)1 << shift) - 1;
         __m128i roundToZeroTweak = _mm_set1_epi64x(mask);
         // q = numer + ((numer >> 63) & roundToZeroTweak);
-        __m128i q = _mm_add_epi64(
-            numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
+        __m128i q =
+            _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
         q = libdivide_s64_shift_right_vec128(q, shift);
         __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
         // q = (q ^ sign) - sign;
@@ -2997,80 +2847,49 @@ __m128i libdivide_s64_branchfree_do_vec128(
 
 #ifdef __cplusplus
 
-//for constexpr zero initialization,
-//c++11 might handle things ok,
-//but just limit to at least c++14 to ensure
-//we don't break anyone's code:
-
-// for gcc and clang, use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr
-#if (defined(__GNUC__) || defined(__clang__)) && (__cpp_constexpr >= 201304L)
-#define LIBDIVIDE_CONSTEXPR constexpr
-
-// supposedly, MSVC might not implement feature test macros right (https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c)
-// so check that _MSVC_LANG corresponds to at least c++14, and _MSC_VER corresponds to at least VS 2017 15.0 (for extended constexpr support https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170)
-#elif defined(_MSC_VER) && _MSC_VER >= 1910 && defined(_MSVC_LANG) && _MSVC_LANG >=201402L
-#define LIBDIVIDE_CONSTEXPR constexpr
-
-// in case some other obscure compiler has the right __cpp_constexpr :
-#elif defined(__cpp_constexpr) && __cpp_constexpr >= 201304L
-#define LIBDIVIDE_CONSTEXPR constexpr
-
-#else
-#define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE
-#endif
-
 enum Branching {
     BRANCHFULL,  // use branching algorithms
     BRANCHFREE   // use branchfree algorithms
 };
 
-namespace detail {
-enum Signedness {
-    SIGNED,
-    UNSIGNED,
-};
-
 #if defined(LIBDIVIDE_NEON)
 // Helper to deduce NEON vector type for integral type.
-template <int _WIDTH, Signedness _SIGN>
-struct NeonVec {};
+template <typename T>
+struct NeonVecFor {};
 
 template <>
-struct NeonVec<16, UNSIGNED> {
+struct NeonVecFor<uint16_t> {
     typedef uint16x8_t type;
 };
 
 template <>
-struct NeonVec<16, SIGNED> {
+struct NeonVecFor<int16_t> {
     typedef int16x8_t type;
 };
 
 template <>
-struct NeonVec<32, UNSIGNED> {
+struct NeonVecFor<uint32_t> {
    typedef uint32x4_t type;
 };
 
 template <>
-struct NeonVec<32, SIGNED> {
+struct NeonVecFor<int32_t> {
    typedef int32x4_t type;
 };
 
 template <>
-struct NeonVec<64, UNSIGNED> {
+struct NeonVecFor<uint64_t> {
    typedef uint64x2_t type;
 };
 
 template <>
-struct NeonVec<64, SIGNED> {
+struct NeonVecFor<int64_t> {
     typedef int64x2_t type;
 };
+#endif
 
-template <typename T>
-struct NeonVecFor {
-    // See 'class divider' for an explanation of these template parameters.
-    typedef typename NeonVec<sizeof(T) * 8, (((T)0 >> 0) > (T)(-1) ? SIGNED : UNSIGNED)>::type type;
-};
-
+// Versions of our algorithms for SIMD.
+#if defined(LIBDIVIDE_NEON)
 #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \
     LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \
         typename NeonVecFor<INT_TYPE>::type n) const { \
@@ -3079,7 +2898,6 @@ struct NeonVecFor {
 #else
 #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)
 #endif
-
 #if defined(LIBDIVIDE_SSE2)
 #define LIBDIVIDE_DIVIDE_SSE2(ALGO) \
     LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \
@@ -3112,7 +2930,6 @@ struct NeonVecFor {
 #define DISPATCHER_GEN(T, ALGO) \
     libdivide_##ALGO##_t denom; \
     LIBDIVIDE_INLINE dispatcher() {} \
-    explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {} \
     LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \
     LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \
     LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \
@@ -3122,81 +2939,66 @@ struct NeonVecFor {
     LIBDIVIDE_DIVIDE_AVX512(ALGO)
 
 // The dispatcher selects a specific division algorithm for a given
-// width, signedness, and ALGO using partial template specialization.
-template <int _WIDTH, Signedness _SIGN, Branching _ALGO>
+// type and ALGO using partial template specialization.
+template <typename _IntT, Branching ALGO>
 struct dispatcher {};
 
 template <>
-struct dispatcher<16, SIGNED, BRANCHFULL> {
+struct dispatcher<int16_t, BRANCHFULL> {
     DISPATCHER_GEN(int16_t, s16)
 };
 template <>
-struct dispatcher<16, SIGNED, BRANCHFREE> {
+struct dispatcher<int16_t, BRANCHFREE> {
     DISPATCHER_GEN(int16_t, s16_branchfree)
 };
 template <>
-struct dispatcher<16, UNSIGNED, BRANCHFULL> {
+struct dispatcher<uint16_t, BRANCHFULL> {
     DISPATCHER_GEN(uint16_t, u16)
 };
 template <>
-struct dispatcher<16, UNSIGNED, BRANCHFREE> {
+struct dispatcher<uint16_t, BRANCHFREE> {
     DISPATCHER_GEN(uint16_t, u16_branchfree)
 };
 template <>
-struct dispatcher<32, SIGNED, BRANCHFULL> {
+struct dispatcher<int32_t, BRANCHFULL> {
     DISPATCHER_GEN(int32_t, s32)
 };
 template <>
-struct dispatcher<32, SIGNED, BRANCHFREE> {
+struct dispatcher<int32_t, BRANCHFREE> {
     DISPATCHER_GEN(int32_t, s32_branchfree)
 };
 template <>
-struct dispatcher<32, UNSIGNED, BRANCHFULL> {
+struct dispatcher<uint32_t, BRANCHFULL> {
     DISPATCHER_GEN(uint32_t, u32)
 };
 template <>
-struct dispatcher<32, UNSIGNED, BRANCHFREE> {
+struct dispatcher<uint32_t, BRANCHFREE> {
     DISPATCHER_GEN(uint32_t, u32_branchfree)
 };
 template <>
-struct dispatcher<64, SIGNED, BRANCHFULL> {
+struct dispatcher<int64_t, BRANCHFULL> {
     DISPATCHER_GEN(int64_t, s64)
 };
 template <>
-struct dispatcher<64, SIGNED, BRANCHFREE> {
+struct dispatcher<int64_t, BRANCHFREE> {
     DISPATCHER_GEN(int64_t, s64_branchfree)
 };
 template <>
-struct dispatcher<64, UNSIGNED, BRANCHFULL> {
+struct dispatcher<uint64_t, BRANCHFULL> {
     DISPATCHER_GEN(uint64_t, u64)
 };
 template <>
-struct dispatcher<64, UNSIGNED, BRANCHFREE> {
+struct dispatcher<uint64_t, BRANCHFREE> {
     DISPATCHER_GEN(uint64_t, u64_branchfree)
 };
-} // namespace detail
-
-#if defined(LIBDIVIDE_NEON)
-// Allow NeonVecFor outside of detail namespace.
-template <typename T>
-struct NeonVecFor {
-    typedef typename detail::NeonVecFor<T>::type type;
-};
-#endif
 
 // This is the main divider class for use by the user (C++ API).
 // The actual division algorithm is selected using the dispatcher struct
-// based on the integer width and algorithm template parameters.
+// based on the integer and algorithm template parameters.
 template <typename T, Branching ALGO = BRANCHFULL>
 class divider {
 private:
-    // Dispatch based on the size and signedness.
-    // We avoid using type_traits as it's not available in AVR.
-    // Detect signedness by checking if T(-1) is less than T(0).
-    // Also throw in a shift by 0, which prevents floating point types from being passed.
-    typedef detail::dispatcher<sizeof(T) * 8,
-        (((T)0 >> 0) > (T)(-1) ? detail::SIGNED : detail::UNSIGNED), ALGO>
-        dispatcher_t;
+    typedef dispatcher<T, ALGO> dispatcher_t;
 
 public:
     // We leave the default constructor empty so that creating
@@ -3204,9 +3006,6 @@ class divider {
     // later doesn't slow us down.
     divider() {}
 
-    // constexpr zero-initialization to allow for use w/ static constinit
-    explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {}
-
     // Constructor that takes the divisor as a parameter
     LIBDIVIDE_INLINE divider(T d) : div(d) {}
 
@@ -3218,7 +3017,7 @@ class divider {
     T recover() const { return div.recover(); }
 
     bool operator==(const divider<T, ALGO> &other) const {
-        return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more;
+        return div.denom.magic == other.denom.magic && div.denom.more == other.denom.more;
     }
 
     bool operator!=(const divider<T, ALGO> &other) const { return !(*this == other); }
@@ -3299,14 +3098,12 @@ LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) {
 
 #if defined(LIBDIVIDE_NEON)
 template <typename T, Branching ALGO>
-LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(
-    typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
+LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
     return div.divide(n);
 }
 
 template <typename T, Branching ALGO>
-LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(
-    typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
+LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
     n = div.divide(n);
     return n;
 }
diff --git a/util.c b/util.c
index a43679c..a3d6f0c 100644
--- a/util.c
+++ b/util.c
@@ -6,8 +6,6 @@
 
 #ifdef __ANDROID__
 #include <async_safe/log.h>
-int mallopt(int param, int value);
-#define M_BIONIC_RESTORE_DEFAULT_SIGABRT_HANDLER (-1003)
 #endif
 
 #include "util.h"
@@ -32,7 +30,6 @@ static int write_full(int fd, const char *buf, size_t length) {
 
 COLD noreturn void fatal_error(const char *s) {
 #ifdef __ANDROID__
-    mallopt(M_BIONIC_RESTORE_DEFAULT_SIGABRT_HANDLER, 0);
     async_safe_fatal("hardened_malloc: fatal allocator error: %s", s);
 #else
     const char *prefix = "fatal allocator error: ";
diff --git a/util.h b/util.h
index 6b1a390..9a4a7af 100644
--- a/util.h
+++ b/util.h
@@ -9,9 +9,7 @@
 
 #define noreturn __attribute__((noreturn))
 #define likely(x) __builtin_expect(!!(x), 1)
-#define likely51(x) __builtin_expect_with_probability(!!(x), 1, 0.51)
 #define unlikely(x) __builtin_expect(!!(x), 0)
-#define unlikely51(x) __builtin_expect_with_probability(!!(x), 0, 0.51)
 
 #define min(x, y) ({ \
     __typeof__(x) _x = (x); \
@@ -59,22 +57,6 @@ static inline size_t align(size_t size, size_t align) {
     return (size + mask) & ~mask;
 }
 
-// u4_arr_{set,get} are helper functions for using u8 array as an array of unsigned 4-bit values.
-
-// val is treated as a 4-bit value
-static inline void u4_arr_set(u8 *arr, size_t idx, u8 val) {
-    size_t off = idx >> 1;
-    size_t shift = (idx & 1) << 2;
-    u8 mask = (u8) (0xf0 >> shift);
-    arr[off] = (arr[off] & mask) | (val << shift);
-}
-
-static inline u8 u4_arr_get(const u8 *arr, size_t idx) {
-    size_t off = idx >> 1;
-    size_t shift = (idx & 1) << 2;
-    return (u8) ((arr[off] >> shift) & 0xf);
-}
-
 COLD noreturn void fatal_error(const char *s);
 
 #if CONFIG_SEAL_METADATA
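
Note: the removed u4_arr helpers stored two 4-bit values per byte — even indices in the low nibble, odd indices in the high nibble. A standalone sketch of the same packing arithmetic (demo code mirroring the deleted functions, with uint8_t standing in for the project's u8 typedef):

#include <stdint.h>
#include <stdio.h>

static void u4_set(uint8_t *arr, size_t idx, uint8_t val) {
    size_t off = idx >> 1;
    size_t shift = (idx & 1) << 2;            // 0 for even idx, 4 for odd
    uint8_t mask = (uint8_t)(0xf0 >> shift);  // keeps the other nibble
    arr[off] = (arr[off] & mask) | (uint8_t)(val << shift);
}

static uint8_t u4_get(const uint8_t *arr, size_t idx) {
    return (uint8_t)((arr[idx >> 1] >> ((idx & 1) << 2)) & 0xf);
}

int main(void) {
    uint8_t arr[2] = {0};
    u4_set(arr, 0, 0x3);
    u4_set(arr, 1, 0xe);
    u4_set(arr, 2, 0x9);
    // Prints "3 e 9": three 4-bit values packed into two bytes.
    printf("%x %x %x\n", u4_get(arr, 0), u4_get(arr, 1), u4_get(arr, 2));
    return 0;
}
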