From 19a46e0f9691e6acf0b6c400508397f1e1e05472 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Fri, 27 Oct 2023 21:14:32 +0300 Subject: [PATCH 01/53] add helper functions for using u8 array as u4 array --- util.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/util.h b/util.h index 9a4a7af..9b5abe6 100644 --- a/util.h +++ b/util.h @@ -1,6 +1,7 @@ #ifndef UTIL_H #define UTIL_H +#include #include #include #include @@ -57,6 +58,23 @@ static inline size_t align(size_t size, size_t align) { return (size + mask) & ~mask; } +// u4_arr_{set,get} are helper functions for using u8 array as an array of unsigned 4-bit values. +static_assert(sizeof(u8) == 1, "unexpected u8 size"); + +// val is treated as a 4-bit value +static inline void u4_arr_set(u8 *arr, size_t idx, u8 val) { + size_t off = idx >> 1; + size_t shift = (idx & 1) << 2; + u8 mask = (u8) (0xf0 >> shift); + arr[off] = (arr[off] & mask) | (val << shift); +} + +static inline u8 u4_arr_get(const u8 *arr, size_t idx) { + size_t off = idx >> 1; + size_t shift = (idx & 1) << 2; + return (u8) ((arr[off] >> shift) & 0xf); +} + COLD noreturn void fatal_error(const char *s); #if CONFIG_SEAL_METADATA From e3686ae457bef717e12ad08cfaaf17a1e9f88bbe Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Thu, 26 Oct 2023 10:22:08 +0300 Subject: [PATCH 02/53] add support for Arm MTE memory tagging - tag slab allocations with [1..14] tags - tag freed slab allocations with the "15" tag value to detect accesses to freed slab memory - when generating tag value for a slab slot, always exclude most recent tag value for that slot (to make use-after-free detection more reliable) and most recent tag values of its immediate neighbors (to detect linear overflows and underflows) --- arm_mte.h | 86 +++++++++++++++++++++++++++++++++++++++++++ h_malloc.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++--- memory.c | 14 +++++++ memory.h | 3 ++ memtag.h | 52 ++++++++++++++++++++++++++ 5 files changed, 254 insertions(+), 6 deletions(-) create mode 100644 arm_mte.h create mode 100644 memtag.h diff --git a/arm_mte.h b/arm_mte.h new file mode 100644 index 0000000..8deefb2 --- /dev/null +++ b/arm_mte.h @@ -0,0 +1,86 @@ +#ifndef ARM_MTE_H +#define ARM_MTE_H + +#include +#include + +// Returns a tagged pointer. +// See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions/IRG--Insert-Random-Tag- +static inline void *arm_mte_create_random_tag(void *p, u64 exclusion_mask) { + return __arm_mte_create_random_tag(p, exclusion_mask); +} + +// Tag the memory region with the tag specified in tag bits of tagged_ptr. Memory region itself is +// zeroed. +// Arm's software optimization guide says: +// "it is recommended to use STZGM (or DCZGVA) to set tag if data is not a concern." (STZGM and +// DCGZVA are zeroing variants of tagging instructions). +// +// Contents of this function were copied from scudo: +// https://android.googlesource.com/platform/external/scudo/+/refs/tags/android-14.0.0_r1/standalone/memtag.h#167 +static inline void arm_mte_store_tags_and_clear(void *tagged_ptr, size_t len) { + uintptr_t Begin = (uintptr_t) tagged_ptr; + uintptr_t End = Begin + len; + uintptr_t LineSize, Next, Tmp; + __asm__ __volatile__( + ".arch_extension memtag \n\t" + + // Compute the cache line size in bytes (DCZID_EL0 stores it as the log2 + // of the number of 4-byte words) and bail out to the slow path if DCZID_EL0 + // indicates that the DC instructions are unavailable. 
+ "DCZID .req %[Tmp] \n\t" + "mrs DCZID, dczid_el0 \n\t" + "tbnz DCZID, #4, 3f \n\t" + "and DCZID, DCZID, #15 \n\t" + "mov %[LineSize], #4 \n\t" + "lsl %[LineSize], %[LineSize], DCZID \n\t" + ".unreq DCZID \n\t" + + // Our main loop doesn't handle the case where we don't need to perform any + // DC GZVA operations. If the size of our tagged region is less than + // twice the cache line size, bail out to the slow path since it's not + // guaranteed that we'll be able to do a DC GZVA. + "Size .req %[Tmp] \n\t" + "sub Size, %[End], %[Cur] \n\t" + "cmp Size, %[LineSize], lsl #1 \n\t" + "b.lt 3f \n\t" + ".unreq Size \n\t" + + "LineMask .req %[Tmp] \n\t" + "sub LineMask, %[LineSize], #1 \n\t" + + // STZG until the start of the next cache line. + "orr %[Next], %[Cur], LineMask \n\t" + + "1:\n\t" + "stzg %[Cur], [%[Cur]], #16 \n\t" + "cmp %[Cur], %[Next] \n\t" + "b.lt 1b \n\t" + + // DC GZVA cache lines until we have no more full cache lines. + "bic %[Next], %[End], LineMask \n\t" + ".unreq LineMask \n\t" + + "2: \n\t" + "dc gzva, %[Cur] \n\t" + "add %[Cur], %[Cur], %[LineSize] \n\t" + "cmp %[Cur], %[Next] \n\t" + "b.lt 2b \n\t" + + // STZG until the end of the tagged region. This loop is also used to handle + // slow path cases. + + "3: \n\t" + "cmp %[Cur], %[End] \n\t" + "b.ge 4f \n\t" + "stzg %[Cur], [%[Cur]], #16 \n\t" + "b 3b \n\t" + + "4: \n\t" + + : [Cur] "+&r"(Begin), [LineSize] "=&r"(LineSize), [Next] "=&r"(Next), [Tmp] "=&r"(Tmp) + : [End] "r"(End) + : "memory" + ); +} +#endif diff --git a/h_malloc.c b/h_malloc.c index 2dc0bde..098eb37 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -14,6 +14,7 @@ #include "h_malloc.h" #include "memory.h" +#include "memtag.h" #include "mutex.h" #include "pages.h" #include "random.h" @@ -66,6 +67,10 @@ static atomic_uint thread_arena_counter = 0; static const unsigned thread_arena = 0; #endif +#ifdef MEMTAG +bool __is_memtag_enabled = true; +#endif + static union { struct { void *slab_region_start; @@ -99,6 +104,18 @@ struct slab_metadata { #if SLAB_QUARANTINE u64 quarantine_bitmap[4]; #endif +#ifdef HAS_ARM_MTE + // arm_mte_tags is used as a u4 array (MTE tags are 4-bit wide) + // + // Its size is calculated by the following formula: + // (MAX_SLAB_SLOT_COUNT + 2) / 2 + // MAX_SLAB_SLOT_COUNT is currently 256, 2 extra slots are needed for branchless handling of + // edge slots in tag_and_clear_slab_slot() + // + // It's intentionally placed at the end of struct to improve locality: for most size classes, + // slot count is far lower than MAX_SLAB_SLOT_COUNT. + u8 arm_mte_tags[129]; +#endif }; static const size_t min_align = 16; @@ -506,6 +523,47 @@ static inline void stats_slab_deallocate(UNUSED struct size_class *c, UNUSED siz #endif } +static void *tag_and_clear_slab_slot(struct slab_metadata *metadata, void *slot_ptr, size_t slot_idx, size_t slot_size) { +#ifdef HAS_ARM_MTE + if (unlikely(!is_memtag_enabled())) { + return slot_ptr; + } + + // arm_mte_tags is an array of 4-bit unsigned integers stored as u8 array (MTE tags are 4-bit wide) + // + // It stores the most recent tag for each slab slot, or 0 if the slot was never used. + // Slab indices in arm_mte_tags array are shifted to the right by 1, and size of this array + // is (MAX_SLAB_SLOT_COUNT + 2). This means that first and last values of arm_mte_tags array + // are always 0, which allows to handle edge slots in a branchless way when tag exclusion mask + // is constructed. 
+ u8 *slot_tags = metadata->arm_mte_tags; + + // Tag exclusion mask + u64 tem = (1 << 0) | (1 << RESERVED_TAG); + + // current or previous tag of left neighbor or 0 if there's no left neighbor or if it was never used + tem |= (1 << u4_arr_get(slot_tags, slot_idx)); + // previous tag of this slot or 0 if it was never used + tem |= (1 << u4_arr_get(slot_tags, slot_idx + 1)); + // current or previous tag of right neighbor or 0 if there's no right neighbor or if it was never used + tem |= (1 << u4_arr_get(slot_tags, slot_idx + 2)); + + void *tagged_ptr = arm_mte_create_random_tag(slot_ptr, tem); + // slot addresses and sizes are always aligned by 16 + arm_mte_store_tags_and_clear(tagged_ptr, slot_size); + + // store new tag of this slot + u4_arr_set(slot_tags, slot_idx + 1, get_pointer_tag(tagged_ptr)); + + return tagged_ptr; +#else + (void) metadata; + (void) slot_idx; + (void) slot_size; + return slot_ptr; +#endif +} + static inline void *allocate_small(unsigned arena, size_t requested_size) { struct size_info info = get_size_info(requested_size); size_t size = likely(info.size) ? info.size : 16; @@ -534,6 +592,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { if (requested_size) { write_after_free_check(p, size - canary_size); set_canary(metadata, p, size); + p = tag_and_clear_slab_slot(metadata, p, slot, size); } stats_small_allocate(c, size); @@ -566,6 +625,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { void *p = slot_pointer(size, slab, slot); if (requested_size) { set_canary(metadata, p, size); + p = tag_and_clear_slab_slot(metadata, p, slot, size); } stats_slab_allocate(c, slab_size); stats_small_allocate(c, size); @@ -588,6 +648,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { void *p = slot_pointer(size, slab, slot); if (requested_size) { set_canary(metadata, p, size); + p = tag_and_clear_slab_slot(metadata, p, slot, size); } stats_slab_allocate(c, slab_size); stats_small_allocate(c, size); @@ -612,6 +673,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { if (requested_size) { write_after_free_check(p, size - canary_size); set_canary(metadata, p, size); + p = tag_and_clear_slab_slot(metadata, p, slot, size); } stats_small_allocate(c, size); @@ -694,7 +756,17 @@ static inline void deallocate_small(void *p, const size_t *expected_size) { if (likely(!is_zero_size)) { check_canary(metadata, p, size); - if (ZERO_ON_FREE) { + bool skip_zero = false; +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + arm_mte_store_tags_and_clear(set_pointer_tag(p, RESERVED_TAG), size); + // metadata->arm_mte_tags is intentionally not updated, it should keep the previous slot + // tag after slot is freed + skip_zero = true; + } +#endif + + if (ZERO_ON_FREE && !skip_zero) { memset(p, 0, size - canary_size); } } @@ -1123,8 +1195,15 @@ COLD static void init_slow_path(void) { if (unlikely(memory_protect_rw_metadata(ra->regions, ra->total * sizeof(struct region_metadata)))) { fatal_error("failed to unprotect memory for regions table"); } - +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + ro.slab_region_start = memory_map_mte(slab_region_size); + } else { + ro.slab_region_start = memory_map(slab_region_size); + } +#else ro.slab_region_start = memory_map(slab_region_size); +#endif if (unlikely(ro.slab_region_start == NULL)) { fatal_error("failed to allocate slab region"); } @@ -1368,6 +1447,11 @@ EXPORT void *h_calloc(size_t nmemb, size_t size) { if (!ZERO_ON_FREE && 
likely(p != NULL) && total_size && total_size <= max_slab_size_class) { memset(p, 0, total_size - canary_size); } +#ifdef HAS_ARM_MTE + // use an assert instead of adding a conditional to memset() above (freed memory is always + // zeroed when MTE is enabled) + static_assert(ZERO_ON_FREE, "disabling ZERO_ON_FREE reduces performance when ARM MTE is enabled"); +#endif return p; } @@ -1385,11 +1469,14 @@ EXPORT void *h_realloc(void *old, size_t size) { } } + void *old_orig = old; + old = untag_pointer(old); + size_t old_size; if (old < get_slab_region_end() && old >= ro.slab_region_start) { old_size = slab_usable_size(old); if (size <= max_slab_size_class && get_size_info(size).size == old_size) { - return old; + return old_orig; } thread_unseal_metadata(); } else { @@ -1502,7 +1589,7 @@ EXPORT void *h_realloc(void *old, size_t size) { if (copy_size > 0 && copy_size <= max_slab_size_class) { copy_size -= canary_size; } - memcpy(new, old, copy_size); + memcpy(new, old_orig, copy_size); if (old_size <= max_slab_size_class) { deallocate_small(old, NULL); } else { @@ -1543,6 +1630,8 @@ EXPORT void h_free(void *p) { return; } + p = untag_pointer(p); + if (p < get_slab_region_end() && p >= ro.slab_region_start) { thread_unseal_metadata(); deallocate_small(p, NULL); @@ -1566,6 +1655,8 @@ EXPORT void h_free_sized(void *p, size_t expected_size) { return; } + p = untag_pointer(p); + expected_size = adjust_size_for_canary(expected_size); if (p < get_slab_region_end() && p >= ro.slab_region_start) { @@ -1619,11 +1710,13 @@ static inline void memory_corruption_check_small(const void *p) { mutex_unlock(&c->lock); } -EXPORT size_t h_malloc_usable_size(H_MALLOC_USABLE_SIZE_CONST void *p) { - if (p == NULL) { +EXPORT size_t h_malloc_usable_size(H_MALLOC_USABLE_SIZE_CONST void *arg) { + if (arg == NULL) { return 0; } + void *p = untag_pointer((void *) (uintptr_t) arg); + if (p < get_slab_region_end() && p >= ro.slab_region_start) { thread_unseal_metadata(); memory_corruption_check_small(p); diff --git a/memory.c b/memory.c index 04afc23..5434060 100644 --- a/memory.c +++ b/memory.c @@ -28,6 +28,20 @@ void *memory_map(size_t size) { return p; } +#ifdef HAS_ARM_MTE +// Note that PROT_MTE can't be cleared via mprotect +void *memory_map_mte(size_t size) { + void *p = mmap(NULL, size, PROT_MTE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (unlikely(p == MAP_FAILED)) { + if (errno != ENOMEM) { + fatal_error("non-ENOMEM MTE mmap failure"); + } + return NULL; + } + return p; +} +#endif + bool memory_map_fixed(void *ptr, size_t size) { void *p = mmap(ptr, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); bool ret = p == MAP_FAILED; diff --git a/memory.h b/memory.h index c04bfd9..6e4cd4d 100644 --- a/memory.h +++ b/memory.h @@ -11,6 +11,9 @@ int get_metadata_key(void); void *memory_map(size_t size); +#ifdef HAS_ARM_MTE +void *memory_map_mte(size_t size); +#endif bool memory_map_fixed(void *ptr, size_t size); bool memory_unmap(void *ptr, size_t size); bool memory_protect_ro(void *ptr, size_t size); diff --git a/memtag.h b/memtag.h new file mode 100644 index 0000000..a768d41 --- /dev/null +++ b/memtag.h @@ -0,0 +1,52 @@ +#ifndef MEMTAG_H +#define MEMTAG_H + +#include "util.h" + +#ifdef HAS_ARM_MTE +#include "arm_mte.h" +#define MEMTAG 1 +#define RESERVED_TAG 15 +#define TAG_WIDTH 4 +#endif + +#ifdef MEMTAG +extern bool __is_memtag_enabled; +#endif + +static inline bool is_memtag_enabled(void) { +#ifdef MEMTAG + return __is_memtag_enabled; +#else + return false; +#endif +} + +static inline void *untag_pointer(void 
*ptr) { +#ifdef HAS_ARM_MTE + const uintptr_t mask = UINTPTR_MAX >> 8; + return (void *) ((uintptr_t) ptr & mask); +#else + return ptr; +#endif +} + +static inline void *set_pointer_tag(void *ptr, u8 tag) { +#ifdef HAS_ARM_MTE + return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr)); +#else + (void) tag; + return ptr; +#endif +} + +static inline u8 get_pointer_tag(void *ptr) { +#ifdef HAS_ARM_MTE + return (((uintptr_t) ptr) >> 56) & 0xf; +#else + (void) ptr; + return 0; +#endif +} + +#endif From 70c91f4c3e553bc37e146c9c545a186e4ec02054 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Thu, 26 Oct 2023 10:17:21 +0300 Subject: [PATCH 03/53] mte: disable write-after-free check for slab allocations when MTE is on Freed slab memory is tagged with a reserved tag value that is never used for live allocations. --- h_malloc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/h_malloc.c b/h_malloc.c index 098eb37..fc36ad7 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -464,6 +464,12 @@ static void write_after_free_check(const char *p, size_t size) { return; } +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + return; + } +#endif + for (size_t i = 0; i < size; i += sizeof(u64)) { if (unlikely(*(const u64 *)(const void *)(p + i))) { fatal_error("detected write after free"); From 001fc865855aad7933bd2230a55267cd4ab5cdb9 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Thu, 26 Oct 2023 10:19:20 +0300 Subject: [PATCH 04/53] mte: disable slab canaries when MTE is on Canary with the "0" value is now reserved to support re-enabling slab canaries if MTE is turned off at runtime. --- h_malloc.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/h_malloc.c b/h_malloc.c index fc36ad7..9a3a732 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -484,19 +484,43 @@ static void set_slab_canary_value(UNUSED struct slab_metadata *metadata, UNUSED 0x00ffffffffffffffUL; metadata->canary_value = get_random_u64(rng) & canary_mask; +#ifdef HAS_ARM_MTE + if (unlikely(metadata->canary_value == 0)) { + metadata->canary_value = 0x100; + } +#endif #endif } static void set_canary(UNUSED const struct slab_metadata *metadata, UNUSED void *p, UNUSED size_t size) { #if SLAB_CANARY +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + return; + } +#endif + memcpy((char *)p + size - canary_size, &metadata->canary_value, canary_size); #endif } static void check_canary(UNUSED const struct slab_metadata *metadata, UNUSED const void *p, UNUSED size_t size) { #if SLAB_CANARY +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + return; + } +#endif + u64 canary_value; memcpy(&canary_value, (const char *)p + size - canary_size, canary_size); + +#ifdef HAS_ARM_MTE + if (unlikely(canary_value == 0)) { + return; + } +#endif + if (unlikely(canary_value != metadata->canary_value)) { fatal_error("canary corrupted"); } From f042a6b9b006a5d02660384aa809f54e56cc2db0 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Thu, 26 Oct 2023 10:22:55 +0300 Subject: [PATCH 05/53] android: add function for disabling MTE at runtime On Android, MTE is always enabled in Zygote, and is disabled after fork for apps that didn't opt-in to MTE. Depends on the slab canary adjustments in the previous commit. 
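A minimal sketch of how a libc integration layer might invoke this after fork
(illustrative only, not part of this patch; the after_app_fork hook and the
app_opted_into_mte flag are hypothetical names):

    #include <stdbool.h>
    #include "h_malloc.h"

    // Hypothetical post-fork hook: Zygote forks with MTE enabled, and the
    // child disables tagging for apps that did not opt in. The slab region
    // keeps PROT_MTE (it can't be cleared via mprotect), but new slab
    // allocations are no longer tagged and slab canaries are used again.
    static void after_app_fork(bool app_opted_into_mte) {
        if (!app_opted_into_mte) {
            h_malloc_disable_memory_tagging();
        }
    }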
--- h_malloc.c | 8 ++++++++ include/h_malloc.h | 1 + 2 files changed, 9 insertions(+) diff --git a/h_malloc.c b/h_malloc.c index 9a3a732..c3ce7e2 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -2148,3 +2148,11 @@ COLD EXPORT int h_malloc_set_state(UNUSED void *state) { return -2; } #endif + +#ifdef __ANDROID__ +COLD EXPORT void h_malloc_disable_memory_tagging(void) { +#ifdef HAS_ARM_MTE + __is_memtag_enabled = false; +#endif +} +#endif diff --git a/include/h_malloc.h b/include/h_malloc.h index 5824abb..0eee395 100644 --- a/include/h_malloc.h +++ b/include/h_malloc.h @@ -99,6 +99,7 @@ int h_malloc_iterate(uintptr_t base, size_t size, void (*callback)(uintptr_t ptr void *arg); void h_malloc_disable(void); void h_malloc_enable(void); +void h_malloc_disable_memory_tagging(void); #endif // hardened_malloc extensions From 5137d2da4dbcde359b91ba19f320de80e85c0028 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Fri, 27 Oct 2023 21:26:38 +0300 Subject: [PATCH 06/53] android: enable MTE on devices that declare having it --- Android.bp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Android.bp b/Android.bp index 11725a6..0db6a04 100644 --- a/Android.bp +++ b/Android.bp @@ -73,6 +73,9 @@ cc_library { debuggable: { cflags: ["-DLABEL_MEMORY"], }, + device_has_arm_mte: { + cflags: ["-DHAS_ARM_MTE", "-march=armv9-a+memtag"] + }, }, apex_available: [ "com.android.runtime", From 576328b1b4f5e9345bc4fd0099aa6ec3364bcd83 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Sat, 28 Oct 2023 17:04:27 +0300 Subject: [PATCH 07/53] android: add MTE tests To run them, connect an MTE-enabled device via adb and execute `atest HMallocTest:MemtagTest`. Since these tests are not deterministic (and neither is hardened_malloc itself), it's better to run them multiple times, e.g. `atest --iterations 30 HMallocTest:MemtagTest`. 
There are also CTS tests that are useful for checking correctness of the Android integration: `atest CtsTaggingHostTestCases` --- androidtest/Android.bp | 25 +++ androidtest/AndroidTest.xml | 13 ++ androidtest/memtag/Android.bp | 16 ++ androidtest/memtag/memtag_test.cc | 204 ++++++++++++++++++ .../src/grapheneos/hmalloc/MemtagTest.java | 136 ++++++++++++ 5 files changed, 394 insertions(+) create mode 100644 androidtest/Android.bp create mode 100644 androidtest/AndroidTest.xml create mode 100644 androidtest/memtag/Android.bp create mode 100644 androidtest/memtag/memtag_test.cc create mode 100644 androidtest/src/grapheneos/hmalloc/MemtagTest.java diff --git a/androidtest/Android.bp b/androidtest/Android.bp new file mode 100644 index 0000000..ae0aa49 --- /dev/null +++ b/androidtest/Android.bp @@ -0,0 +1,25 @@ +java_test_host { + name: "HMallocTest", + srcs: [ + "src/**/*.java", + ], + + libs: [ + "tradefed", + "compatibility-tradefed", + "compatibility-host-util", + ], + + static_libs: [ + "cts-host-utils", + "frameworks-base-hostutils", + ], + + test_suites: [ + "general-tests", + ], + + data_device_bins_64: [ + "memtag_test", + ], +} diff --git a/androidtest/AndroidTest.xml b/androidtest/AndroidTest.xml new file mode 100644 index 0000000..333f1dd --- /dev/null +++ b/androidtest/AndroidTest.xml @@ -0,0 +1,13 @@ + + + + + + + + + + diff --git a/androidtest/memtag/Android.bp b/androidtest/memtag/Android.bp new file mode 100644 index 0000000..14ab691 --- /dev/null +++ b/androidtest/memtag/Android.bp @@ -0,0 +1,16 @@ +cc_test { + name: "memtag_test", + srcs: ["memtag_test.cc"], + cflags: [ + "-Wall", + "-Werror", + "-Wextra", + "-O0", + ], + + compile_multilib: "64", + + sanitize: { + memtag_heap: true, + }, +} diff --git a/androidtest/memtag/memtag_test.cc b/androidtest/memtag/memtag_test.cc new file mode 100644 index 0000000..16c379d --- /dev/null +++ b/androidtest/memtag/memtag_test.cc @@ -0,0 +1,204 @@ +// needed to uncondionally enable assertions +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace std; + +using u8 = uint8_t; +using uptr = uintptr_t; +using u64 = uint64_t; + +const size_t DEFAULT_ALLOC_SIZE = 8; +const size_t CANARY_SIZE = 8; + +void do_context_switch() { + utsname s; + uname(&s); +} + +u8 get_pointer_tag(void *ptr) { + return (((uptr) ptr) >> 56) & 0xf; +} + +void *untag_pointer(void *ptr) { + const uintptr_t mask = UINTPTR_MAX >> 8; + return (void *) ((uintptr_t) ptr & mask); +} + +void tag_distinctness() { + if (rand() & 1) { + // make allocations in all of used size classes and free half of them + + const int max = 21000; + void *ptrs[max]; + + for (int i = 0; i < max; ++i) { + ptrs[i] = malloc(max); + } + + for (int i = 1; i < max; i += 2) { + free(ptrs[i]); + } + } + + const size_t cnt = 3000; + const size_t iter_cnt = 5; + const size_t alloc_cnt = cnt * iter_cnt; + + const int sizes[] = { 16, 160, 10240, 20480 }; + + for (size_t size_idx = 0; size_idx < sizeof(sizes) / sizeof(int); ++size_idx) { + const size_t full_alloc_size = sizes[size_idx]; + const size_t alloc_size = full_alloc_size - CANARY_SIZE; + + unordered_map map; + map.reserve(alloc_cnt); + + for (size_t iter = 0; iter < iter_cnt; ++iter) { + uptr allocations[cnt]; + + for (size_t i = 0; i < cnt; ++i) { + u8 *p = (u8 *) malloc(alloc_size); + uptr addr = (uptr) untag_pointer(p); + u8 tag = get_pointer_tag(p); + assert(tag >= 1 && tag <= 14); + + // check most recent tags of left and right neighbors + + auto left = 
map.find(addr - full_alloc_size); + if (left != map.end()) { + assert(left->second != tag); + } + + auto right = map.find(addr + full_alloc_size); + if (right != map.end()) { + assert(right->second != tag); + } + + // check previous tag of this slot + auto prev = map.find(addr); + if (prev != map.end()) { + assert(prev->second != tag); + map.erase(addr); + } + + map.emplace(addr, tag); + + for (size_t j = 0; j < alloc_size; ++j) { + // check that slot is zeroed + assert(p[j] == 0); + // check that slot is readable and writable + p[j]++; + } + + allocations[i] = addr; + // async tag check failures are reported on context switch + do_context_switch(); + } + + for (size_t i = 0; i < cnt; ++i) { + free((void *) allocations[i]); + } + } + } +} + +u8* alloc_default() { + if (rand() & 1) { + int cnt = rand() & 0x3f; + for (int i = 0; i < cnt; ++i) { + (void) malloc(DEFAULT_ALLOC_SIZE); + } + } + return (u8 *) malloc(DEFAULT_ALLOC_SIZE); +} + +volatile u8 u8_var; + +void read_after_free() { + u8 *p = alloc_default(); + free(p); + volatile u8 v = p[0]; + (void) v; +} + +void write_after_free() { + u8 *p = alloc_default(); + free(p); + p[0] = 1; +} + +void underflow_read() { + u8 *p = alloc_default(); + volatile u8 v = p[-1]; + (void) v; +} + +void underflow_write() { + u8 *p = alloc_default(); + p[-1] = 1; +} + +void overflow_read() { + u8 *p = alloc_default(); + volatile u8 v = p[DEFAULT_ALLOC_SIZE + CANARY_SIZE]; + (void) v; +} + +void overflow_write() { + u8 *p = alloc_default(); + p[DEFAULT_ALLOC_SIZE + CANARY_SIZE] = 1; +} + +void untagged_read() { + u8 *p = alloc_default(); + p = (u8 *) untag_pointer(p); + volatile u8 v = p[0]; + (void) v; +} + +void untagged_write() { + u8 *p = alloc_default(); + p = (u8 *) untag_pointer(p); + p[0] = 1; +} + +map> tests = { +#define TEST(s) { #s, s } + TEST(tag_distinctness), + TEST(read_after_free), + TEST(write_after_free), + TEST(overflow_read), + TEST(overflow_write), + TEST(underflow_read), + TEST(underflow_write), + TEST(untagged_read), + TEST(untagged_write), +#undef TEST +}; + +int main(int argc, char **argv) { + setbuf(stdout, NULL); + assert(argc == 2); + + auto test_name = string(argv[1]); + auto test_fn = tests[test_name]; + assert(test_fn != nullptr); + + assert(mallopt(M_BIONIC_SET_HEAP_TAGGING_LEVEL, M_HEAP_TAGGING_LEVEL_ASYNC) == 1); + + test_fn(); + do_context_switch(); + + return 0; +} diff --git a/androidtest/src/grapheneos/hmalloc/MemtagTest.java b/androidtest/src/grapheneos/hmalloc/MemtagTest.java new file mode 100644 index 0000000..5544128 --- /dev/null +++ b/androidtest/src/grapheneos/hmalloc/MemtagTest.java @@ -0,0 +1,136 @@ +package grapheneos.hmalloc; + +import com.android.tradefed.device.DeviceNotAvailableException; +import com.android.tradefed.testtype.DeviceJUnit4ClassRunner; +import com.android.tradefed.testtype.junit4.BaseHostJUnit4Test; + +import org.junit.Test; +import org.junit.runner.RunWith; + +import java.io.IOException; +import java.util.ArrayList; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +@RunWith(DeviceJUnit4ClassRunner.class) +public class MemtagTest extends BaseHostJUnit4Test { + + private static final String TEST_BINARY = "/data/local/tmp/memtag_test"; + + enum Result { + SUCCESS, + // it's expected that the device is configured to use asymm MTE tag checking mode + ASYNC_MTE_ERROR, + SYNC_MTE_ERROR, + } + + private static final int SEGV_EXIT_CODE = 139; + + private void runTest(String name, Result expectedResult) throws DeviceNotAvailableException { + var args = new 
ArrayList(); + args.add(TEST_BINARY); + args.add(name); + var device = getDevice(); + long deviceDate = device.getDeviceDate(); + String cmdLine = String.join(" ", args); + var result = device.executeShellV2Command(cmdLine); + + int expectedExitCode = expectedResult == Result.SUCCESS ? 0 : SEGV_EXIT_CODE; + + assertEquals("process exit code", expectedExitCode, result.getExitCode().intValue()); + + if (expectedResult == Result.SUCCESS) { + return; + } + + try { + // wait a bit for debuggerd to capture the crash + Thread.sleep(50); + } catch (InterruptedException e) { + throw new IllegalStateException(e); + } + + try (var logcat = device.getLogcatSince(deviceDate)) { + try (var s = logcat.createInputStream()) { + String[] lines = new String(s.readAllBytes()).split("\n"); + boolean foundCmd = false; + String cmd = "Cmdline: " + cmdLine; + String expectedSignalCode = switch (expectedResult) { + case ASYNC_MTE_ERROR -> "SEGV_MTEAERR"; + case SYNC_MTE_ERROR -> "SEGV_MTESERR"; + default -> throw new IllegalStateException(expectedResult.name()); + }; + for (String line : lines) { + if (!foundCmd) { + if (line.contains(cmd)) { + foundCmd = true; + } + continue; + } + + if (line.contains("signal 11 (SIGSEGV), code")) { + if (!line.contains(expectedSignalCode)) { + break; + } else { + return; + } + } + + if (line.contains("backtrace")) { + break; + } + } + + fail("missing " + expectedSignalCode + " crash in logcat"); + } catch (IOException e) { + throw new IllegalStateException(e); + } + } + } + + @Test + public void tag_distinctness() throws DeviceNotAvailableException { + runTest("tag_distinctness", Result.SUCCESS); + } + + @Test + public void read_after_free() throws DeviceNotAvailableException { + runTest("read_after_free", Result.SYNC_MTE_ERROR); + } + + @Test + public void write_after_free() throws DeviceNotAvailableException { + runTest("write_after_free", Result.ASYNC_MTE_ERROR); + } + + @Test + public void underflow_read() throws DeviceNotAvailableException { + runTest("underflow_read", Result.SYNC_MTE_ERROR); + } + + @Test + public void underflow_write() throws DeviceNotAvailableException { + runTest("underflow_write", Result.ASYNC_MTE_ERROR); + } + + @Test + public void overflow_read() throws DeviceNotAvailableException { + runTest("overflow_read", Result.SYNC_MTE_ERROR); + } + + @Test + public void overflow_write() throws DeviceNotAvailableException { + runTest("overflow_write", Result.ASYNC_MTE_ERROR); + } + + @Test + public void untagged_read() throws DeviceNotAvailableException { + runTest("untagged_read", Result.SYNC_MTE_ERROR); + } + + @Test + public void untagged_write() throws DeviceNotAvailableException { + runTest("untagged_write", Result.ASYNC_MTE_ERROR); + } +} From 01a199e19e345193dbdc3a407ef564eda789ae0a Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Sat, 28 Oct 2023 22:55:34 +0300 Subject: [PATCH 08/53] mte: move is_memtag_enabled to read-only allocator data --- h_malloc.c | 28 +++++++++++++++++++++++----- memtag.h | 12 ------------ 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/h_malloc.c b/h_malloc.c index c3ce7e2..deb40d5 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -67,10 +67,6 @@ static atomic_uint thread_arena_counter = 0; static const unsigned thread_arena = 0; #endif -#ifdef MEMTAG -bool __is_memtag_enabled = true; -#endif - static union { struct { void *slab_region_start; @@ -80,6 +76,9 @@ static union { struct region_metadata *regions[2]; #ifdef USE_PKEY int metadata_pkey; +#endif +#ifdef MEMTAG + bool is_memtag_disabled; #endif }; char 
padding[PAGE_SIZE]; @@ -89,6 +88,12 @@ static inline void *get_slab_region_end(void) { return atomic_load_explicit(&ro.slab_region_end, memory_order_acquire); } +#ifdef MEMTAG +static inline bool is_memtag_enabled(void) { + return !ro.is_memtag_disabled; +} +#endif + #define SLAB_METADATA_COUNT struct slab_metadata { @@ -2152,7 +2157,20 @@ COLD EXPORT int h_malloc_set_state(UNUSED void *state) { #ifdef __ANDROID__ COLD EXPORT void h_malloc_disable_memory_tagging(void) { #ifdef HAS_ARM_MTE - __is_memtag_enabled = false; + if (!ro.is_memtag_disabled) { + if (is_init()) { + if (unlikely(memory_protect_rw(&ro, sizeof(ro)))) { + fatal_error("failed to unprotect allocator data"); + } + ro.is_memtag_disabled = true; + if (unlikely(memory_protect_ro(&ro, sizeof(ro)))) { + fatal_error("failed to protect allocator data"); + } + } else { + // bionic calls this function very early in some cases + ro.is_memtag_disabled = true; + } + } #endif } #endif diff --git a/memtag.h b/memtag.h index a768d41..89bff75 100644 --- a/memtag.h +++ b/memtag.h @@ -10,18 +10,6 @@ #define TAG_WIDTH 4 #endif -#ifdef MEMTAG -extern bool __is_memtag_enabled; -#endif - -static inline bool is_memtag_enabled(void) { -#ifdef MEMTAG - return __is_memtag_enabled; -#else - return false; -#endif -} - static inline void *untag_pointer(void *ptr) { #ifdef HAS_ARM_MTE const uintptr_t mask = UINTPTR_MAX >> 8; From 93aa9eefe413abbdff9037206bdfd4eae384cd50 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Sun, 29 Oct 2023 09:43:57 +0200 Subject: [PATCH 09/53] mte: make h_malloc_disable_memory_tagging() thread-safe --- h_malloc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/h_malloc.c b/h_malloc.c index deb40d5..447b114 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -1181,13 +1181,14 @@ static inline void enforce_init(void) { } } -COLD static void init_slow_path(void) { - static struct mutex lock = MUTEX_INITIALIZER; +static struct mutex init_lock = MUTEX_INITIALIZER; - mutex_lock(&lock); +COLD static void init_slow_path(void) { + + mutex_lock(&init_lock); if (unlikely(is_init())) { - mutex_unlock(&lock); + mutex_unlock(&init_lock); return; } @@ -1278,7 +1279,7 @@ COLD static void init_slow_path(void) { } memory_set_name(&ro, sizeof(ro), "malloc read-only after init"); - mutex_unlock(&lock); + mutex_unlock(&init_lock); // may allocate, so wait until the allocator is initialized to avoid deadlocking if (unlikely(pthread_atfork(full_lock, full_unlock, post_fork_child))) { @@ -2157,6 +2158,7 @@ COLD EXPORT int h_malloc_set_state(UNUSED void *state) { #ifdef __ANDROID__ COLD EXPORT void h_malloc_disable_memory_tagging(void) { #ifdef HAS_ARM_MTE + mutex_lock(&init_lock); if (!ro.is_memtag_disabled) { if (is_init()) { if (unlikely(memory_protect_rw(&ro, sizeof(ro)))) { @@ -2171,6 +2173,7 @@ COLD EXPORT void h_malloc_disable_memory_tagging(void) { ro.is_memtag_disabled = true; } } + mutex_unlock(&init_lock); #endif } #endif From 577d9583ebfe3f381b1050465c71b2386bdf0cd2 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Sun, 29 Oct 2023 09:49:22 +0200 Subject: [PATCH 10/53] mte: add licensing info for code that was copied from scudo --- arm_mte.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arm_mte.h b/arm_mte.h index 8deefb2..28ff2c0 100644 --- a/arm_mte.h +++ b/arm_mte.h @@ -18,6 +18,9 @@ static inline void *arm_mte_create_random_tag(void *p, u64 exclusion_mask) { // // Contents of this function were copied from scudo: // 
https://android.googlesource.com/platform/external/scudo/+/refs/tags/android-14.0.0_r1/standalone/memtag.h#167 +// +// scudo is licensed under the Apache License v2.0 with LLVM Exceptions, which is compatible with +// the hardened_malloc's MIT license static inline void arm_mte_store_tags_and_clear(void *tagged_ptr, size_t len) { uintptr_t Begin = (uintptr_t) tagged_ptr; uintptr_t End = Begin + len; From 28d5d394cf95c81674ff4aae55ae22f9740fe35f Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Sun, 29 Oct 2023 13:23:40 +0200 Subject: [PATCH 11/53] memtag_test: remove usages of rand() It didn't work correctly due to not being seeded and its usage wasn't necessary. --- androidtest/memtag/memtag_test.cc | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/androidtest/memtag/memtag_test.cc b/androidtest/memtag/memtag_test.cc index 16c379d..097f1fa 100644 --- a/androidtest/memtag/memtag_test.cc +++ b/androidtest/memtag/memtag_test.cc @@ -36,21 +36,6 @@ void *untag_pointer(void *ptr) { } void tag_distinctness() { - if (rand() & 1) { - // make allocations in all of used size classes and free half of them - - const int max = 21000; - void *ptrs[max]; - - for (int i = 0; i < max; ++i) { - ptrs[i] = malloc(max); - } - - for (int i = 1; i < max; i += 2) { - free(ptrs[i]); - } - } - const size_t cnt = 3000; const size_t iter_cnt = 5; const size_t alloc_cnt = cnt * iter_cnt; @@ -114,13 +99,9 @@ void tag_distinctness() { } u8* alloc_default() { - if (rand() & 1) { - int cnt = rand() & 0x3f; - for (int i = 0; i < cnt; ++i) { - (void) malloc(DEFAULT_ALLOC_SIZE); - } - } - return (u8 *) malloc(DEFAULT_ALLOC_SIZE); + u8 *p = (u8 *) malloc(DEFAULT_ALLOC_SIZE); + assert(p); + return p; } volatile u8 u8_var; From 155800526a7211d27b4104a2bf36098c6d31991f Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Sun, 29 Oct 2023 13:30:36 +0200 Subject: [PATCH 12/53] memtag_test: improve tag_distinctness test - check that tag distinctess checks are actually reached (it was previously verified manually by looking at the now-removed printf output) - check that only non-reserved tags are used - check that all of non-reserved tags are used - print tag usage statistics at the end of run --- androidtest/memtag/memtag_test.cc | 127 ++++++++++++++++++++++++------ 1 file changed, 104 insertions(+), 23 deletions(-) diff --git a/androidtest/memtag/memtag_test.cc b/androidtest/memtag/memtag_test.cc index 097f1fa..db48afa 100644 --- a/androidtest/memtag/memtag_test.cc +++ b/androidtest/memtag/memtag_test.cc @@ -35,49 +35,93 @@ void *untag_pointer(void *ptr) { return (void *) ((uintptr_t) ptr & mask); } +// This test checks that slab slot allocation uses tag that is distint from tags of its neighbors +// and from the tag of the previous allocation that used the same slot void tag_distinctness() { - const size_t cnt = 3000; - const size_t iter_cnt = 5; - const size_t alloc_cnt = cnt * iter_cnt; + // 0 and 15 are reserved + const int min_tag = 1; + const int max_tag = 14; - const int sizes[] = { 16, 160, 10240, 20480 }; + struct SizeClass { + int size; + int slot_cnt; + }; - for (size_t size_idx = 0; size_idx < sizeof(sizes) / sizeof(int); ++size_idx) { - const size_t full_alloc_size = sizes[size_idx]; + // values from size_classes[] and size_class_slots[] in h_malloc.c + SizeClass size_classes[] = { + { .size = 16, .slot_cnt = 256, }, + { .size = 32, .slot_cnt = 128, }, + // this size class is used by allocations that are made by the addr_tag_map, which breaks + // tag distinctess checks + // 
{ .size = 48, .slot_cnt = 85, }, + { .size = 64, .slot_cnt = 64, }, + { .size = 80, .slot_cnt = 51, }, + { .size = 96, .slot_cnt = 42, }, + { .size = 112, .slot_cnt = 36, }, + { .size = 128, .slot_cnt = 64, }, + { .size = 160, .slot_cnt = 51, }, + { .size = 192, .slot_cnt = 64, }, + { .size = 224, .slot_cnt = 54, }, + { .size = 10240, .slot_cnt = 6, }, + { .size = 20480, .slot_cnt = 1, }, + }; + + int tag_usage[max_tag + 1]; + + for (size_t sc_idx = 0; sc_idx < sizeof(size_classes) / sizeof(SizeClass); ++sc_idx) { + SizeClass &sc = size_classes[sc_idx]; + + const size_t full_alloc_size = sc.size; const size_t alloc_size = full_alloc_size - CANARY_SIZE; - unordered_map map; - map.reserve(alloc_cnt); + // "tdc" is short for "tag distinctness check" + int left_neighbor_tdc_cnt = 0; + int right_neighbor_tdc_cnt = 0; + int prev_alloc_tdc_cnt = 0; - for (size_t iter = 0; iter < iter_cnt; ++iter) { - uptr allocations[cnt]; + int iter_cnt = 600; - for (size_t i = 0; i < cnt; ++i) { + unordered_map addr_tag_map; + addr_tag_map.reserve(iter_cnt * sc.slot_cnt); + + u64 seen_tags = 0; + + for (int iter = 0; iter < iter_cnt; ++iter) { + uptr allocations[256]; // 256 is max slot count + + for (int i = 0; i < sc.slot_cnt; ++i) { u8 *p = (u8 *) malloc(alloc_size); + assert(p); uptr addr = (uptr) untag_pointer(p); u8 tag = get_pointer_tag(p); - assert(tag >= 1 && tag <= 14); + + assert(tag >= min_tag && tag <= max_tag); + seen_tags |= 1 << tag; + ++tag_usage[tag]; // check most recent tags of left and right neighbors - auto left = map.find(addr - full_alloc_size); - if (left != map.end()) { + auto left = addr_tag_map.find(addr - full_alloc_size); + if (left != addr_tag_map.end()) { assert(left->second != tag); + ++left_neighbor_tdc_cnt; } - auto right = map.find(addr + full_alloc_size); - if (right != map.end()) { + auto right = addr_tag_map.find(addr + full_alloc_size); + if (right != addr_tag_map.end()) { assert(right->second != tag); + ++right_neighbor_tdc_cnt; } // check previous tag of this slot - auto prev = map.find(addr); - if (prev != map.end()) { + auto prev = addr_tag_map.find(addr); + if (prev != addr_tag_map.end()) { assert(prev->second != tag); - map.erase(addr); + ++prev_alloc_tdc_cnt; + addr_tag_map.erase(addr); } - map.emplace(addr, tag); + addr_tag_map.emplace(addr, tag); for (size_t j = 0; j < alloc_size; ++j) { // check that slot is zeroed @@ -87,15 +131,52 @@ void tag_distinctness() { } allocations[i] = addr; - // async tag check failures are reported on context switch - do_context_switch(); } - for (size_t i = 0; i < cnt; ++i) { + // free some of allocations to allow their slots to be reused + for (int i = sc.slot_cnt - 1; i >= 0; i -= 2) { free((void *) allocations[i]); } } + + // check that all of the tags were used, except reserved ones + assert(seen_tags == (0xffff & ~(1 << 0 | 1 << 15))); + + printf("size_class\t%i\t" "tdc_left %i\t" "tdc_right %i\t" "tdc_prev_alloc %i\n", + sc.size, left_neighbor_tdc_cnt, right_neighbor_tdc_cnt, prev_alloc_tdc_cnt); + + // make sure tag distinctess checks were actually performed + int min_tdc_cnt = sc.slot_cnt * iter_cnt / 5; + + assert(prev_alloc_tdc_cnt > min_tdc_cnt); + + if (sc.slot_cnt > 1) { + assert(left_neighbor_tdc_cnt > min_tdc_cnt); + assert(right_neighbor_tdc_cnt > min_tdc_cnt); + } + + // async tag check failures are reported on context switch + do_context_switch(); } + + printf("\nTag use counters:\n"); + + int min = INT_MAX; + int max = 0; + double geomean = 0.0; + for (int i = min_tag; i <= max_tag; ++i) { + int v = tag_usage[i]; 
+ geomean += log(v); + min = std::min(min, v); + max = std::max(max, v); + printf("%i\t%i\n", i, tag_usage[i]); + } + int tag_cnt = 1 + max_tag - min_tag; + geomean = exp(geomean / tag_cnt); + + double max_deviation = std::max((double) max - geomean, geomean - min); + + printf("geomean: %.2f, max deviation from geomean: %.2f%%\n", geomean, (100.0 * max_deviation) / geomean); } u8* alloc_default() { From f16ef601d4149b7c2690c2d4961b08cec13517a4 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Sun, 29 Oct 2023 15:06:14 +0200 Subject: [PATCH 13/53] memtag_test: improve capturing of test results Using debuggerd + logcat parsing is unreliable and slow, print SEGV signal code to stderr instead. --- androidtest/memtag/memtag_test.cc | 13 ++++ .../src/grapheneos/hmalloc/MemtagTest.java | 71 ++++--------------- 2 files changed, 28 insertions(+), 56 deletions(-) diff --git a/androidtest/memtag/memtag_test.cc b/androidtest/memtag/memtag_test.cc index db48afa..e6ec68d 100644 --- a/androidtest/memtag/memtag_test.cc +++ b/androidtest/memtag/memtag_test.cc @@ -2,6 +2,7 @@ #undef NDEBUG #include #include +#include #include #include #include @@ -249,6 +250,11 @@ map> tests = { #undef TEST }; +void segv_handler(int, siginfo_t *si, void *) { + fprintf(stderr, "SEGV_CODE %i", si->si_code); + exit(139); // standard exit code for SIGSEGV +} + int main(int argc, char **argv) { setbuf(stdout, NULL); assert(argc == 2); @@ -259,6 +265,13 @@ int main(int argc, char **argv) { assert(mallopt(M_BIONIC_SET_HEAP_TAGGING_LEVEL, M_HEAP_TAGGING_LEVEL_ASYNC) == 1); + struct sigaction sa = { + .sa_sigaction = segv_handler, + .sa_flags = SA_SIGINFO, + }; + + assert(sigaction(SIGSEGV, &sa, nullptr) == 0); + test_fn(); do_context_switch(); diff --git a/androidtest/src/grapheneos/hmalloc/MemtagTest.java b/androidtest/src/grapheneos/hmalloc/MemtagTest.java index 5544128..8cb7a45 100644 --- a/androidtest/src/grapheneos/hmalloc/MemtagTest.java +++ b/androidtest/src/grapheneos/hmalloc/MemtagTest.java @@ -19,10 +19,19 @@ public class MemtagTest extends BaseHostJUnit4Test { private static final String TEST_BINARY = "/data/local/tmp/memtag_test"; enum Result { - SUCCESS, + SUCCESS(0, ""), // it's expected that the device is configured to use asymm MTE tag checking mode - ASYNC_MTE_ERROR, - SYNC_MTE_ERROR, + ASYNC_MTE_ERROR(139, "SEGV_CODE 8"), + SYNC_MTE_ERROR(139, "SEGV_CODE 9"), + ; + + public final int exitCode; + public final String stderr; + + Result(int exitCode, String stderr) { + this.exitCode = exitCode; + this.stderr = stderr; + } } private static final int SEGV_EXIT_CODE = 139; @@ -31,62 +40,12 @@ public class MemtagTest extends BaseHostJUnit4Test { var args = new ArrayList(); args.add(TEST_BINARY); args.add(name); - var device = getDevice(); - long deviceDate = device.getDeviceDate(); String cmdLine = String.join(" ", args); - var result = device.executeShellV2Command(cmdLine); - int expectedExitCode = expectedResult == Result.SUCCESS ? 
0 : SEGV_EXIT_CODE; + var result = getDevice().executeShellV2Command(cmdLine); - assertEquals("process exit code", expectedExitCode, result.getExitCode().intValue()); - - if (expectedResult == Result.SUCCESS) { - return; - } - - try { - // wait a bit for debuggerd to capture the crash - Thread.sleep(50); - } catch (InterruptedException e) { - throw new IllegalStateException(e); - } - - try (var logcat = device.getLogcatSince(deviceDate)) { - try (var s = logcat.createInputStream()) { - String[] lines = new String(s.readAllBytes()).split("\n"); - boolean foundCmd = false; - String cmd = "Cmdline: " + cmdLine; - String expectedSignalCode = switch (expectedResult) { - case ASYNC_MTE_ERROR -> "SEGV_MTEAERR"; - case SYNC_MTE_ERROR -> "SEGV_MTESERR"; - default -> throw new IllegalStateException(expectedResult.name()); - }; - for (String line : lines) { - if (!foundCmd) { - if (line.contains(cmd)) { - foundCmd = true; - } - continue; - } - - if (line.contains("signal 11 (SIGSEGV), code")) { - if (!line.contains(expectedSignalCode)) { - break; - } else { - return; - } - } - - if (line.contains("backtrace")) { - break; - } - } - - fail("missing " + expectedSignalCode + " crash in logcat"); - } catch (IOException e) { - throw new IllegalStateException(e); - } - } + assertEquals("process exit code", expectedResult.exitCode, result.getExitCode().intValue()); + assertEquals("stderr", expectedResult.stderr, result.getStderr()); } @Test From 7a6dbd81524e69fdc3a31c895946c13b64399c10 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Sun, 29 Oct 2023 15:36:37 +0200 Subject: [PATCH 14/53] mte: add comment about the reserved slab canary value --- h_malloc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/h_malloc.c b/h_malloc.c index 447b114..8c4d487 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -491,7 +491,11 @@ static void set_slab_canary_value(UNUSED struct slab_metadata *metadata, UNUSED metadata->canary_value = get_random_u64(rng) & canary_mask; #ifdef HAS_ARM_MTE if (unlikely(metadata->canary_value == 0)) { - metadata->canary_value = 0x100; + // 0 is reserved to support disabling MTE at runtime (this is required on Android). + // When MTE is enabled, writing and reading of canaries is disabled, i.e. canary remains zeroed. + // After MTE is disabled, canaries that are set to 0 are ignored, since they wouldn't match + // slab's metadata->canary_value. 
+ metadata->canary_value = 0x100; // 0x100 was chosen as the smallest acceptable value } #endif #endif From 03883eb2ced012a5fd024d156b428ef27bbbd552 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Mon, 30 Oct 2023 12:27:47 +0200 Subject: [PATCH 15/53] mte: rename arm_mte_store_tags_and_clear() to arm_mte_tag_and_clear_mem() --- arm_mte.h | 2 +- h_malloc.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arm_mte.h b/arm_mte.h index 28ff2c0..b5e6fcf 100644 --- a/arm_mte.h +++ b/arm_mte.h @@ -21,7 +21,7 @@ static inline void *arm_mte_create_random_tag(void *p, u64 exclusion_mask) { // // scudo is licensed under the Apache License v2.0 with LLVM Exceptions, which is compatible with // the hardened_malloc's MIT license -static inline void arm_mte_store_tags_and_clear(void *tagged_ptr, size_t len) { +static inline void arm_mte_tag_and_clear_mem(void *tagged_ptr, size_t len) { uintptr_t Begin = (uintptr_t) tagged_ptr; uintptr_t End = Begin + len; uintptr_t LineSize, Next, Tmp; diff --git a/h_malloc.c b/h_malloc.c index 8c4d487..aff5211 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -589,7 +589,7 @@ static void *tag_and_clear_slab_slot(struct slab_metadata *metadata, void *slot_ void *tagged_ptr = arm_mte_create_random_tag(slot_ptr, tem); // slot addresses and sizes are always aligned by 16 - arm_mte_store_tags_and_clear(tagged_ptr, slot_size); + arm_mte_tag_and_clear_mem(tagged_ptr, slot_size); // store new tag of this slot u4_arr_set(slot_tags, slot_idx + 1, get_pointer_tag(tagged_ptr)); @@ -798,7 +798,7 @@ static inline void deallocate_small(void *p, const size_t *expected_size) { bool skip_zero = false; #ifdef HAS_ARM_MTE if (likely(is_memtag_enabled())) { - arm_mte_store_tags_and_clear(set_pointer_tag(p, RESERVED_TAG), size); + arm_mte_tag_and_clear_mem(set_pointer_tag(p, RESERVED_TAG), size); // metadata->arm_mte_tags is intentionally not updated, it should keep the previous slot // tag after slot is freed skip_zero = true; From 009f2dad764fc5de8c56167d761528816cb48c9e Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Mon, 30 Oct 2023 12:30:28 +0200 Subject: [PATCH 16/53] mte: note alignment requirements of arm_mte_tag_and_clear_mem() --- arm_mte.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arm_mte.h b/arm_mte.h index b5e6fcf..ea3445e 100644 --- a/arm_mte.h +++ b/arm_mte.h @@ -12,6 +12,8 @@ static inline void *arm_mte_create_random_tag(void *p, u64 exclusion_mask) { // Tag the memory region with the tag specified in tag bits of tagged_ptr. Memory region itself is // zeroed. +// tagged_ptr has to be aligned by 16, and len has to be a multiple of 16 (tag granule size). +// // Arm's software optimization guide says: // "it is recommended to use STZGM (or DCZGVA) to set tag if data is not a concern." (STZGM and // DCGZVA are zeroing variants of tagging instructions). From b560431c017dff5fc5901197aaeb3e818ac842c1 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Mon, 30 Oct 2023 12:31:12 +0200 Subject: [PATCH 17/53] mte: note why 0 tag is excluded --- h_malloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/h_malloc.c b/h_malloc.c index aff5211..5fa2863 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -577,7 +577,8 @@ static void *tag_and_clear_slab_slot(struct slab_metadata *metadata, void *slot_ // is constructed. u8 *slot_tags = metadata->arm_mte_tags; - // Tag exclusion mask + // Tag exclusion mask. 0 tag is always excluded to detect accesses to slab memory via untagged + // pointers. 
Moreover, 0 tag is excluded in bionic via PR_MTE_TAG_MASK prctl u64 tem = (1 << 0) | (1 << RESERVED_TAG); // current or previous tag of left neighbor or 0 if there's no left neighbor or if it was never used From c75cb4c3f3182d5ad0cffe9622560d7aad6f2598 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Mon, 30 Oct 2023 12:34:41 +0200 Subject: [PATCH 18/53] mte: refactor tag_and_clear_slab_slot() Explicitly call is_memtag_enabled() before calling tag_and_clear_slab_slot() to make it clearer that memory is not zeroed when MTE is disabled. --- h_malloc.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/h_malloc.c b/h_malloc.c index 5fa2863..611d5cf 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -562,12 +562,8 @@ static inline void stats_slab_deallocate(UNUSED struct size_class *c, UNUSED siz #endif } -static void *tag_and_clear_slab_slot(struct slab_metadata *metadata, void *slot_ptr, size_t slot_idx, size_t slot_size) { #ifdef HAS_ARM_MTE - if (unlikely(!is_memtag_enabled())) { - return slot_ptr; - } - +static void *tag_and_clear_slab_slot(struct slab_metadata *metadata, void *slot_ptr, size_t slot_idx, size_t slot_size) { // arm_mte_tags is an array of 4-bit unsigned integers stored as u8 array (MTE tags are 4-bit wide) // // It stores the most recent tag for each slab slot, or 0 if the slot was never used. @@ -596,13 +592,8 @@ static void *tag_and_clear_slab_slot(struct slab_metadata *metadata, void *slot_ u4_arr_set(slot_tags, slot_idx + 1, get_pointer_tag(tagged_ptr)); return tagged_ptr; -#else - (void) metadata; - (void) slot_idx; - (void) slot_size; - return slot_ptr; -#endif } +#endif static inline void *allocate_small(unsigned arena, size_t requested_size) { struct size_info info = get_size_info(requested_size); @@ -632,7 +623,11 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { if (requested_size) { write_after_free_check(p, size - canary_size); set_canary(metadata, p, size); - p = tag_and_clear_slab_slot(metadata, p, slot, size); +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + p = tag_and_clear_slab_slot(metadata, p, slot, size); + } +#endif } stats_small_allocate(c, size); @@ -665,7 +660,11 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { void *p = slot_pointer(size, slab, slot); if (requested_size) { set_canary(metadata, p, size); - p = tag_and_clear_slab_slot(metadata, p, slot, size); +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + p = tag_and_clear_slab_slot(metadata, p, slot, size); + } +#endif } stats_slab_allocate(c, slab_size); stats_small_allocate(c, size); @@ -688,7 +687,11 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { void *p = slot_pointer(size, slab, slot); if (requested_size) { set_canary(metadata, p, size); - p = tag_and_clear_slab_slot(metadata, p, slot, size); +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + p = tag_and_clear_slab_slot(metadata, p, slot, size); + } +#endif } stats_slab_allocate(c, slab_size); stats_small_allocate(c, size); @@ -713,7 +716,11 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { if (requested_size) { write_after_free_check(p, size - canary_size); set_canary(metadata, p, size); - p = tag_and_clear_slab_slot(metadata, p, slot, size); +#ifdef HAS_ARM_MTE + if (likely(is_memtag_enabled())) { + p = tag_and_clear_slab_slot(metadata, p, slot, size); + } +#endif } stats_small_allocate(c, size); From 25f0fe9c69f6b0bc0f5f428c2ead2358cdcf82b2 
Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Mon, 30 Oct 2023 12:35:24 +0200 Subject: [PATCH 19/53] remove an always-true sizeof(u8) assert --- util.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/util.h b/util.h index 9b5abe6..fc22c23 100644 --- a/util.h +++ b/util.h @@ -1,7 +1,6 @@ #ifndef UTIL_H #define UTIL_H -#include #include #include #include @@ -59,7 +58,6 @@ static inline size_t align(size_t size, size_t align) { } // u4_arr_{set,get} are helper functions for using u8 array as an array of unsigned 4-bit values. -static_assert(sizeof(u8) == 1, "unexpected u8 size"); // val is treated as a 4-bit value static inline void u4_arr_set(u8 *arr, size_t idx, u8 val) { From be08eeee2dbbfd85db0b79125a34bd27f265b898 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Mon, 30 Oct 2023 12:45:45 +0200 Subject: [PATCH 20/53] mte: update comment about skipped tag array update in deallocate_small() --- h_malloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/h_malloc.c b/h_malloc.c index 611d5cf..8ccb3b7 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -807,8 +807,7 @@ static inline void deallocate_small(void *p, const size_t *expected_size) { #ifdef HAS_ARM_MTE if (likely(is_memtag_enabled())) { arm_mte_tag_and_clear_mem(set_pointer_tag(p, RESERVED_TAG), size); - // metadata->arm_mte_tags is intentionally not updated, it should keep the previous slot - // tag after slot is freed + // metadata->arm_mte_tags is intentionally not updated, see tag_and_clear_slab_slot() skip_zero = true; } #endif From 72dc236d5f9834f7129674ea0541f0e98c6b7748 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Mon, 30 Oct 2023 12:52:09 +0200 Subject: [PATCH 21/53] mte: add untag_pointer() variant for const pointers --- h_malloc.c | 2 +- memtag.h | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/h_malloc.c b/h_malloc.c index 8ccb3b7..ffcf0e4 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -1762,7 +1762,7 @@ EXPORT size_t h_malloc_usable_size(H_MALLOC_USABLE_SIZE_CONST void *arg) { return 0; } - void *p = untag_pointer((void *) (uintptr_t) arg); + const void *p = untag_const_pointer(arg); if (p < get_slab_region_end() && p >= ro.slab_region_start) { thread_unseal_metadata(); diff --git a/memtag.h b/memtag.h index 89bff75..0ba4cbc 100644 --- a/memtag.h +++ b/memtag.h @@ -19,6 +19,15 @@ static inline void *untag_pointer(void *ptr) { #endif } +static inline const void *untag_const_pointer(const void *ptr) { +#ifdef HAS_ARM_MTE + const uintptr_t mask = UINTPTR_MAX >> 8; + return (const void *) ((uintptr_t) ptr & mask); +#else + return ptr; +#endif +} + static inline void *set_pointer_tag(void *ptr, u8 tag) { #ifdef HAS_ARM_MTE return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr)); From fd75fc1ba877c27cbfb67501a72c35e84a23efe7 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Mon, 30 Oct 2023 20:08:47 +0200 Subject: [PATCH 22/53] mte: add scudo to CREDITS file --- CREDITS | 227 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 227 insertions(+) diff --git a/CREDITS b/CREDITS index 3ad8617..31b6875 100644 --- a/CREDITS +++ b/CREDITS @@ -54,3 +54,230 @@ libdivide: random.c get_random_{type}_uniform functions are based on Fast Random Integer Generation in an Interval by Daniel Lemire + +arm_mte.h arm_mte_tag_and_clear_mem function contents were copied from storeTags function in scudo: + + ============================================================================== + The LLVM Project is under the Apache License v2.0 with LLVM 
Exceptions: + ============================================================================== + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + ---- LLVM Exceptions to the Apache 2.0 License ---- + + As an exception, if, as a result of your compiling your source code, portions + of this Software are embedded into an Object form of such source code, you + may redistribute such embedded portions in such Object form without complying + with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + + In addition, if you combine or link compiled forms of this Software with + software that is licensed under the GPLv2 ("Combined Software") and if a + court of competent jurisdiction determines that the patent provision (Section + 3), the indemnity provision (Section 9) or other Section of the License + conflicts with the conditions of the GPLv2, you may retroactively and + prospectively choose to deem waived or otherwise exclude such Section(s) of + the License, but only in their entirety and only with respect to the Combined + Software. + + ============================================================================== From f793a3edf676f0e72c7f21de4d89aa14b247a61a Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Mon, 30 Oct 2023 14:23:07 -0400 Subject: [PATCH 23/53] update README now that MTE is implemented --- README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b3f820f..8962037 100644 --- a/README.md +++ b/README.md @@ -470,16 +470,16 @@ was a bit less important and if a core goal was finding latent bugs. * Errors other than ENOMEM from mmap, munmap, mprotect and mremap treated as fatal, which can help to detect memory management gone wrong elsewhere in the process. -* [future] Memory tagging for slab allocations via MTE on ARMv8.5+ +* Memory tagging for slab allocations via MTE on ARMv8.5+ * random memory tags as the baseline, providing probabilistic protection against various forms of memory corruption * dedicated tag for free slots, set on free, for deterministic protection against accessing freed memory - * store previous random tag within freed slab allocations, and increment it - to get the next tag for that slot to provide deterministic use-after-free - detection through multiple cycles of memory reuse * guarantee distinct tags for adjacent memory allocations by incrementing past matching values for deterministic detection of linear overflows + * [future] store previous random tag and increment it to get the next tag + for that slot to provide deterministic use-after-free detection through + multiple cycles of memory reuse ## Randomness @@ -721,6 +721,9 @@ freeing as there would be if the kernel supported these features directly. ## Memory tagging +**Memory tagging has been implemented and this section is currently +out-of-date.** + Integrating extensive support for ARMv8.5 memory tagging is planned and this section will be expanded to cover the details on the chosen design. 
The approach for slab allocations is currently covered, but it can also be used for the From 88b3c1acf9cfc8dd6957671d5c3ab4bcbf53c429 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Wed, 1 Nov 2023 21:33:54 +0200 Subject: [PATCH 24/53] memtag_test: fix sporadic failures of overflow/underflow tests --- androidtest/memtag/memtag_test.cc | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/androidtest/memtag/memtag_test.cc b/androidtest/memtag/memtag_test.cc index e6ec68d..ca491d8 100644 --- a/androidtest/memtag/memtag_test.cc +++ b/androidtest/memtag/memtag_test.cc @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -181,9 +182,26 @@ void tag_distinctness() { } u8* alloc_default() { - u8 *p = (u8 *) malloc(DEFAULT_ALLOC_SIZE); - assert(p); - return p; + const size_t full_alloc_size = DEFAULT_ALLOC_SIZE + CANARY_SIZE; + set addrs; + + // make sure allocation has both left and right neighbors, otherwise overflow/underflow tests + // will fail when allocation is at the end/beginning of slab + for (;;) { + u8 *p = (u8 *) malloc(DEFAULT_ALLOC_SIZE); + assert(p); + + uptr addr = (uptr) untag_pointer(p); + uptr left = addr - full_alloc_size; + if (addrs.find(left) != addrs.end()) { + uptr right = addr + full_alloc_size; + if (addrs.find(right) != addrs.end()) { + return p; + } + } + + addrs.emplace(addr); + } } volatile u8 u8_var; From 352c083f6527fa757567f0d3fa0b03953c03c3c3 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Sun, 5 Nov 2023 18:10:27 +0100 Subject: [PATCH 25/53] Run the testsuite on multiple compiler versions --- .github/workflows/build-and-test.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 8470947..82496af 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -9,14 +9,28 @@ on: jobs: build-ubuntu-gcc: runs-on: ubuntu-latest + strategy: + matrix: + version: [12] steps: - uses: actions/checkout@v4 + - name: Setting up gcc version + run: | + sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${{ matrix.version }} 100 + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${{ matrix.version }} 100 - name: Build run: make test build-ubuntu-clang: runs-on: ubuntu-latest + strategy: + matrix: + version: [14, 15] steps: - uses: actions/checkout@v4 + - name: Setting up clang version + run: | + sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{ matrix.version }} 100 + sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-${{ matrix.version }} 100 - name: Build run: CC=clang CXX=clang++ make test build-musl: From 4171bd164e2ec4cf2546daa2b0f6f95af0d782df Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Wed, 8 Nov 2023 14:21:04 -0500 Subject: [PATCH 26/53] use safe_flag for -fstack-clash-protection --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f33f88e..574a088 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ endef CPPFLAGS := $(CPPFLAGS) -D_GNU_SOURCE -I include SHARED_FLAGS := -pipe -O3 -flto -fPIC -fvisibility=hidden -fno-plt \ - -fstack-clash-protection $(call safe_flag,-fcf-protection) -fstack-protector-strong \ + $(call safe_flag,-fstack-clash-protection) $(call safe_flag,-fcf-protection) -fstack-protector-strong \ -Wall -Wextra $(call safe_flag,-Wcast-align=strict,-Wcast-align) -Wcast-qual -Wwrite-strings \ -Wundef From 
3c274731ba8dfed837cfcb03037e749312f06e0e Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Tue, 14 Nov 2023 16:19:20 -0500 Subject: [PATCH 27/53] Revert "use safe_flag for -fstack-clash-protection" This reverts commit 4171bd164e2ec4cf2546daa2b0f6f95af0d782df. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 574a088..f33f88e 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ endef CPPFLAGS := $(CPPFLAGS) -D_GNU_SOURCE -I include SHARED_FLAGS := -pipe -O3 -flto -fPIC -fvisibility=hidden -fno-plt \ - $(call safe_flag,-fstack-clash-protection) $(call safe_flag,-fcf-protection) -fstack-protector-strong \ + -fstack-clash-protection $(call safe_flag,-fcf-protection) -fstack-protector-strong \ -Wall -Wextra $(call safe_flag,-Wcast-align=strict,-Wcast-align) -Wcast-qual -Wwrite-strings \ -Wundef From 61821b02c8615bc9f359497686b3ac952ddb56f3 Mon Sep 17 00:00:00 2001 From: jvoisin Date: Thu, 16 Nov 2023 17:49:11 +0100 Subject: [PATCH 28/53] Clarify a bit why a particular magic number was chosen --- h_malloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/h_malloc.c b/h_malloc.c index ffcf0e4..014b461 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -495,7 +495,8 @@ static void set_slab_canary_value(UNUSED struct slab_metadata *metadata, UNUSED // When MTE is enabled, writing and reading of canaries is disabled, i.e. canary remains zeroed. // After MTE is disabled, canaries that are set to 0 are ignored, since they wouldn't match // slab's metadata->canary_value. - metadata->canary_value = 0x100; // 0x100 was chosen as the smallest acceptable value + // 0x100 was chosen arbitrarily, and can be encoded as an immediate value on ARM by the compiler. + metadata->canary_value = 0x100; } #endif #endif From 7093fdc4822aec123c51f663811ffe5e96b65213 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Thu, 14 Dec 2023 10:36:59 +0100 Subject: [PATCH 29/53] README: add note about AppArmor constraint on Debian --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 8962037..9029924 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,9 @@ line to the `/etc/ld.so.preload` configuration file: The format of this configuration file is a whitespace-separated list, so it's good practice to put each library on a separate line. +On Debian systems `libhardened_malloc.so` should be installed into `/usr/lib/` +to avoid preload failures caused by AppArmor profile restrictions. + Using the `LD_PRELOAD` environment variable to load it on a case-by-case basis will not work when `AT_SECURE` is set such as with setuid binaries. It's also generally not a recommended approach for production usage. The recommendation From 365ee6900d2ae0cc9ad9e03d48f6747f58deb69d Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Sun, 31 Dec 2023 15:37:32 +0200 Subject: [PATCH 30/53] android: restore the default SIGABRT handler in fatal_error() async_safe_fatal() calls abort() at the end, which can be intercepted by a custom SIGABRT handler. In particular, crashlytics installs such a handler and tries to fork() after catching SIGABRT. hardened_malloc uses pthread_atfork() to register fork handlers. These handlers try to lock internal hardened_malloc mutexes. If at least one of those mutexes is already locked, which is usually the case, thread that called fatai_error() gets deadlocked, while the other threads (if there are any) continue to run. 
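As background for this fix: the failure mode described above can be reproduced outside of the allocator. The following is a minimal, self-contained C sketch (illustrative only, not part of the patch; the handler and mutex names are hypothetical) of how fork() from a SIGABRT handler deadlocks once a pthread_atfork() prepare handler tries to take a lock the aborting thread already holds:

    #include <pthread.h>
    #include <signal.h>
    #include <stdlib.h>
    #include <unistd.h>

    static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;

    // stand-ins for the allocator's fork handlers registered via pthread_atfork()
    static void prefork(void)  { pthread_mutex_lock(&alloc_lock); }
    static void postfork(void) { pthread_mutex_unlock(&alloc_lock); }

    // stand-in for a crash reporter's SIGABRT handler that forks a helper process
    static void sigabrt_handler(int sig) {
        (void) sig;
        fork(); // runs prefork() first, which blocks forever on alloc_lock
    }

    int main(void) {
        pthread_atfork(prefork, postfork, postfork);
        signal(SIGABRT, sigabrt_handler);

        pthread_mutex_lock(&alloc_lock); // simulate reaching fatal_error() with an internal mutex held
        abort();                         // handler intercepts SIGABRT and calls fork() -> deadlock
    }

Restoring the default SIGABRT handler before async_safe_fatal() runs, as the change below does on Android, prevents the custom handler (and therefore the fork) from being reached.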
--- util.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/util.c b/util.c index a3d6f0c..a43679c 100644 --- a/util.c +++ b/util.c @@ -6,6 +6,8 @@ #ifdef __ANDROID__ #include +int mallopt(int param, int value); +#define M_BIONIC_RESTORE_DEFAULT_SIGABRT_HANDLER (-1003) #endif #include "util.h" @@ -30,6 +32,7 @@ static int write_full(int fd, const char *buf, size_t length) { COLD noreturn void fatal_error(const char *s) { #ifdef __ANDROID__ + mallopt(M_BIONIC_RESTORE_DEFAULT_SIGABRT_HANDLER, 0); async_safe_fatal("hardened_malloc: fatal allocator error: %s", s); #else const char *prefix = "fatal allocator error: "; From abe54dba274b2f6b668f3fae063903068b695d89 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Wed, 3 Jan 2024 11:55:30 -0500 Subject: [PATCH 31/53] update memory tagging documentation --- README.md | 100 ++++++++++++++++-------------------------------------- 1 file changed, 29 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 9029924..6fb3c82 100644 --- a/README.md +++ b/README.md @@ -724,80 +724,38 @@ freeing as there would be if the kernel supported these features directly. ## Memory tagging -**Memory tagging has been implemented and this section is currently -out-of-date.** +Random tags are set for all slab allocations when allocated. 5 possible values +are excluded: the default 0 tag, a statically reserved free tag, the previous +tag used for the slot, the current (or previous) tag used for the slot to the +left and the current (or previous) tag used for the slot to the right. 3 of +these are dynamic random values. When a slab allocation is freed, the reserved +free tag is set for the slot. Linear overflows are deterministically detected. +Use-after-free has deterministic detection until the freed slot goes through +both the random and FIFO quarantines, gets allocated again, goes through both +quarantines again and then finally gets allocated again for a 2nd time. Since +the default 0 tag isn't used, untagged memory can't access malloc allocations +and vice versa, although it may make sense to reuse the default tag for free +data to avoid reducing the possible random tags from 15 to 14, since freed +data is always zeroed anyway. -Integrating extensive support for ARMv8.5 memory tagging is planned and this -section will be expanded to cover the details on the chosen design. The approach -for slab allocations is currently covered, but it can also be used for the -allocator metadata region and large allocations. +Slab allocations are done in a statically reserved region for each size class +and all metadata is in a statically reserved region, so interactions between +different uses of the same address space is not applicable. -Memory allocations are already always multiples of naturally aligned 16 byte -units, so memory tags are a natural fit into a malloc implementation due to the -16 byte alignment requirement. The only extra memory consumption will come from -the hardware supported storage for the tag values (4 bits per 16 bytes). +Large allocations beyond the largest slab allocation size class (128k by +default) are guaranteed to have randomly sized guard regions to the left and +right. Random and FIFO address space quarantines provide use-after-free +detection. Random tags would still be useful for probabilistic detection of +overflows, probabilistic detection of use-after-free once the address space is +out of the quarantine and reused for another allocation and deterministic +detection of use-after-free for reuse by another allocator. 
We need to test +whether the cost is acceptable for enabling this by default. -The baseline policy will be to generate random tags for each slab allocation -slot on first use. The highest value will be reserved for marking freed memory -allocations to detect any accesses to freed memory so it won't be part of the -generated range. Adjacent slots will be guaranteed to have distinct memory tags -in order to guarantee that linear overflows are detected. There are a few ways -of implementing this and it will end up depending on the performance costs of -different approaches. If there's an efficient way to fetch the adjacent tag -values without wasting extra memory, it will be possible to check for them and -skip them either by generating a new random value in a loop or incrementing -past them since the tiny bit of bias wouldn't matter. Another approach would be -alternating odd and even tag values but that would substantially reduce the -overall randomness of the tags and there's very little entropy from the start. - -Once a slab allocation has been freed, the tag will be set to the reserved -value for free memory and the previous tag value will be stored inside the -allocation itself. The next time the slot is allocated, the chosen tag value -will be the previous value incremented by one to provide use-after-free -detection between generations of allocations. The stored tag will be wiped -before retagging the memory, to avoid leaking it and as part of preserving the -security property of newly allocated memory being zeroed due to zero-on-free. -It will eventually wrap all the way around, but this ends up providing a strong -guarantee for many allocation cycles due to the combination of 4 bit tags with -the FIFO quarantine feature providing delayed free. It also benefits from -random slot allocation and the randomized portion of delayed free, which result -in a further delay along with preventing a deterministic bypass by forcing a -reuse after a certain number of allocation cycles. Similarly to the initial tag -generation, tag values for adjacent allocations will be skipped by incrementing -past them. - -For example, consider this slab of allocations that are not yet used with 15 -representing the tag for free memory. For the sake of simplicity, there will be -no quarantine or other slabs for this example: - - | 15 | 15 | 15 | 15 | 15 | 15 | - -Three slots are randomly chosen for allocations, with random tags assigned (2, -7, 14) since these slots haven't ever been used and don't have saved values: - - | 15 | 2 | 15 | 7 | 14 | 15 | - -The 2nd allocation slot is freed, and is set back to the tag for free memory -(15), but with the previous tag value stored in the freed space: - - | 15 | 15 | 15 | 7 | 14 | 15 | - -The first slot is allocated for the first time, receiving the random value 3: - - | 3 | 15 | 15 | 7 | 14 | 15 | - -The 2nd slot is randomly chosen again, so the previous tag (2) is retrieved and -incremented to 3 as part of the use-after-free mitigation. An adjacent -allocation already uses the tag 3, so the tag is further incremented to 4 (it -would be incremented to 5 if one of the adjacent tags was 4): - - | 3 | 4 | 15 | 7 | 14 | 15 | - -The last slot is randomly chosen for the next allocation, and is assigned the -random value 14. 
However, it's placed next to an allocation with the tag 14 so -the tag is incremented and wraps around to 0: - - | 3 | 4 | 15 | 7 | 14 | 0 | +When memory tagging is enabled, checking for write-after-free at allocation +time and checking canaries are both disabled. Canaries will be more thoroughly +disabled when using memory tagging in the future, but Android currently has +very dynamic memory tagging support where it can be enabled/disabled at any +time which creates a barrier to optimizing by disabling redundant features. ## API extensions From 53a45b4661b858e62dff476c5903b14b9ce1efa5 Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Wed, 3 Jan 2024 19:39:24 +0100 Subject: [PATCH 32/53] Improve a bit the formulation of the MTE documentation --- README.md | 50 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 6fb3c82..28349b8 100644 --- a/README.md +++ b/README.md @@ -724,19 +724,26 @@ freeing as there would be if the kernel supported these features directly. ## Memory tagging -Random tags are set for all slab allocations when allocated. 5 possible values -are excluded: the default 0 tag, a statically reserved free tag, the previous -tag used for the slot, the current (or previous) tag used for the slot to the -left and the current (or previous) tag used for the slot to the right. 3 of -these are dynamic random values. When a slab allocation is freed, the reserved -free tag is set for the slot. Linear overflows are deterministically detected. -Use-after-free has deterministic detection until the freed slot goes through -both the random and FIFO quarantines, gets allocated again, goes through both -quarantines again and then finally gets allocated again for a 2nd time. Since -the default 0 tag isn't used, untagged memory can't access malloc allocations -and vice versa, although it may make sense to reuse the default tag for free -data to avoid reducing the possible random tags from 15 to 14, since freed -data is always zeroed anyway. +Random tags are set for all slab allocations when allocated, with 5 excluded values: + +1. the default `0` tag +2. a statically *reserved free tag* +3. the previous tag used for the slot +4. the current (or previous) tag used for the slot to the left +5. the current (or previous) tag used for the slot to the right + +When a slab allocation is freed, the *reserved free tag* is set for the slot. + +This ensures the following properties: + +- Linear overflows are deterministically detected. +- Use-after-free are deterministically detected until the freed slot goes through + both the random and FIFO quarantines, gets allocated again, goes through both + quarantines again and then finally gets allocated again for a 2nd time. + Since the default `0` tag isn't used, untagged memory can't access malloc allocations + and vice versa, although it may make sense to reuse the default tag for free + data to avoid reducing the possible random tags from 15 to 14, since freed + data is always zeroed anyway. Slab allocations are done in a statically reserved region for each size class and all metadata is in a statically reserved region, so interactions between @@ -745,17 +752,20 @@ different uses of the same address space is not applicable. Large allocations beyond the largest slab allocation size class (128k by default) are guaranteed to have randomly sized guard regions to the left and right. Random and FIFO address space quarantines provide use-after-free -detection. 
Random tags would still be useful for probabilistic detection of -overflows, probabilistic detection of use-after-free once the address space is -out of the quarantine and reused for another allocation and deterministic -detection of use-after-free for reuse by another allocator. We need to test -whether the cost is acceptable for enabling this by default. +detection. We need to test whether the cost of random tags is acceptable to enabled them by default, +since they would be useful for: + +- probabilistic detection of overflows +- probabilistic detection of use-after-free once the address space is + out of the quarantine and reused for another allocation +- deterministic detection of use-after-free for reuse by another allocator. When memory tagging is enabled, checking for write-after-free at allocation time and checking canaries are both disabled. Canaries will be more thoroughly disabled when using memory tagging in the future, but Android currently has -very dynamic memory tagging support where it can be enabled/disabled at any -time which creates a barrier to optimizing by disabling redundant features. +[very dynamic memory tagging support](https://source.android.com/docs/security/test/memory-safety/arm-mte) +where it can be enabled/disabled at any time which creates a barrier to +optimizing by disabling redundant features. ## API extensions From a3bf742c3e6972f59a0e99eb706d6d7b239d6fe7 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Wed, 3 Jan 2024 14:44:08 -0500 Subject: [PATCH 33/53] remove trailing whitespace --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 28349b8..fe5c5bf 100644 --- a/README.md +++ b/README.md @@ -763,7 +763,7 @@ since they would be useful for: When memory tagging is enabled, checking for write-after-free at allocation time and checking canaries are both disabled. Canaries will be more thoroughly disabled when using memory tagging in the future, but Android currently has -[very dynamic memory tagging support](https://source.android.com/docs/security/test/memory-safety/arm-mte) +[very dynamic memory tagging support](https://source.android.com/docs/security/test/memory-safety/arm-mte) where it can be enabled/disabled at any time which creates a barrier to optimizing by disabling redundant features. 
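To make the tag selection rules described above concrete, here is a rough software model of the exclusion logic (illustrative only: the allocator builds a tag exclusion mask and lets the hardware's random tag generation apply it, and the rand()-based helper below is purely hypothetical). It picks a random 4-bit tag while excluding the default tag, the reserved free-slot tag, the slot's previous tag and the current (or previous) tags of both neighbors:

    #include <stdint.h>
    #include <stdlib.h>

    // prev, left and right are 4-bit tag values, 0 if the slot was never used
    static uint8_t pick_slot_tag(uint8_t prev, uint8_t left, uint8_t right) {
        uint16_t excluded = (1u << 0)      // default tag, never used for live allocations
                          | (1u << 15)     // reserved free-slot tag (15 at this point in the series)
                          | (1u << prev)   // previous tag of this slot
                          | (1u << left)   // current (or previous) tag of the left neighbor
                          | (1u << right); // current (or previous) tag of the right neighbor
        uint8_t tag;
        do {
            tag = (uint8_t) (rand() & 0xf); // stand-in for the hardware's random tag generation
        } while (excluded & (1u << tag));
        return tag;
    }

At most 5 of the 16 tag values are excluded, so at least 11 candidates always remain and the loop terminates quickly; the hardware performs the equivalent exclusion in a single instruction.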
From 4756716904e3342e9c7747266f3876f1c3c04127 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Tue, 23 Jan 2024 12:34:52 +0200 Subject: [PATCH 34/53] memtag_test: move SEGV code checks to device-side binary --- androidtest/memtag/memtag_test.cc | 43 +++++++++++------- .../src/grapheneos/hmalloc/MemtagTest.java | 45 +++++-------------- 2 files changed, 40 insertions(+), 48 deletions(-) diff --git a/androidtest/memtag/memtag_test.cc b/androidtest/memtag/memtag_test.cc index ca491d8..8b6a784 100644 --- a/androidtest/memtag/memtag_test.cc +++ b/androidtest/memtag/memtag_test.cc @@ -204,54 +204,63 @@ u8* alloc_default() { } } -volatile u8 u8_var; +int expected_segv_code; + +#define expect_segv(exp, segv_code) ({\ + expected_segv_code = segv_code; \ + volatile auto val = exp; \ + (void) val; \ + do_context_switch(); \ + fprintf(stderr, "didn't receive SEGV code %i", segv_code); \ + exit(1); }) + +// it's expected that the device is configured to use asymm MTE tag checking mode (sync read checks, +// async write checks) +#define expect_read_segv(exp) expect_segv(exp, SEGV_MTESERR) +#define expect_write_segv(exp) expect_segv(exp, SEGV_MTEAERR) void read_after_free() { u8 *p = alloc_default(); free(p); - volatile u8 v = p[0]; - (void) v; + expect_read_segv(p[0]); } void write_after_free() { u8 *p = alloc_default(); free(p); - p[0] = 1; + expect_write_segv(p[0] = 1); } void underflow_read() { u8 *p = alloc_default(); - volatile u8 v = p[-1]; - (void) v; + expect_read_segv(p[-1]); } void underflow_write() { u8 *p = alloc_default(); - p[-1] = 1; + expect_write_segv(p[-1] = 1); } void overflow_read() { u8 *p = alloc_default(); - volatile u8 v = p[DEFAULT_ALLOC_SIZE + CANARY_SIZE]; - (void) v; + expect_read_segv(p[DEFAULT_ALLOC_SIZE + CANARY_SIZE]); } void overflow_write() { u8 *p = alloc_default(); - p[DEFAULT_ALLOC_SIZE + CANARY_SIZE] = 1; + expect_write_segv(p[DEFAULT_ALLOC_SIZE + CANARY_SIZE] = 1); } void untagged_read() { u8 *p = alloc_default(); p = (u8 *) untag_pointer(p); - volatile u8 v = p[0]; - (void) v; + expect_read_segv(p[0]); } void untagged_write() { u8 *p = alloc_default(); p = (u8 *) untag_pointer(p); - p[0] = 1; + expect_write_segv(p[0] = 1); } map> tests = { @@ -269,8 +278,12 @@ map> tests = { }; void segv_handler(int, siginfo_t *si, void *) { - fprintf(stderr, "SEGV_CODE %i", si->si_code); - exit(139); // standard exit code for SIGSEGV + if (expected_segv_code == 0 || expected_segv_code != si->si_code) { + fprintf(stderr, "received unexpected SEGV_CODE %i", si->si_code); + exit(139); // standard exit code for SIGSEGV + } + + exit(0); } int main(int argc, char **argv) { diff --git a/androidtest/src/grapheneos/hmalloc/MemtagTest.java b/androidtest/src/grapheneos/hmalloc/MemtagTest.java index 8cb7a45..455e0d0 100644 --- a/androidtest/src/grapheneos/hmalloc/MemtagTest.java +++ b/androidtest/src/grapheneos/hmalloc/MemtagTest.java @@ -7,36 +7,15 @@ import com.android.tradefed.testtype.junit4.BaseHostJUnit4Test; import org.junit.Test; import org.junit.runner.RunWith; -import java.io.IOException; import java.util.ArrayList; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; @RunWith(DeviceJUnit4ClassRunner.class) public class MemtagTest extends BaseHostJUnit4Test { - private static final String TEST_BINARY = "/data/local/tmp/memtag_test"; - enum Result { - SUCCESS(0, ""), - // it's expected that the device is configured to use asymm MTE tag checking mode - ASYNC_MTE_ERROR(139, "SEGV_CODE 8"), - SYNC_MTE_ERROR(139, "SEGV_CODE 9"), - ; - - public final int 
exitCode; - public final String stderr; - - Result(int exitCode, String stderr) { - this.exitCode = exitCode; - this.stderr = stderr; - } - } - - private static final int SEGV_EXIT_CODE = 139; - - private void runTest(String name, Result expectedResult) throws DeviceNotAvailableException { + private void runTest(String name) throws DeviceNotAvailableException { var args = new ArrayList(); args.add(TEST_BINARY); args.add(name); @@ -44,52 +23,52 @@ public class MemtagTest extends BaseHostJUnit4Test { var result = getDevice().executeShellV2Command(cmdLine); - assertEquals("process exit code", expectedResult.exitCode, result.getExitCode().intValue()); - assertEquals("stderr", expectedResult.stderr, result.getStderr()); + assertEquals("stderr", "", result.getStderr()); + assertEquals("process exit code", 0, result.getExitCode().intValue()); } @Test public void tag_distinctness() throws DeviceNotAvailableException { - runTest("tag_distinctness", Result.SUCCESS); + runTest("tag_distinctness"); } @Test public void read_after_free() throws DeviceNotAvailableException { - runTest("read_after_free", Result.SYNC_MTE_ERROR); + runTest("read_after_free"); } @Test public void write_after_free() throws DeviceNotAvailableException { - runTest("write_after_free", Result.ASYNC_MTE_ERROR); + runTest("write_after_free"); } @Test public void underflow_read() throws DeviceNotAvailableException { - runTest("underflow_read", Result.SYNC_MTE_ERROR); + runTest("underflow_read"); } @Test public void underflow_write() throws DeviceNotAvailableException { - runTest("underflow_write", Result.ASYNC_MTE_ERROR); + runTest("underflow_write"); } @Test public void overflow_read() throws DeviceNotAvailableException { - runTest("overflow_read", Result.SYNC_MTE_ERROR); + runTest("overflow_read"); } @Test public void overflow_write() throws DeviceNotAvailableException { - runTest("overflow_write", Result.ASYNC_MTE_ERROR); + runTest("overflow_write"); } @Test public void untagged_read() throws DeviceNotAvailableException { - runTest("untagged_read", Result.SYNC_MTE_ERROR); + runTest("untagged_read"); } @Test public void untagged_write() throws DeviceNotAvailableException { - runTest("untagged_write", Result.ASYNC_MTE_ERROR); + runTest("untagged_write"); } } From 7d2151e40c2b5b282f1ba75285c3493543a5ce9a Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Tue, 23 Jan 2024 13:32:49 +0200 Subject: [PATCH 35/53] mte: remove util.h dependency from arm_mte.h It's needed for including arm_mte.h into memtag_test.cc --- arm_mte.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arm_mte.h b/arm_mte.h index ea3445e..5ed900d 100644 --- a/arm_mte.h +++ b/arm_mte.h @@ -2,11 +2,11 @@ #define ARM_MTE_H #include -#include +#include // Returns a tagged pointer. 
// See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions/IRG--Insert-Random-Tag- -static inline void *arm_mte_create_random_tag(void *p, u64 exclusion_mask) { +static inline void *arm_mte_create_random_tag(void *p, uint64_t exclusion_mask) { return __arm_mte_create_random_tag(p, exclusion_mask); } From 5fbbdc2ef8972df8560891110463bc649ef14335 Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Tue, 23 Jan 2024 13:33:06 +0200 Subject: [PATCH 36/53] memtag_test: add test for MADV_DONTNEED behavior --- androidtest/memtag/Android.bp | 1 + androidtest/memtag/memtag_test.cc | 41 +++++++++++++++++++ .../src/grapheneos/hmalloc/MemtagTest.java | 5 +++ 3 files changed, 47 insertions(+) diff --git a/androidtest/memtag/Android.bp b/androidtest/memtag/Android.bp index 14ab691..75287f6 100644 --- a/androidtest/memtag/Android.bp +++ b/androidtest/memtag/Android.bp @@ -6,6 +6,7 @@ cc_test { "-Werror", "-Wextra", "-O0", + "-march=armv9-a+memtag", ], compile_multilib: "64", diff --git a/androidtest/memtag/memtag_test.cc b/androidtest/memtag/memtag_test.cc index 8b6a784..910ef5b 100644 --- a/androidtest/memtag/memtag_test.cc +++ b/androidtest/memtag/memtag_test.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -14,6 +15,8 @@ #include #include +#include "../../arm_mte.h" + using namespace std; using u8 = uint8_t; @@ -37,6 +40,10 @@ void *untag_pointer(void *ptr) { return (void *) ((uintptr_t) ptr & mask); } +void *set_pointer_tag(void *ptr, u8 tag) { + return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr)); +} + // This test checks that slab slot allocation uses tag that is distint from tags of its neighbors // and from the tag of the previous allocation that used the same slot void tag_distinctness() { @@ -263,6 +270,39 @@ void untagged_write() { expect_write_segv(p[0] = 1); } +// checks that each of memory locations inside the buffer is tagged with expected_tag +void check_tag(void *buf, size_t len, u8 expected_tag) { + for (size_t i = 0; i < len; ++i) { + assert(get_pointer_tag(__arm_mte_get_tag((void *) ((uintptr_t) buf + i))) == expected_tag); + } +} + +void madvise_dontneed() { + const size_t len = 100'000; + void *ptr = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_MTE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + assert(ptr != MAP_FAILED); + + // check that 0 is the initial tag + check_tag(ptr, len, 0); + + arm_mte_tag_and_clear_mem(set_pointer_tag(ptr, 1), len); + check_tag(ptr, len, 1); + + memset(set_pointer_tag(ptr, 1), 1, len); + + assert(madvise(ptr, len, MADV_DONTNEED) == 0); + // check that MADV_DONTNEED resets the tag + check_tag(ptr, len, 0); + + // check that MADV_DONTNEED clears the memory + for (size_t i = 0; i < len; ++i) { + assert(((u8 *) ptr)[i] == 0); + } + + // check that mistagged read after MADV_DONTNEED fails + expect_read_segv(*((u8 *) set_pointer_tag(ptr, 1))); +} + map> tests = { #define TEST(s) { #s, s } TEST(tag_distinctness), @@ -274,6 +314,7 @@ map> tests = { TEST(underflow_write), TEST(untagged_read), TEST(untagged_write), + TEST(madvise_dontneed), #undef TEST }; diff --git a/androidtest/src/grapheneos/hmalloc/MemtagTest.java b/androidtest/src/grapheneos/hmalloc/MemtagTest.java index 455e0d0..be04bd9 100644 --- a/androidtest/src/grapheneos/hmalloc/MemtagTest.java +++ b/androidtest/src/grapheneos/hmalloc/MemtagTest.java @@ -71,4 +71,9 @@ public class MemtagTest extends BaseHostJUnit4Test { public void untagged_write() throws DeviceNotAvailableException { runTest("untagged_write"); } + + @Test + public void 
madvise_dontneed() throws DeviceNotAvailableException { + runTest("madvise_dontneed"); + } } From 3c1f40aff0865dac62b4d5cb36968ff0b97a185c Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Tue, 23 Jan 2024 19:11:12 +0200 Subject: [PATCH 37/53] amend memory tagging README section Memory tagging is enabled by default in bionic, but can be disabled at any point. Memory tagging can't be re-enabled after it's disabled. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fe5c5bf..76a4f6d 100644 --- a/README.md +++ b/README.md @@ -764,8 +764,8 @@ When memory tagging is enabled, checking for write-after-free at allocation time and checking canaries are both disabled. Canaries will be more thoroughly disabled when using memory tagging in the future, but Android currently has [very dynamic memory tagging support](https://source.android.com/docs/security/test/memory-safety/arm-mte) -where it can be enabled/disabled at any time which creates a barrier to -optimizing by disabling redundant features. +where it can be disabled at any time which creates a barrier to optimizing +by disabling redundant features. ## API extensions From 72681899337a839249cf873bf22ef10dff23c67f Mon Sep 17 00:00:00 2001 From: Dmitry Muhomor Date: Tue, 23 Jan 2024 19:50:26 +0200 Subject: [PATCH 38/53] mte: use tag 0 for freed slots, stop reserving tag 15 --- README.md | 20 +++++++++----------- androidtest/memtag/memtag_test.cc | 8 ++++---- h_malloc.c | 5 ++--- memtag.h | 3 ++- 4 files changed, 17 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 76a4f6d..bccafc2 100644 --- a/README.md +++ b/README.md @@ -724,15 +724,15 @@ freeing as there would be if the kernel supported these features directly. ## Memory tagging -Random tags are set for all slab allocations when allocated, with 5 excluded values: +Random tags are set for all slab allocations when allocated, with 4 excluded values: -1. the default `0` tag -2. a statically *reserved free tag* -3. the previous tag used for the slot -4. the current (or previous) tag used for the slot to the left -5. the current (or previous) tag used for the slot to the right +1. the reserved `0` tag +2. the previous tag used for the slot +3. the current (or previous) tag used for the slot to the left +4. the current (or previous) tag used for the slot to the right -When a slab allocation is freed, the *reserved free tag* is set for the slot. +When a slab allocation is freed, the reserved `0` tag is set for the slot. +Slab allocation slots are cleared before reuse when memory tagging is enabled. This ensures the following properties: @@ -740,10 +740,8 @@ This ensures the following properties: - Use-after-free are deterministically detected until the freed slot goes through both the random and FIFO quarantines, gets allocated again, goes through both quarantines again and then finally gets allocated again for a 2nd time. - Since the default `0` tag isn't used, untagged memory can't access malloc allocations - and vice versa, although it may make sense to reuse the default tag for free - data to avoid reducing the possible random tags from 15 to 14, since freed - data is always zeroed anyway. +- Since the default `0` tag is reserved, untagged pointers can't access slab + allocations and vice versa. 
Slab allocations are done in a statically reserved region for each size class and all metadata is in a statically reserved region, so interactions between diff --git a/androidtest/memtag/memtag_test.cc b/androidtest/memtag/memtag_test.cc index 910ef5b..5083636 100644 --- a/androidtest/memtag/memtag_test.cc +++ b/androidtest/memtag/memtag_test.cc @@ -47,9 +47,9 @@ void *set_pointer_tag(void *ptr, u8 tag) { // This test checks that slab slot allocation uses tag that is distint from tags of its neighbors // and from the tag of the previous allocation that used the same slot void tag_distinctness() { - // 0 and 15 are reserved + // tag 0 is reserved const int min_tag = 1; - const int max_tag = 14; + const int max_tag = 0xf; struct SizeClass { int size; @@ -148,8 +148,8 @@ void tag_distinctness() { } } - // check that all of the tags were used, except reserved ones - assert(seen_tags == (0xffff & ~(1 << 0 | 1 << 15))); + // check that all of the tags were used, except for the reserved tag 0 + assert(seen_tags == (0xffff & ~(1 << 0))); printf("size_class\t%i\t" "tdc_left %i\t" "tdc_right %i\t" "tdc_prev_alloc %i\n", sc.size, left_neighbor_tdc_cnt, right_neighbor_tdc_cnt, prev_alloc_tdc_cnt); diff --git a/h_malloc.c b/h_malloc.c index 014b461..15be0a2 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -574,9 +574,8 @@ static void *tag_and_clear_slab_slot(struct slab_metadata *metadata, void *slot_ // is constructed. u8 *slot_tags = metadata->arm_mte_tags; - // Tag exclusion mask. 0 tag is always excluded to detect accesses to slab memory via untagged - // pointers. Moreover, 0 tag is excluded in bionic via PR_MTE_TAG_MASK prctl - u64 tem = (1 << 0) | (1 << RESERVED_TAG); + // tag exclusion mask + u64 tem = (1 << RESERVED_TAG); // current or previous tag of left neighbor or 0 if there's no left neighbor or if it was never used tem |= (1 << u4_arr_get(slot_tags, slot_idx)); diff --git a/memtag.h b/memtag.h index 0ba4cbc..e431283 100644 --- a/memtag.h +++ b/memtag.h @@ -6,7 +6,8 @@ #ifdef HAS_ARM_MTE #include "arm_mte.h" #define MEMTAG 1 -#define RESERVED_TAG 15 +// Note that bionic libc always reserves tag 0 via PR_MTE_TAG_MASK prctl +#define RESERVED_TAG 0 #define TAG_WIDTH 4 #endif From 749640c274d54e084505a24fa758bcb5f96a25ef Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Thu, 15 Feb 2024 02:57:33 -0500 Subject: [PATCH 39/53] update copyright notice --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 5311a0f..af4b965 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright © 2018-2023 GrapheneOS +Copyright © 2018-2024 GrapheneOS Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 3f07acfab1cfff00baf33c477f96d3e3f874275a Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Mon, 5 Aug 2024 02:25:55 -0400 Subject: [PATCH 40/53] update libdivide to 5.1 --- third_party/libdivide.h | 394 +++++++++++++++++++++++++++++----------- 1 file changed, 283 insertions(+), 111 deletions(-) diff --git a/third_party/libdivide.h b/third_party/libdivide.h index e9a31d1..4421888 100644 --- a/third_party/libdivide.h +++ b/third_party/libdivide.h @@ -1,8 +1,8 @@ // libdivide.h - Optimized integer division // https://libdivide.com // -// Copyright (C) 2010 - 2021 ridiculous_fish, -// Copyright (C) 2016 - 2021 Kim Walisch, +// Copyright (C) 2010 - 2022 ridiculous_fish, +// Copyright (C) 2016 - 2022 Kim Walisch, // // libdivide is dual-licensed under the Boost or zlib 
licenses. // You may use libdivide under the terms of either of these. @@ -11,11 +11,12 @@ #ifndef LIBDIVIDE_H #define LIBDIVIDE_H -#define LIBDIVIDE_VERSION "5.0" +#define LIBDIVIDE_VERSION "5.1" #define LIBDIVIDE_VERSION_MAJOR 5 -#define LIBDIVIDE_VERSION_MINOR 0 +#define LIBDIVIDE_VERSION_MINOR 1 #include + #if !defined(__AVR__) #include #include @@ -24,9 +25,11 @@ #if defined(LIBDIVIDE_SSE2) #include #endif + #if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512) #include #endif + #if defined(LIBDIVIDE_NEON) #include #endif @@ -37,7 +40,7 @@ // disable warning C4146: unary minus operator applied // to unsigned type, result still unsigned #pragma warning(disable : 4146) -// disable warning C4204: nonstandard extension used : non-constant aggregate +// disable warning C4204: nonstandard extension used : non-constant aggregate // initializer // // It's valid C99 @@ -235,14 +238,12 @@ static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfr static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); -static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw( - int16_t numer, int16_t magic, uint8_t more); +static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more); static LIBDIVIDE_INLINE int16_t libdivide_s16_do( - int16_t numer, const struct libdivide_s16_t* denom); -static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw( - uint16_t numer, uint16_t magic, uint8_t more); + int16_t numer, const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more); static LIBDIVIDE_INLINE uint16_t libdivide_u16_do( - uint16_t numer, const struct libdivide_u16_t* denom); + uint16_t numer, const struct libdivide_u16_t *denom); static LIBDIVIDE_INLINE int32_t libdivide_s32_do( int32_t numer, const struct libdivide_s32_t *denom); static LIBDIVIDE_INLINE uint32_t libdivide_u32_do( @@ -253,9 +254,9 @@ static LIBDIVIDE_INLINE uint64_t libdivide_u64_do( uint64_t numer, const struct libdivide_u64_t *denom); static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do( - int16_t numer, const struct libdivide_s16_branchfree_t* denom); + int16_t numer, const struct libdivide_s16_branchfree_t *denom); static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do( - uint16_t numer, const struct libdivide_u16_branchfree_t* denom); + uint16_t numer, const struct libdivide_u16_branchfree_t *denom); static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do( int32_t numer, const struct libdivide_s32_branchfree_t *denom); static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do( @@ -265,17 +266,17 @@ static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do( static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do( uint64_t numer, const struct libdivide_u64_branchfree_t *denom); -static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t* denom); -static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t* denom); +static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom); static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); static 
LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom); static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover( - const struct libdivide_s16_branchfree_t* denom); + const struct libdivide_s16_branchfree_t *denom); static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover( - const struct libdivide_u16_branchfree_t* denom); + const struct libdivide_u16_branchfree_t *denom); static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover( const struct libdivide_s32_branchfree_t *denom); static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover( @@ -393,7 +394,7 @@ static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) { static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) { #if defined(__AVR__) - // Fast way to count leading zeros + // Fast way to count leading zeros return __builtin_clzl(val); #elif defined(__GNUC__) || __has_builtin(__builtin_clz) // Fast way to count leading zeros @@ -442,7 +443,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) { // uint {v}. The result must fit in 16 bits. // Returns the quotient directly and the remainder in *r static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16( - uint16_t u1, uint16_t u0, uint16_t v, uint16_t* r) { + uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) { uint32_t n = ((uint32_t)u1 << 16) | u0; uint16_t result = (uint16_t)(n / v); *r = (uint16_t)(n - result * (uint32_t)v); @@ -512,7 +513,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64( // Check for overflow and divide by 0. if (numhi >= den) { - if (r != NULL) *r = ~0ull; + if (r) *r = ~0ull; return ~0ull; } @@ -558,11 +559,14 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64( q0 = (uint32_t)qhat; // Return remainder if requested. - if (r != NULL) *r = (rem * b + num0 - q0 * den) >> shift; + if (r) *r = (rem * b + num0 - q0 * den) >> shift; return ((uint64_t)q1 << 32) | q0; #endif } +#if !(defined(HAS_INT128_T) && \ + defined(HAS_INT128_DIV)) + // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0) static LIBDIVIDE_INLINE void libdivide_u128_shift( uint64_t *u1, uint64_t *u0, int32_t signed_shift) { @@ -579,6 +583,8 @@ static LIBDIVIDE_INLINE void libdivide_u128_shift( } } +#endif + // Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder. static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64( uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) { @@ -696,8 +702,7 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen( // 1 in its recovery algorithm. result.magic = 0; result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); - } - else { + } else { uint8_t more; uint16_t rem, proposed_m; proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem); @@ -709,8 +714,7 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen( if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) { // This power works more = floor_log_2_d; - } - else { + } else { // We have to use the general 17-bit algorithm. We need to compute // (2**power) / d. However, we already have (2**(power-1))/d and // its remainder. 
By doubling both, and then correcting the @@ -742,7 +746,7 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) { } struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1); struct libdivide_u16_branchfree_t ret = { - tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK) }; + tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)}; return ret; } @@ -752,27 +756,25 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) { uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) { if (!magic) { return numer >> more; - } - else { + } else { uint16_t q = libdivide_mullhi_u16(magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { uint16_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_16_SHIFT_MASK); - } - else { + } else { // All upper bits are 0, // don't need to mask them off. return q >> more; } - } + } } -uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) { +uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) { return libdivide_u16_do_raw(numer, denom->magic, denom->more); } uint16_t libdivide_u16_branchfree_do( - uint16_t numer, const struct libdivide_u16_branchfree_t* denom) { + uint16_t numer, const struct libdivide_u16_branchfree_t *denom) { uint16_t q = libdivide_mullhi_u16(denom->magic, numer); uint16_t t = ((numer - q) >> 1) + q; return t >> denom->more; @@ -800,7 +802,7 @@ uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) { // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and // then double the quotient and remainder. uint32_t half_n = (uint32_t)1 << (16 + shift); - uint32_t d = ( (uint32_t)1 << 16) | denom->magic; + uint32_t d = ((uint32_t)1 << 16) | denom->magic; // Note that the quotient is guaranteed <= 16 bits, but the remainder // may need 17! uint16_t half_q = (uint16_t)(half_n / d); @@ -1682,15 +1684,22 @@ int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t // Simplest possible vector type division: treat the vector type as an array // of underlying native type. 
-#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \ - const size_t count = sizeof(VecT) / sizeof(IntT); \ - VecT result; \ - IntT *pSource = (IntT *)&numers; \ - IntT *pTarget = (IntT *)&result; \ - for (size_t loop=0; loopmore; + if (!denom->magic) { + return _mm256_srli_epi16(numers, more); + } else { + __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q); + return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK)); + } else { + return _mm256_srli_epi16(q, more); + } + } } -__m256i libdivide_u16_branchfree_do_vec256(__m256i numers, const struct libdivide_u16_branchfree_t *denom) { - SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16_branchfree) +__m256i libdivide_u16_branchfree_do_vec256( + __m256i numers, const struct libdivide_u16_branchfree_t *denom) { + __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic)); + __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q); + return _mm256_srli_epi16(t, denom->more); } ////////// UINT32 @@ -2429,11 +2448,54 @@ __m256i libdivide_u64_branchfree_do_vec256( ////////// SINT16 __m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) { - SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16) + uint8_t more = denom->more; + if (!denom->magic) { + uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + uint16_t mask = ((uint16_t)1 << shift) - 1; + __m256i roundToZeroTweak = _mm256_set1_epi16(mask); + // q = numer + ((numer >> 15) & roundToZeroTweak); + __m256i q = _mm256_add_epi16( + numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak)); + q = _mm256_srai_epi16(q, shift); + __m256i sign = _mm256_set1_epi16((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign); + return q; + } else { + __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m256i sign = _mm256_set1_epi16((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign)); + } + // q >>= shift + q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK); + q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15)); // q += (q < 0) + return q; + } } -__m256i libdivide_s16_branchfree_do_vec256(__m256i numers, const struct libdivide_s16_branchfree_t *denom) { - SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16_branchfree) +__m256i libdivide_s16_branchfree_do_vec256( + __m256i numers, const struct libdivide_s16_branchfree_t *denom) { + int16_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + // must be arithmetic shift + __m256i sign = _mm256_set1_epi16((int8_t)more >> 7); + __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic)); + q = _mm256_add_epi16(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint16_t is_power_of_2 = (magic == 0); + __m256i q_sign = _mm256_srai_epi16(q, 15); // q_sign = q >> 15 + __m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2); + q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm256_srai_epi16(q, shift); // q >>= shift + q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign); 
// q = (q ^ sign) - sign + return q; } ////////// SINT32 @@ -2661,11 +2723,25 @@ static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y ////////// UINT26 __m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) { - SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16) + uint8_t more = denom->more; + if (!denom->magic) { + return _mm_srli_epi16(numers, more); + } else { + __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q); + return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK)); + } else { + return _mm_srli_epi16(q, more); + } + } } -__m128i libdivide_u16_branchfree_do_vec128(__m128i numers, const struct libdivide_u16_branchfree_t *denom) { - SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16_branchfree) +__m128i libdivide_u16_branchfree_do_vec128( + __m128i numers, const struct libdivide_u16_branchfree_t *denom) { + __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic)); + __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q); + return _mm_srli_epi16(t, denom->more); } ////////// UINT32 @@ -2725,11 +2801,54 @@ __m128i libdivide_u64_branchfree_do_vec128( ////////// SINT16 __m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) { - SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16) + uint8_t more = denom->more; + if (!denom->magic) { + uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + uint16_t mask = ((uint16_t)1 << shift) - 1; + __m128i roundToZeroTweak = _mm_set1_epi16(mask); + // q = numer + ((numer >> 15) & roundToZeroTweak); + __m128i q = + _mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak)); + q = _mm_srai_epi16(q, shift); + __m128i sign = _mm_set1_epi16((int8_t)more >> 7); + // q = (q ^ sign) - sign; + q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign); + return q; + } else { + __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic)); + if (more & LIBDIVIDE_ADD_MARKER) { + // must be arithmetic shift + __m128i sign = _mm_set1_epi16((int8_t)more >> 7); + // q += ((numer ^ sign) - sign); + q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign)); + } + // q >>= shift + q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK); + q = _mm_add_epi16(q, _mm_srli_epi16(q, 15)); // q += (q < 0) + return q; + } } -__m128i libdivide_s16_branchfree_do_vec128(__m128i numers, const struct libdivide_s16_branchfree_t *denom) { - SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16_branchfree) +__m128i libdivide_s16_branchfree_do_vec128( + __m128i numers, const struct libdivide_s16_branchfree_t *denom) { + int16_t magic = denom->magic; + uint8_t more = denom->more; + uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; + // must be arithmetic shift + __m128i sign = _mm_set1_epi16((int8_t)more >> 7); + __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic)); + q = _mm_add_epi16(q, numers); // q += numers + + // If q is non-negative, we have nothing to do + // If q is negative, we want to add either (2**shift)-1 if d is + // a power of 2, or (2**shift) if it is not a power of 2 + uint16_t is_power_of_2 = (magic == 0); + __m128i q_sign = _mm_srai_epi16(q, 15); // q_sign = q >> 15 + __m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2); + q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) + q = _mm_srai_epi16(q, shift); // q >>= shift + q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign); // q = (q 
^ sign) - sign + return q; } ////////// SINT32 @@ -2795,8 +2914,8 @@ __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *de uint64_t mask = ((uint64_t)1 << shift) - 1; __m128i roundToZeroTweak = _mm_set1_epi64x(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); - __m128i q = - _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak)); + __m128i q = _mm_add_epi64( + numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak)); q = libdivide_s64_shift_right_vec128(q, shift); __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; @@ -2847,49 +2966,80 @@ __m128i libdivide_s64_branchfree_do_vec128( #ifdef __cplusplus +//for constexpr zero initialization, +//c++11 might handle things ok, +//but just limit to at least c++14 to ensure +//we don't break anyone's code: + +// for gcc and clang, use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr +#if (defined(__GNUC__) || defined(__clang__)) && (__cpp_constexpr >= 201304L) +#define LIBDIVIDE_CONSTEXPR constexpr + +// supposedly, MSVC might not implement feature test macros right (https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c) +// so check that _MSVC_LANG corresponds to at least c++14, and _MSC_VER corresponds to at least VS 2017 15.0 (for extended constexpr support https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170) +#elif defined(_MSC_VER) && _MSC_VER >= 1910 && defined(_MSVC_LANG) && _MSVC_LANG >=201402L +#define LIBDIVIDE_CONSTEXPR constexpr + +// in case some other obscure compiler has the right __cpp_constexpr : +#elif defined(__cpp_constexpr) && __cpp_constexpr >= 201304L +#define LIBDIVIDE_CONSTEXPR constexpr + +#else +#define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE +#endif + enum Branching { BRANCHFULL, // use branching algorithms BRANCHFREE // use branchfree algorithms }; +namespace detail { +enum Signedness { + SIGNED, + UNSIGNED, +}; + #if defined(LIBDIVIDE_NEON) // Helper to deduce NEON vector type for integral type. -template -struct NeonVecFor {}; +template +struct NeonVec {}; template <> -struct NeonVecFor { +struct NeonVec<16, UNSIGNED> { typedef uint16x8_t type; }; template <> -struct NeonVecFor { +struct NeonVec<16, SIGNED> { typedef int16x8_t type; }; template <> -struct NeonVecFor { +struct NeonVec<32, UNSIGNED> { typedef uint32x4_t type; }; template <> -struct NeonVecFor { +struct NeonVec<32, SIGNED> { typedef int32x4_t type; }; template <> -struct NeonVecFor { +struct NeonVec<64, UNSIGNED> { typedef uint64x2_t type; }; template <> -struct NeonVecFor { +struct NeonVec<64, SIGNED> { typedef int64x2_t type; }; -#endif -// Versions of our algorithms for SIMD. -#if defined(LIBDIVIDE_NEON) +template +struct NeonVecFor { + // See 'class divider' for an explanation of these template parameters. + typedef typename NeonVec> 0) > (T)(-1) ? 
SIGNED : UNSIGNED)>::type type; +}; + #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \ LIBDIVIDE_INLINE typename NeonVecFor::type divide( \ typename NeonVecFor::type n) const { \ @@ -2898,6 +3048,7 @@ struct NeonVecFor { #else #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) #endif + #if defined(LIBDIVIDE_SSE2) #define LIBDIVIDE_DIVIDE_SSE2(ALGO) \ LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \ @@ -2930,6 +3081,7 @@ struct NeonVecFor { #define DISPATCHER_GEN(T, ALGO) \ libdivide_##ALGO##_t denom; \ LIBDIVIDE_INLINE dispatcher() {} \ + explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {} \ LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \ LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \ LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \ @@ -2939,66 +3091,81 @@ struct NeonVecFor { LIBDIVIDE_DIVIDE_AVX512(ALGO) // The dispatcher selects a specific division algorithm for a given -// type and ALGO using partial template specialization. -template +// width, signedness, and ALGO using partial template specialization. +template struct dispatcher {}; template <> -struct dispatcher { +struct dispatcher<16, SIGNED, BRANCHFULL> { DISPATCHER_GEN(int16_t, s16) }; template <> -struct dispatcher { +struct dispatcher<16, SIGNED, BRANCHFREE> { DISPATCHER_GEN(int16_t, s16_branchfree) }; template <> -struct dispatcher { +struct dispatcher<16, UNSIGNED, BRANCHFULL> { DISPATCHER_GEN(uint16_t, u16) }; template <> -struct dispatcher { +struct dispatcher<16, UNSIGNED, BRANCHFREE> { DISPATCHER_GEN(uint16_t, u16_branchfree) }; template <> -struct dispatcher { +struct dispatcher<32, SIGNED, BRANCHFULL> { DISPATCHER_GEN(int32_t, s32) }; template <> -struct dispatcher { +struct dispatcher<32, SIGNED, BRANCHFREE> { DISPATCHER_GEN(int32_t, s32_branchfree) }; template <> -struct dispatcher { +struct dispatcher<32, UNSIGNED, BRANCHFULL> { DISPATCHER_GEN(uint32_t, u32) }; template <> -struct dispatcher { +struct dispatcher<32, UNSIGNED, BRANCHFREE> { DISPATCHER_GEN(uint32_t, u32_branchfree) }; template <> -struct dispatcher { +struct dispatcher<64, SIGNED, BRANCHFULL> { DISPATCHER_GEN(int64_t, s64) }; template <> -struct dispatcher { +struct dispatcher<64, SIGNED, BRANCHFREE> { DISPATCHER_GEN(int64_t, s64_branchfree) }; template <> -struct dispatcher { +struct dispatcher<64, UNSIGNED, BRANCHFULL> { DISPATCHER_GEN(uint64_t, u64) }; template <> -struct dispatcher { +struct dispatcher<64, UNSIGNED, BRANCHFREE> { DISPATCHER_GEN(uint64_t, u64_branchfree) }; +} // namespace detail + +#if defined(LIBDIVIDE_NEON) +// Allow NeonVecFor outside of detail namespace. +template +struct NeonVecFor { + typedef typename detail::NeonVecFor::type type; +}; +#endif // This is the main divider class for use by the user (C++ API). // The actual division algorithm is selected using the dispatcher struct -// based on the integer and algorithm template parameters. +// based on the integer width and algorithm template parameters. template class divider { private: - typedef dispatcher dispatcher_t; + // Dispatch based on the size and signedness. + // We avoid using type_traits as it's not available in AVR. + // Detect signedness by checking if T(-1) is less than T(0). + // Also throw in a shift by 0, which prevents floating point types from being passed. + typedef detail::dispatcher> 0) > (T)(-1) ? 
detail::SIGNED : detail::UNSIGNED), ALGO> + dispatcher_t; public: // We leave the default constructor empty so that creating @@ -3006,6 +3173,9 @@ class divider { // later doesn't slow us down. divider() {} + // constexpr zero-initialization to allow for use w/ static constinit + explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {} + // Constructor that takes the divisor as a parameter LIBDIVIDE_INLINE divider(T d) : div(d) {} @@ -3017,7 +3187,7 @@ class divider { T recover() const { return div.recover(); } bool operator==(const divider &other) const { - return div.denom.magic == other.denom.magic && div.denom.more == other.denom.more; + return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more; } bool operator!=(const divider &other) const { return !(*this == other); } @@ -3098,12 +3268,14 @@ LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider &div) { #if defined(LIBDIVIDE_NEON) template -LIBDIVIDE_INLINE typename NeonVecFor::type operator/(typename NeonVecFor::type n, const divider &div) { +LIBDIVIDE_INLINE typename NeonVecFor::type operator/( + typename NeonVecFor::type n, const divider &div) { return div.divide(n); } template -LIBDIVIDE_INLINE typename NeonVecFor::type operator/=(typename NeonVecFor::type &n, const divider &div) { +LIBDIVIDE_INLINE typename NeonVecFor::type operator/=( + typename NeonVecFor::type &n, const divider &div) { n = div.divide(n); return n; } From 9ca9d2d925637308f80d77ae9688c44ca5b9298e Mon Sep 17 00:00:00 2001 From: maade93791 <70593890+maade69@users.noreply.github.com> Date: Mon, 9 Sep 2024 23:58:08 +0300 Subject: [PATCH 41/53] android: use more basic CPU target for memtag This is required for hardened_malloc to work in microdroid on MTE-enabled devices (currently, 8th and 9th generation Pixels) since PVMFW only supports ARMv8 cores. https://android.googlesource.com/platform/packages/modules/Virtualization/+/refs/tags/android-15.0.0_r1/pvmfw/platform.dts#100 --- Android.bp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Android.bp b/Android.bp index 0db6a04..26ee403 100644 --- a/Android.bp +++ b/Android.bp @@ -74,7 +74,7 @@ cc_library { cflags: ["-DLABEL_MEMORY"], }, device_has_arm_mte: { - cflags: ["-DHAS_ARM_MTE", "-march=armv9-a+memtag"] + cflags: ["-DHAS_ARM_MTE", "-march=armv8-a+dotprod+memtag"] }, }, apex_available: [ From 6ce663a8bdbc2481afd01bca55f334d42c7cb6be Mon Sep 17 00:00:00 2001 From: Julien Voisin Date: Thu, 3 Oct 2024 22:30:52 +0000 Subject: [PATCH 42/53] Fix -Wimplicit-function-declaration error with gcc 14. 
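gcc 14 turns implicit function declarations into hard errors, so the
malloc_info test no longer builds: it calls malloc() without a declaration in
scope. The only real change is adding the include that declares malloc
(<stdlib.h>, the header gcc itself suggests); the sketch below shows the fixed
shape of the test, with the surrounding function paraphrased from the compiler
output rather than copied from the tree:

```c
// test/malloc_info.c (sketch): include <stdlib.h> so malloc has a prototype.
#include <stdlib.h>

static void leak_memory(void) {
    // the test allocates and intentionally never frees this 1 GiB block
    (void)!malloc(1024 * 1024 * 1024);
}

int main(void) {
    leak_memory();
    return 0;
}
```

The compiler output that prompted the change: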
``` malloc_info.c: In function 'leak_memory': malloc_info.c:12:12: error: implicit declaration of function 'malloc' [-Wimplicit-function-declaration] 12 | (void)!malloc(1024 * 1024 * 1024); | ^~~~~~ malloc_info.c:10:1: note: include '' or provide a declaration of 'malloc' 9 | #include "../util.h" +++ |+#include 10 | malloc_info.c:12:12: warning: incompatible implicit declaration of built-in function 'malloc' [-Wbuiltin-declaration-mismatch] 12 | (void)!malloc(1024 * 1024 * 1024); | ^~~~~~ ``` Taken from https://gitlab.alpinelinux.org/alpine/aports/-/merge_requests/72971/ Co-authored-by: @mio --- test/malloc_info.c | 1 + 1 file changed, 1 insertion(+) diff --git a/test/malloc_info.c b/test/malloc_info.c index 50b256f..3b99ead 100644 --- a/test/malloc_info.c +++ b/test/malloc_info.c @@ -1,5 +1,6 @@ #include #include +#include #if defined(__GLIBC__) || defined(__ANDROID__) #include From e86192e7fe99ed987ae89a6e3bf2b9de4d9a0332 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Wed, 9 Oct 2024 19:56:50 -0400 Subject: [PATCH 43/53] remove redundant warning switches for Android Android already enables -Wall and -Wextra in the global soong build settings. --- Android.bp | 2 -- 1 file changed, 2 deletions(-) diff --git a/Android.bp b/Android.bp index 26ee403..f6a7a9c 100644 --- a/Android.bp +++ b/Android.bp @@ -5,8 +5,6 @@ common_cflags = [ "-fPIC", "-fvisibility=hidden", //"-fno-plt", - "-Wall", - "-Wextra", "-Wcast-align", "-Wcast-qual", "-Wwrite-strings", From 6402e2b0d4b406ee3f73e5f4e3233d4af23d603b Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Sat, 12 Oct 2024 03:17:44 -0400 Subject: [PATCH 44/53] reduce probability hint for is_memtag_enabled --- h_malloc.c | 18 +++++++++--------- util.h | 2 ++ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/h_malloc.c b/h_malloc.c index 15be0a2..89ef91d 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -470,7 +470,7 @@ static void write_after_free_check(const char *p, size_t size) { } #ifdef HAS_ARM_MTE - if (likely(is_memtag_enabled())) { + if (likely51(is_memtag_enabled())) { return; } #endif @@ -505,7 +505,7 @@ static void set_slab_canary_value(UNUSED struct slab_metadata *metadata, UNUSED static void set_canary(UNUSED const struct slab_metadata *metadata, UNUSED void *p, UNUSED size_t size) { #if SLAB_CANARY #ifdef HAS_ARM_MTE - if (likely(is_memtag_enabled())) { + if (likely51(is_memtag_enabled())) { return; } #endif @@ -517,7 +517,7 @@ static void set_canary(UNUSED const struct slab_metadata *metadata, UNUSED void static void check_canary(UNUSED const struct slab_metadata *metadata, UNUSED const void *p, UNUSED size_t size) { #if SLAB_CANARY #ifdef HAS_ARM_MTE - if (likely(is_memtag_enabled())) { + if (likely51(is_memtag_enabled())) { return; } #endif @@ -624,7 +624,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { write_after_free_check(p, size - canary_size); set_canary(metadata, p, size); #ifdef HAS_ARM_MTE - if (likely(is_memtag_enabled())) { + if (likely51(is_memtag_enabled())) { p = tag_and_clear_slab_slot(metadata, p, slot, size); } #endif @@ -661,7 +661,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { if (requested_size) { set_canary(metadata, p, size); #ifdef HAS_ARM_MTE - if (likely(is_memtag_enabled())) { + if (likely51(is_memtag_enabled())) { p = tag_and_clear_slab_slot(metadata, p, slot, size); } #endif @@ -688,7 +688,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { if (requested_size) { set_canary(metadata, p, size); #ifdef 
HAS_ARM_MTE - if (likely(is_memtag_enabled())) { + if (likely51(is_memtag_enabled())) { p = tag_and_clear_slab_slot(metadata, p, slot, size); } #endif @@ -717,7 +717,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { write_after_free_check(p, size - canary_size); set_canary(metadata, p, size); #ifdef HAS_ARM_MTE - if (likely(is_memtag_enabled())) { + if (likely51(is_memtag_enabled())) { p = tag_and_clear_slab_slot(metadata, p, slot, size); } #endif @@ -805,7 +805,7 @@ static inline void deallocate_small(void *p, const size_t *expected_size) { bool skip_zero = false; #ifdef HAS_ARM_MTE - if (likely(is_memtag_enabled())) { + if (likely51(is_memtag_enabled())) { arm_mte_tag_and_clear_mem(set_pointer_tag(p, RESERVED_TAG), size); // metadata->arm_mte_tags is intentionally not updated, see tag_and_clear_slab_slot() skip_zero = true; @@ -1243,7 +1243,7 @@ COLD static void init_slow_path(void) { fatal_error("failed to unprotect memory for regions table"); } #ifdef HAS_ARM_MTE - if (likely(is_memtag_enabled())) { + if (likely51(is_memtag_enabled())) { ro.slab_region_start = memory_map_mte(slab_region_size); } else { ro.slab_region_start = memory_map(slab_region_size); diff --git a/util.h b/util.h index fc22c23..6b1a390 100644 --- a/util.h +++ b/util.h @@ -9,7 +9,9 @@ #define noreturn __attribute__((noreturn)) #define likely(x) __builtin_expect(!!(x), 1) +#define likely51(x) __builtin_expect_with_probability(!!(x), 1, 0.51) #define unlikely(x) __builtin_expect(!!(x), 0) +#define unlikely51(x) __builtin_expect_with_probability(!!(x), 0, 0.51) #define min(x, y) ({ \ __typeof__(x) _x = (x); \ From aa950244f8af6844b597b587a5faf818981e92b9 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Sat, 12 Oct 2024 02:19:48 -0400 Subject: [PATCH 45/53] reuse code for memory_map_mte This drops the separate error message since that doesn't seem useful. 
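Both mapping paths now go through a single static helper that differs only in
the protection flags passed to mmap, with one shared error policy: ENOMEM
becomes a NULL return and any other failure is fatal. A standalone sketch of
the pattern — the names mirror memory.c, but fatal_error() is replaced with
abort() and only the PROT_NONE wrapper is shown so the snippet builds on any
Linux target:

```c
#include <errno.h>
#include <stdlib.h>
#include <sys/mman.h>

// One helper owns the mmap call and the error policy.
static void *map_with_prot(size_t size, int prot) {
    void *p = mmap(NULL, size, prot, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
    if (p == MAP_FAILED) {
        if (errno != ENOMEM) {
            abort(); // memory.c reports this through fatal_error()
        }
        return NULL;
    }
    return p;
}

// Public wrappers just pick the protection flags; the MTE variant passes
// PROT_MTE instead (omitted here since PROT_MTE is arm64-specific).
static void *map_none(size_t size) {
    return map_with_prot(size, PROT_NONE);
}

int main(void) {
    void *reservation = map_none(1 << 20); // 1 MiB PROT_NONE reservation
    return reservation == NULL;
}
```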
--- memory.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/memory.c b/memory.c index 5434060..e1bd2ef 100644 --- a/memory.c +++ b/memory.c @@ -17,8 +17,8 @@ #include "memory.h" #include "util.h" -void *memory_map(size_t size) { - void *p = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); +static void *memory_map_prot(size_t size, int prot) { + void *p = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); if (unlikely(p == MAP_FAILED)) { if (errno != ENOMEM) { fatal_error("non-ENOMEM mmap failure"); @@ -28,17 +28,14 @@ void *memory_map(size_t size) { return p; } +void *memory_map(size_t size) { + return memory_map_prot(size, PROT_NONE); +} + #ifdef HAS_ARM_MTE // Note that PROT_MTE can't be cleared via mprotect void *memory_map_mte(size_t size) { - void *p = mmap(NULL, size, PROT_MTE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - if (unlikely(p == MAP_FAILED)) { - if (errno != ENOMEM) { - fatal_error("non-ENOMEM MTE mmap failure"); - } - return NULL; - } - return p; + return memory_map_prot(size, PROT_MTE); } #endif From 9739cb46908ee021fd5028ab64ac8c2ebb3c7a6d Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Sat, 12 Oct 2024 03:03:35 -0400 Subject: [PATCH 46/53] use wrapper for calling memory_map_mte --- h_malloc.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/h_malloc.c b/h_malloc.c index 89ef91d..3ceb1d2 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -94,6 +94,15 @@ static inline bool is_memtag_enabled(void) { } #endif +static void *memory_map_tagged(size_t size) { +#ifdef HAS_ARM_MTE + if (likely51(is_memtag_enabled())) { + return memory_map_mte(size); + } +#endif + return memory_map(size); +} + #define SLAB_METADATA_COUNT struct slab_metadata { @@ -1242,15 +1251,7 @@ COLD static void init_slow_path(void) { if (unlikely(memory_protect_rw_metadata(ra->regions, ra->total * sizeof(struct region_metadata)))) { fatal_error("failed to unprotect memory for regions table"); } -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - ro.slab_region_start = memory_map_mte(slab_region_size); - } else { - ro.slab_region_start = memory_map(slab_region_size); - } -#else - ro.slab_region_start = memory_map(slab_region_size); -#endif + ro.slab_region_start = memory_map_tagged(slab_region_size); if (unlikely(ro.slab_region_start == NULL)) { fatal_error("failed to allocate slab region"); } From e03579253a17afa9ea65dcbe5fae9dd4c583f99e Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Sat, 12 Oct 2024 03:07:07 -0400 Subject: [PATCH 47/53] preserve PROT_MTE when releasing memory --- h_malloc.c | 13 +++++++++++-- memory.c | 15 +++++++++++++-- memory.h | 3 +++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/h_malloc.c b/h_malloc.c index 3ceb1d2..6221d0b 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -103,6 +103,15 @@ static void *memory_map_tagged(size_t size) { return memory_map(size); } +static bool memory_map_fixed_tagged(void *ptr, size_t size) { +#ifdef HAS_ARM_MTE + if (likely51(is_memtag_enabled())) { + return memory_map_fixed_mte(ptr, size); + } +#endif + return memory_map_fixed(ptr, size); +} + #define SLAB_METADATA_COUNT struct slab_metadata { @@ -899,7 +908,7 @@ static inline void deallocate_small(void *p, const size_t *expected_size) { if (c->empty_slabs_total + slab_size > max_empty_slabs_total) { int saved_errno = errno; - if (!memory_map_fixed(slab, slab_size)) { + if (!memory_map_fixed_tagged(slab, slab_size)) { label_slab(slab, slab_size, class); stats_slab_deallocate(c, slab_size); 
enqueue_free_slab(c, metadata); @@ -1896,7 +1905,7 @@ EXPORT int h_malloc_trim(UNUSED size_t pad) { struct slab_metadata *iterator = c->empty_slabs; while (iterator) { void *slab = get_slab(c, slab_size, iterator); - if (memory_map_fixed(slab, slab_size)) { + if (memory_map_fixed_tagged(slab, slab_size)) { break; } label_slab(slab, slab_size, class); diff --git a/memory.c b/memory.c index e1bd2ef..2e54f6d 100644 --- a/memory.c +++ b/memory.c @@ -39,8 +39,8 @@ void *memory_map_mte(size_t size) { } #endif -bool memory_map_fixed(void *ptr, size_t size) { - void *p = mmap(ptr, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); +static bool memory_map_fixed_prot(void *ptr, size_t size, int prot) { + void *p = mmap(ptr, size, prot, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); bool ret = p == MAP_FAILED; if (unlikely(ret) && errno != ENOMEM) { fatal_error("non-ENOMEM MAP_FIXED mmap failure"); @@ -48,6 +48,17 @@ bool memory_map_fixed(void *ptr, size_t size) { return ret; } +bool memory_map_fixed(void *ptr, size_t size) { + return memory_map_fixed_prot(ptr, size, PROT_NONE); +} + +#ifdef HAS_ARM_MTE +// Note that PROT_MTE can't be cleared via mprotect +bool memory_map_fixed_mte(void *ptr, size_t size) { + return memory_map_fixed_prot(ptr, size, PROT_MTE); +} +#endif + bool memory_unmap(void *ptr, size_t size) { bool ret = munmap(ptr, size); if (unlikely(ret) && errno != ENOMEM) { diff --git a/memory.h b/memory.h index 6e4cd4d..d5e336b 100644 --- a/memory.h +++ b/memory.h @@ -15,6 +15,9 @@ void *memory_map(size_t size); void *memory_map_mte(size_t size); #endif bool memory_map_fixed(void *ptr, size_t size); +#ifdef HAS_ARM_MTE +bool memory_map_fixed_mte(void *ptr, size_t size); +#endif bool memory_unmap(void *ptr, size_t size); bool memory_protect_ro(void *ptr, size_t size); bool memory_protect_rw(void *ptr, size_t size); From b1d9571fecf81a01374e33639147e852b45f9d1a Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Sat, 12 Oct 2024 03:23:03 -0400 Subject: [PATCH 48/53] remove trailing whitespace --- README.md | 4 ++-- androidtest/memtag/memtag_test.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index bccafc2..b83595e 100644 --- a/README.md +++ b/README.md @@ -731,7 +731,7 @@ Random tags are set for all slab allocations when allocated, with 4 excluded val 3. the current (or previous) tag used for the slot to the left 4. the current (or previous) tag used for the slot to the right -When a slab allocation is freed, the reserved `0` tag is set for the slot. +When a slab allocation is freed, the reserved `0` tag is set for the slot. Slab allocation slots are cleared before reuse when memory tagging is enabled. This ensures the following properties: @@ -740,7 +740,7 @@ This ensures the following properties: - Use-after-free are deterministically detected until the freed slot goes through both the random and FIFO quarantines, gets allocated again, goes through both quarantines again and then finally gets allocated again for a 2nd time. -- Since the default `0` tag is reserved, untagged pointers can't access slab +- Since the default `0` tag is reserved, untagged pointers can't access slab allocations and vice versa. 
Slab allocations are done in a statically reserved region for each size class diff --git a/androidtest/memtag/memtag_test.cc b/androidtest/memtag/memtag_test.cc index 5083636..f858292 100644 --- a/androidtest/memtag/memtag_test.cc +++ b/androidtest/memtag/memtag_test.cc @@ -346,6 +346,6 @@ int main(int argc, char **argv) { test_fn(); do_context_switch(); - + return 0; } From a7302add63bd0cc33a63f3f97a3581a1bdad893d Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Wed, 23 Oct 2024 06:36:02 -0400 Subject: [PATCH 49/53] update outdated branch in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b83595e..6a1a91b 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ there will be custom integration offering better performance in the future along with other hardening for the C standard library implementation. For Android, only the current generation, actively developed maintenance branch of the Android -Open Source Project will be supported, which currently means `android13-qpr2-release`. +Open Source Project will be supported, which currently means `android15-release`. ## Testing From c97263ef0c42d25ee8d1fc4d126f87208b9cb73e Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Sun, 15 Dec 2024 22:10:47 -0500 Subject: [PATCH 50/53] handle GitHub runner image updates clang-14 and clang-15 are no longer installed by default. --- .github/workflows/build-and-test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 82496af..dbb727f 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -27,6 +27,8 @@ jobs: version: [14, 15] steps: - uses: actions/checkout@v4 + - name: Install dependencies + run: sudo apt-get update && sudo apt-get install -y --no-install-recommends clang-14 clang-15 - name: Setting up clang version run: | sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{ matrix.version }} 100 From c894f3ec1d78234657ae5f581abec16b5e518275 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Sun, 15 Dec 2024 22:19:43 -0500 Subject: [PATCH 51/53] add newer compiler versions for GitHub workflow --- .github/workflows/build-and-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index dbb727f..e77baba 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - version: [12] + version: [12, 13, 14] steps: - uses: actions/checkout@v4 - name: Setting up gcc version @@ -24,7 +24,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - version: [14, 15] + version: [14, 15, 16, 17, 18] steps: - uses: actions/checkout@v4 - name: Install dependencies From 3ab23f7ebfd0407c20b89c260bd4fde0eabf23c3 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Sat, 25 Jan 2025 16:13:22 -0500 Subject: [PATCH 52/53] update libdivide to 5.2.0 --- third_party/libdivide.h | 75 +++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 22 deletions(-) diff --git a/third_party/libdivide.h b/third_party/libdivide.h index 4421888..bddc763 100644 --- a/third_party/libdivide.h +++ b/third_party/libdivide.h @@ -11,9 +11,11 @@ #ifndef LIBDIVIDE_H #define LIBDIVIDE_H -#define LIBDIVIDE_VERSION "5.1" +// *** Version numbers are auto generated - do not edit *** +#define LIBDIVIDE_VERSION "5.2.0" #define 
LIBDIVIDE_VERSION_MAJOR 5 -#define LIBDIVIDE_VERSION_MINOR 1 +#define LIBDIVIDE_VERSION_MINOR 2 +#define LIBDIVIDE_VERSION_PATCH 0 #include @@ -34,8 +36,15 @@ #include #endif +// Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics +#if defined(_MSC_VER) && defined(LIBDIVIDE_X86_64) && (!defined(__clang__) || _MSC_VER>1930) +#define LIBDIVIDE_X64_INTRINSICS +#endif + #if defined(_MSC_VER) +#if defined(LIBDIVIDE_X64_INTRINSICS) #include +#endif #pragma warning(push) // disable warning C4146: unary minus operator applied // to unsigned type, result still unsigned @@ -238,18 +247,28 @@ static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfr static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); -static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more); +static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw( + int16_t numer, int16_t magic, uint8_t more); static LIBDIVIDE_INLINE int16_t libdivide_s16_do( int16_t numer, const struct libdivide_s16_t *denom); -static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more); +static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw( + uint16_t numer, uint16_t magic, uint8_t more); static LIBDIVIDE_INLINE uint16_t libdivide_u16_do( uint16_t numer, const struct libdivide_u16_t *denom); +static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw( + int32_t numer, int32_t magic, uint8_t more); static LIBDIVIDE_INLINE int32_t libdivide_s32_do( int32_t numer, const struct libdivide_s32_t *denom); +static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw( + uint32_t numer, uint32_t magic, uint8_t more); static LIBDIVIDE_INLINE uint32_t libdivide_u32_do( uint32_t numer, const struct libdivide_u32_t *denom); +static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw( + int64_t numer, int64_t magic, uint8_t more); static LIBDIVIDE_INLINE int64_t libdivide_s64_do( int64_t numer, const struct libdivide_s64_t *denom); +static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw( + uint64_t numer, uint64_t magic, uint8_t more); static LIBDIVIDE_INLINE uint64_t libdivide_u64_do( uint64_t numer, const struct libdivide_u64_t *denom); @@ -315,7 +334,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { } static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { -#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) +#if defined(LIBDIVIDE_X64_INTRINSICS) return __umulh(x, y); #elif defined(HAS_INT128_T) __uint128_t xl = x, yl = y; @@ -341,7 +360,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { } static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { -#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) +#if defined(LIBDIVIDE_X64_INTRINSICS) return __mulh(x, y); #elif defined(HAS_INT128_T) __int128_t xl = x, yl = y; @@ -914,12 +933,11 @@ struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) { return ret; } -uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { - uint8_t more = denom->more; - if (!denom->magic) { +uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) { + if (!magic) { return numer >> more; } else { - uint32_t q = libdivide_mullhi_u32(denom->magic, numer); + uint32_t q = libdivide_mullhi_u32(magic, numer); if (more & 
LIBDIVIDE_ADD_MARKER) { uint32_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_32_SHIFT_MASK); @@ -931,6 +949,10 @@ uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { } } +uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { + return libdivide_u32_do_raw(numer, denom->magic, denom->more); +} + uint32_t libdivide_u32_branchfree_do( uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { uint32_t q = libdivide_mullhi_u32(denom->magic, numer); @@ -1074,12 +1096,11 @@ struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) { return ret; } -uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { - uint8_t more = denom->more; - if (!denom->magic) { +uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) { + if (!magic) { return numer >> more; } else { - uint64_t q = libdivide_mullhi_u64(denom->magic, numer); + uint64_t q = libdivide_mullhi_u64(magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { uint64_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_64_SHIFT_MASK); @@ -1091,6 +1112,10 @@ uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { } } +uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { + return libdivide_u64_do_raw(numer, denom->magic, denom->more); +} + uint64_t libdivide_u64_branchfree_do( uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { uint64_t q = libdivide_mullhi_u64(denom->magic, numer); @@ -1430,11 +1455,10 @@ struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) { return result; } -int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { - uint8_t more = denom->more; +int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) { uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; - if (!denom->magic) { + if (!magic) { uint32_t sign = (int8_t)more >> 7; uint32_t mask = ((uint32_t)1 << shift) - 1; uint32_t uq = numer + ((numer >> 31) & mask); @@ -1443,7 +1467,7 @@ int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { q = (q ^ sign) - sign; return q; } else { - uint32_t uq = (uint32_t)libdivide_mullhi_s32(denom->magic, numer); + uint32_t uq = (uint32_t)libdivide_mullhi_s32(magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift and then sign extend int32_t sign = (int8_t)more >> 7; @@ -1458,6 +1482,10 @@ int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { } } +int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { + return libdivide_s32_do_raw(numer, denom->magic, denom->more); +} + int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; @@ -1599,11 +1627,10 @@ struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) { return ret; } -int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { - uint8_t more = denom->more; +int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) { uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; - if (!denom->magic) { // shift path + if (!magic) { // shift path uint64_t mask = ((uint64_t)1 << shift) - 1; uint64_t uq = numer + ((numer >> 63) & mask); int64_t q = (int64_t)uq; @@ -1613,7 +1640,7 @@ int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { q = (q ^ sign) - sign; return q; } 
else { - uint64_t uq = (uint64_t)libdivide_mullhi_s64(denom->magic, numer); + uint64_t uq = (uint64_t)libdivide_mullhi_s64(magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift and then sign extend int64_t sign = (int8_t)more >> 7; @@ -1628,6 +1655,10 @@ int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { } } +int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { + return libdivide_s64_do_raw(numer, denom->magic, denom->more); +} + int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; From 4fe9018b6fc7e89b68b6a4b34ea2a853e8778b77 Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Mon, 17 Feb 2025 12:47:30 -0500 Subject: [PATCH 53/53] rename calculate_waste.py to calculate-waste --- calculate_waste.py => calculate-waste | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename calculate_waste.py => calculate-waste (100%) diff --git a/calculate_waste.py b/calculate-waste similarity index 100% rename from calculate_waste.py rename to calculate-waste
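The libdivide 5.2.0 update in PATCH 52 above also splits the 32- and 64-bit
scalar division routines into `*_do_raw` functions that take the precomputed
`magic` and `more` fields directly, with the struct-based `*_do` functions now
thin wrappers over them. A minimal self-contained check of that equivalence —
the include path matches this tree's third_party/ copy, while the divisor and
numerator values are arbitrary:

```c
#include <assert.h>
#include <stdint.h>
#include "third_party/libdivide.h"

int main(void) {
    // Precompute the magic/more pair for dividing by 7, then divide the same
    // numerator through the struct-based and raw entry points.
    struct libdivide_u32_t d = libdivide_u32_gen(7);
    uint32_t n = 1000000007u;
    uint32_t q_struct = libdivide_u32_do(n, &d);
    uint32_t q_raw = libdivide_u32_do_raw(n, d.magic, d.more);
    assert(q_struct == q_raw);
    assert(q_struct == n / 7);
    return 0;
}
```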