diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index e77baba..8470947 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -9,30 +9,14 @@ on: jobs: build-ubuntu-gcc: runs-on: ubuntu-latest - strategy: - matrix: - version: [12, 13, 14] steps: - uses: actions/checkout@v4 - - name: Setting up gcc version - run: | - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${{ matrix.version }} 100 - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${{ matrix.version }} 100 - name: Build run: make test build-ubuntu-clang: runs-on: ubuntu-latest - strategy: - matrix: - version: [14, 15, 16, 17, 18] steps: - uses: actions/checkout@v4 - - name: Install dependencies - run: sudo apt-get update && sudo apt-get install -y --no-install-recommends clang-14 clang-15 - - name: Setting up clang version - run: | - sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{ matrix.version }} 100 - sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-${{ matrix.version }} 100 - name: Build run: CC=clang CXX=clang++ make test build-musl: diff --git a/Android.bp b/Android.bp index f6a7a9c..11725a6 100644 --- a/Android.bp +++ b/Android.bp @@ -5,6 +5,8 @@ common_cflags = [ "-fPIC", "-fvisibility=hidden", //"-fno-plt", + "-Wall", + "-Wextra", "-Wcast-align", "-Wcast-qual", "-Wwrite-strings", @@ -71,9 +73,6 @@ cc_library { debuggable: { cflags: ["-DLABEL_MEMORY"], }, - device_has_arm_mte: { - cflags: ["-DHAS_ARM_MTE", "-march=armv8-a+dotprod+memtag"] - }, }, apex_available: [ "com.android.runtime", diff --git a/CREDITS b/CREDITS index 31b6875..3ad8617 100644 --- a/CREDITS +++ b/CREDITS @@ -54,230 +54,3 @@ libdivide: random.c get_random_{type}_uniform functions are based on Fast Random Integer Generation in an Interval by Daniel Lemire - -arm_mte.h arm_mte_tag_and_clear_mem function contents were copied from storeTags function in scudo: - - ============================================================================== - The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: - ============================================================================== - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. 
- - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - - ---- LLVM Exceptions to the Apache 2.0 License ---- - - As an exception, if, as a result of your compiling your source code, portions - of this Software are embedded into an Object form of such source code, you - may redistribute such embedded portions in such Object form without complying - with the conditions of Sections 4(a), 4(b) and 4(d) of the License. - - In addition, if you combine or link compiled forms of this Software with - software that is licensed under the GPLv2 ("Combined Software") and if a - court of competent jurisdiction determines that the patent provision (Section - 3), the indemnity provision (Section 9) or other Section of the License - conflicts with the conditions of the GPLv2, you may retroactively and - prospectively choose to deem waived or otherwise exclude such Section(s) of - the License, but only in their entirety and only with respect to the Combined - Software. 
- - ============================================================================== diff --git a/LICENSE b/LICENSE index af4b965..5311a0f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright © 2018-2024 GrapheneOS +Copyright © 2018-2023 GrapheneOS Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 6a1a91b..b3f820f 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ there will be custom integration offering better performance in the future along with other hardening for the C standard library implementation. For Android, only the current generation, actively developed maintenance branch of the Android -Open Source Project will be supported, which currently means `android15-release`. +Open Source Project will be supported, which currently means `android13-qpr2-release`. ## Testing @@ -159,9 +159,6 @@ line to the `/etc/ld.so.preload` configuration file: The format of this configuration file is a whitespace-separated list, so it's good practice to put each library on a separate line. -On Debian systems `libhardened_malloc.so` should be installed into `/usr/lib/` -to avoid preload failures caused by AppArmor profile restrictions. - Using the `LD_PRELOAD` environment variable to load it on a case-by-case basis will not work when `AT_SECURE` is set such as with setuid binaries. It's also generally not a recommended approach for production usage. The recommendation @@ -473,16 +470,16 @@ was a bit less important and if a core goal was finding latent bugs. * Errors other than ENOMEM from mmap, munmap, mprotect and mremap treated as fatal, which can help to detect memory management gone wrong elsewhere in the process. -* Memory tagging for slab allocations via MTE on ARMv8.5+ +* [future] Memory tagging for slab allocations via MTE on ARMv8.5+ * random memory tags as the baseline, providing probabilistic protection against various forms of memory corruption * dedicated tag for free slots, set on free, for deterministic protection against accessing freed memory + * store previous random tag within freed slab allocations, and increment it + to get the next tag for that slot to provide deterministic use-after-free + detection through multiple cycles of memory reuse * guarantee distinct tags for adjacent memory allocations by incrementing past matching values for deterministic detection of linear overflows - * [future] store previous random tag and increment it to get the next tag - for that slot to provide deterministic use-after-free detection through - multiple cycles of memory reuse ## Randomness @@ -724,46 +721,77 @@ freeing as there would be if the kernel supported these features directly. ## Memory tagging -Random tags are set for all slab allocations when allocated, with 4 excluded values: +Integrating extensive support for ARMv8.5 memory tagging is planned and this +section will be expanded to cover the details on the chosen design. The approach +for slab allocations is currently covered, but it can also be used for the +allocator metadata region and large allocations. -1. the reserved `0` tag -2. the previous tag used for the slot -3. the current (or previous) tag used for the slot to the left -4. 
the current (or previous) tag used for the slot to the right

+Memory allocations are already always multiples of naturally aligned 16 byte
+units, so memory tags (which cover 16 byte granules) are a natural fit for a
+malloc implementation. The only extra memory consumption will come from the
+hardware-supported storage for the tag values (4 bits per 16 bytes).

-When a slab allocation is freed, the reserved `0` tag is set for the slot.
-Slab allocation slots are cleared before reuse when memory tagging is enabled.

+The baseline policy will be to generate random tags for each slab allocation
+slot on first use. The highest value will be reserved for marking freed memory
+allocations to detect any accesses to freed memory, so it won't be part of the
+generated range. Adjacent slots will be guaranteed to have distinct memory tags
+in order to guarantee that linear overflows are detected. There are a few ways
+of implementing this and it will end up depending on the performance costs of
+different approaches. If there's an efficient way to fetch the adjacent tag
+values without wasting extra memory, it will be possible to check for them and
+skip them, either by generating a new random value in a loop or by incrementing
+past them, since the tiny bit of bias wouldn't matter. Another approach would
+be alternating odd and even tag values, but that would substantially reduce the
+overall randomness of the tags, and there's very little entropy from the start.

-This ensures the following properties:

+Once a slab allocation has been freed, the tag will be set to the reserved
+value for free memory and the previous tag value will be stored inside the
+allocation itself. The next time the slot is allocated, the chosen tag value
+will be the previous value incremented by one to provide use-after-free
+detection between generations of allocations. The stored tag will be wiped
+before retagging the memory, to avoid leaking it and as part of preserving the
+security property of newly allocated memory being zeroed due to zero-on-free.
+It will eventually wrap all the way around, but this ends up providing a strong
+guarantee for many allocation cycles due to the combination of 4 bit tags with
+the FIFO quarantine feature providing delayed free. It also benefits from
+random slot allocation and the randomized portion of delayed free, which result
+in a further delay along with preventing a deterministic bypass by forcing a
+reuse after a certain number of allocation cycles. Similarly to the initial tag
+generation, tag values for adjacent allocations will be skipped by incrementing
+past them.

-- Linear overflows are deterministically detected.
-- Use-after-free is deterministically detected until the freed slot goes
-  through both the random and FIFO quarantines, gets allocated again, goes
-  through both quarantines again and then finally gets allocated a 2nd time.
-- Since the default `0` tag is reserved, untagged pointers can't access slab
-  allocations and vice versa.

+For example, consider this slab of allocations that are not yet used, with 15
+representing the tag for free memory. For the sake of simplicity, there will
+be no quarantine or other slabs in this example.

-Slab allocations are done in a statically reserved region for each size class
-and all metadata is in a statically reserved region, so interactions between
-different uses of the same address space are not applicable.
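+Before walking through it, here is a rough C sketch of the selection policy
+just described. This is only an illustration of the rules above, not the
+planned implementation: the names, the `rand()` stand-in for a real entropy
+source and the free-tag value 15 are assumptions made for the example.
+
+```c
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#define TAG_FREE 15 // reserved for freed memory in this design
+
+// Pick the next 4-bit tag for a slot: a previously used slot continues
+// from its stored tag + 1, a fresh slot starts from a random non-reserved
+// tag. Either way, increment past the reserved free tag and the current
+// tags of both neighboring slots.
+static uint8_t choose_tag(bool used_before, uint8_t prev,
+                          uint8_t left, uint8_t right) {
+    uint8_t tag = used_before ? (uint8_t) ((prev + 1) & 0xf)
+                              : (uint8_t) (rand() % TAG_FREE); // 0..14
+    while (tag == TAG_FREE || tag == left || tag == right) {
+        tag = (uint8_t) ((tag + 1) & 0xf);
+    }
+    return tag;
+}
+```
+
+The walkthrough below starts with every slot holding the free tag: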
+ | 15 | 15 | 15 | 15 | 15 | 15 |

-Large allocations beyond the largest slab allocation size class (128k by
-default) are guaranteed to have randomly sized guard regions to the left and
-right. Random and FIFO address space quarantines provide use-after-free
-detection. We need to test whether the cost of random tags is acceptable to
-enable them by default, since they would be useful for:
+Three slots are randomly chosen for allocations, with random tags assigned (2,
+7, 14) since these slots haven't ever been used and don't have saved values:

-- probabilistic detection of overflows
-- probabilistic detection of use-after-free once the address space is
-  out of the quarantine and reused for another allocation
-- deterministic detection of use-after-free for reuse by another allocator.
+ | 15 | 2 | 15 | 7 | 14 | 15 |

-When memory tagging is enabled, checking for write-after-free at allocation
-time and checking canaries are both disabled. Canaries will be more thoroughly
-disabled when using memory tagging in the future, but Android currently has
-[very dynamic memory tagging support](https://source.android.com/docs/security/test/memory-safety/arm-mte)
-where it can be disabled at any time, which creates a barrier to optimizing
-by disabling redundant features.
+The 2nd allocation slot is freed, and is set back to the tag for free memory
+(15), but with the previous tag value stored in the freed space:
+
+ | 15 | 15 | 15 | 7 | 14 | 15 |
+
+The first slot is allocated for the first time, receiving the random value 3:
+
+ | 3 | 15 | 15 | 7 | 14 | 15 |
+
+The 2nd slot is randomly chosen again, so the previous tag (2) is retrieved and
+incremented to 3 as part of the use-after-free mitigation. An adjacent
+allocation already uses the tag 3, so the tag is further incremented to 4 (it
+would be incremented to 5 if one of the adjacent tags was 4):
+
+ | 3 | 4 | 15 | 7 | 14 | 15 |
+
+The last slot is randomly chosen for the next allocation, and is assigned
+the random value 14. However, it's placed next to an allocation with the tag
+14, so the tag is incremented past it and past the reserved value 15, wrapping
+around to 0:
+
+ | 3 | 4 | 15 | 7 | 14 | 0 |

 ## API extensions
diff --git a/androidtest/Android.bp b/androidtest/Android.bp
deleted file mode 100644
index ae0aa49..0000000
--- a/androidtest/Android.bp
+++ /dev/null
@@ -1,25 +0,0 @@
-java_test_host {
-    name: "HMallocTest",
-    srcs: [
-        "src/**/*.java",
-    ],
-
-    libs: [
-        "tradefed",
-        "compatibility-tradefed",
-        "compatibility-host-util",
-    ],
-
-    static_libs: [
-        "cts-host-utils",
-        "frameworks-base-hostutils",
-    ],
-
-    test_suites: [
-        "general-tests",
-    ],
-
-    data_device_bins_64: [
-        "memtag_test",
-    ],
-}
diff --git a/androidtest/AndroidTest.xml b/androidtest/AndroidTest.xml
deleted file mode 100644
index 333f1dd..0000000
--- a/androidtest/AndroidTest.xml
+++ /dev/null
@@ -1,13 +0,0 @@
-
-
-
-
-
-
-
-
-
diff --git a/androidtest/memtag/Android.bp b/androidtest/memtag/Android.bp
deleted file mode 100644
index 75287f6..0000000
--- a/androidtest/memtag/Android.bp
+++ /dev/null
@@ -1,17 +0,0 @@
-cc_test {
-    name: "memtag_test",
-    srcs: ["memtag_test.cc"],
-    cflags: [
-        "-Wall",
-        "-Werror",
-        "-Wextra",
-        "-O0",
-        "-march=armv9-a+memtag",
-    ],
-
-    compile_multilib: "64",
-
-    sanitize: {
-        memtag_heap: true,
-    },
-}
diff --git a/androidtest/memtag/memtag_test.cc b/androidtest/memtag/memtag_test.cc
deleted file mode 100644
index f858292..0000000
--- a/androidtest/memtag/memtag_test.cc
+++ /dev/null
@@ -1,351 +0,0 @@
-// needed to unconditionally enable assertions
-#undef NDEBUG
-#include <assert.h>
-#include <limits.h>
-#include <malloc.h>
-#include <math.h>
-#include <signal.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/utsname.h>
-
-#include <algorithm>
-#include <map>
-#include <set>
-#include <string>
-#include <unordered_map>
-
-#include "../../arm_mte.h"
-
-using namespace std;
-
-using u8 = uint8_t;
-using uptr = uintptr_t;
-using u64 = uint64_t;
-
-const size_t DEFAULT_ALLOC_SIZE = 8;
-const size_t CANARY_SIZE = 8;
-
-void do_context_switch() {
-    utsname s;
-    uname(&s);
-}
-
-u8 get_pointer_tag(void *ptr) {
-    return (((uptr) ptr) >> 56) & 0xf;
-}
-
-void *untag_pointer(void *ptr) {
-    const uintptr_t mask = UINTPTR_MAX >> 8;
-    return (void *) ((uintptr_t) ptr & mask);
-}
-
-void *set_pointer_tag(void *ptr, u8 tag) {
-    return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr));
-}
-
-// This test checks that slab slot allocation uses a tag that is distinct from the tags of its
-// neighbors and from the tag of the previous allocation that used the same slot
-void tag_distinctness() {
-    // tag 0 is reserved
-    const int min_tag = 1;
-    const int max_tag = 0xf;
-
-    struct SizeClass {
-        int size;
-        int slot_cnt;
-    };
-
-    // values from size_classes[] and size_class_slots[] in h_malloc.c
-    SizeClass size_classes[] = {
-        { .size = 16, .slot_cnt = 256, },
-        { .size = 32, .slot_cnt = 128, },
-        // this size class is used by allocations that are made by the addr_tag_map, which breaks
-        // tag distinctness checks
-        // { .size = 48, .slot_cnt = 85, },
-        { .size = 64, .slot_cnt = 64, },
-        { .size = 80, .slot_cnt = 51, },
-        { .size = 96, .slot_cnt = 42, },
-        { .size = 112, .slot_cnt = 36, },
-        { .size = 128, .slot_cnt = 64, },
-        { .size = 160, .slot_cnt = 51, },
-        { .size = 192, .slot_cnt = 64, },
-        { .size = 224, .slot_cnt = 54, },
-        { .size = 10240, .slot_cnt = 6, },
-        { .size = 20480, .slot_cnt = 1, },
-    };
-
-    int tag_usage[max_tag + 1] = {};
-
-    for (size_t sc_idx = 0; sc_idx < sizeof(size_classes) / sizeof(SizeClass); ++sc_idx) {
-        SizeClass &sc = size_classes[sc_idx];
-
-        const size_t full_alloc_size = sc.size;
-        const size_t alloc_size = full_alloc_size - CANARY_SIZE;
-
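-        // the request is sized so that, once the allocator appends the 8-byte
-        // canary, the allocation fills its slot exactly; slots within a slab
-        // are therefore full_alloc_size bytes apart, which the neighbor
-        // lookups below rely on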
-        // "tdc" is short for "tag distinctness check"
-        int left_neighbor_tdc_cnt = 0;
-        int right_neighbor_tdc_cnt = 0;
-        int prev_alloc_tdc_cnt = 0;
-
-        int iter_cnt = 600;
-
-        unordered_map<uptr, u8> addr_tag_map;
-        addr_tag_map.reserve(iter_cnt * sc.slot_cnt);
-
-        u64 seen_tags = 0;
-
-        for (int iter = 0; iter < iter_cnt; ++iter) {
-            uptr allocations[256]; // 256 is max slot count
-
-            for (int i = 0; i < sc.slot_cnt; ++i) {
-                u8 *p = (u8 *) malloc(alloc_size);
-                assert(p);
-                uptr addr = (uptr) untag_pointer(p);
-                u8 tag = get_pointer_tag(p);
-
-                assert(tag >= min_tag && tag <= max_tag);
-                seen_tags |= 1 << tag;
-                ++tag_usage[tag];
-
-                // check most recent tags of left and right neighbors
-
-                auto left = addr_tag_map.find(addr - full_alloc_size);
-                if (left != addr_tag_map.end()) {
-                    assert(left->second != tag);
-                    ++left_neighbor_tdc_cnt;
-                }
-
-                auto right = addr_tag_map.find(addr + full_alloc_size);
-                if (right != addr_tag_map.end()) {
-                    assert(right->second != tag);
-                    ++right_neighbor_tdc_cnt;
-                }
-
-                // check previous tag of this slot
-                auto prev = addr_tag_map.find(addr);
-                if (prev != addr_tag_map.end()) {
-                    assert(prev->second != tag);
-                    ++prev_alloc_tdc_cnt;
-                    addr_tag_map.erase(addr);
-                }
-
-                addr_tag_map.emplace(addr, tag);
-
-                for (size_t j = 0; j < alloc_size; ++j) {
-                    // check that slot is zeroed
-                    assert(p[j] == 0);
-                    // check that slot is readable and writable
-                    p[j]++;
-                }
-
-                allocations[i] = addr;
-            }
-
-            // free some of the allocations to allow their slots to be reused
-            for (int i = sc.slot_cnt - 1; i >= 0; i -= 2) {
-                free((void *) allocations[i]);
-            }
-        }
-
-        // check that all of the tags were used, except for the reserved tag 0
-        assert(seen_tags == (0xffff & ~(1 << 0)));
-
-        printf("size_class\t%i\t" "tdc_left %i\t" "tdc_right %i\t" "tdc_prev_alloc %i\n",
-               sc.size, left_neighbor_tdc_cnt, right_neighbor_tdc_cnt, prev_alloc_tdc_cnt);
-
-        // make sure tag distinctness checks were actually performed
-        int min_tdc_cnt = sc.slot_cnt * iter_cnt / 5;
-
-        assert(prev_alloc_tdc_cnt > min_tdc_cnt);
-
-        if (sc.slot_cnt > 1) {
-            assert(left_neighbor_tdc_cnt > min_tdc_cnt);
-            assert(right_neighbor_tdc_cnt > min_tdc_cnt);
-        }
-
-        // async tag check failures are reported on context switch
-        do_context_switch();
-    }
-
-    printf("\nTag use counters:\n");
-
-    int min = INT_MAX;
-    int max = 0;
-    double geomean = 0.0;
-    for (int i = min_tag; i <= max_tag; ++i) {
-        int v = tag_usage[i];
-        geomean += log(v);
-        min = std::min(min, v);
-        max = std::max(max, v);
-        printf("%i\t%i\n", i, tag_usage[i]);
-    }
-    int tag_cnt = 1 + max_tag - min_tag;
-    geomean = exp(geomean / tag_cnt);
-
-    double max_deviation = std::max((double) max - geomean, geomean - min);
-
-    printf("geomean: %.2f, max deviation from geomean: %.2f%%\n", geomean, (100.0 * max_deviation) / geomean);
-}
-
-u8* alloc_default() {
-    const size_t full_alloc_size = DEFAULT_ALLOC_SIZE + CANARY_SIZE;
-    set<uptr> addrs;
-
-    // make sure allocation has both left and right neighbors, otherwise overflow/underflow tests
-    // will fail when allocation is at the end/beginning of slab
-    for (;;) {
-        u8 *p = (u8 *) malloc(DEFAULT_ALLOC_SIZE);
-        assert(p);
-
-        uptr addr = (uptr) untag_pointer(p);
-        uptr left = addr - full_alloc_size;
-        if (addrs.find(left) != addrs.end()) {
-            uptr right = addr + full_alloc_size;
-            if (addrs.find(right) != addrs.end()) {
-                return p;
-            }
-        }
-
-        addrs.emplace(addr);
-    }
-}
-
-int expected_segv_code;
-
-#define expect_segv(exp, segv_code) ({\
-    expected_segv_code = segv_code; \
-    volatile auto val = exp; \
-    (void) val; \
-    do_context_switch(); \
-    fprintf(stderr, "didn't receive SEGV code %i", segv_code); \
-    exit(1); })
-
-// it's expected that the device is configured to use asymm MTE tag checking mode (sync read checks,
-// async write checks)
-#define expect_read_segv(exp) expect_segv(exp, SEGV_MTESERR)
-#define expect_write_segv(exp) expect_segv(exp, SEGV_MTEAERR)
-
-void read_after_free() {
-    u8 *p = alloc_default();
-    free(p);
-    expect_read_segv(p[0]);
-}
-
-void write_after_free() {
-    u8 *p = alloc_default();
-    free(p);
-    expect_write_segv(p[0] = 1);
-}
-
-void underflow_read() {
-    u8 *p = alloc_default();
-    expect_read_segv(p[-1]);
-}
-
-void underflow_write() {
-    u8 *p = alloc_default();
-    expect_write_segv(p[-1] = 1);
-}
-
-void overflow_read() {
-    u8 *p = alloc_default();
-    expect_read_segv(p[DEFAULT_ALLOC_SIZE + CANARY_SIZE]);
-}
-
-void overflow_write() {
-    u8 *p = alloc_default();
-    expect_write_segv(p[DEFAULT_ALLOC_SIZE + CANARY_SIZE] = 1);
-}
-
-void untagged_read() {
-    u8 *p = alloc_default();
-    p = (u8 *) untag_pointer(p);
-    expect_read_segv(p[0]);
-}
-
-void untagged_write() {
-    u8 *p = alloc_default();
-    p = (u8 *) untag_pointer(p);
-    expect_write_segv(p[0] = 1);
-}
-
-// checks that each of the memory locations inside the buffer is tagged with expected_tag
-void check_tag(void *buf, size_t len, u8 expected_tag) {
-    for (size_t i = 0; i < len; ++i) {
-        assert(get_pointer_tag(__arm_mte_get_tag((void *) ((uintptr_t) buf + i))) == expected_tag);
-    }
-}
-
-void madvise_dontneed() {
-    const size_t len = 100'000;
-    void *ptr = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_MTE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-    assert(ptr != MAP_FAILED);
-
-    // check that 0 is the initial tag
-    check_tag(ptr, len, 0);
-
-    arm_mte_tag_and_clear_mem(set_pointer_tag(ptr, 1), len);
-    check_tag(ptr, len, 1);
-
-    memset(set_pointer_tag(ptr, 1), 1, len);
-
-    assert(madvise(ptr, len, MADV_DONTNEED) == 0);
-    // check that MADV_DONTNEED resets the tag
-    check_tag(ptr, len, 0);
-
-    // check that MADV_DONTNEED clears the memory
-    for (size_t i = 0; i < len; ++i) {
-        assert(((u8 *) ptr)[i] == 0);
-    }
-
-    // check that mistagged read after MADV_DONTNEED fails
-    expect_read_segv(*((u8 *) set_pointer_tag(ptr, 1)));
-}
-
-map<string, void (*)()> tests = {
-#define TEST(s) { #s, s }
-    TEST(tag_distinctness),
-    TEST(read_after_free),
-    TEST(write_after_free),
-    TEST(overflow_read),
-    TEST(overflow_write),
-    TEST(underflow_read),
-    TEST(underflow_write),
-    TEST(untagged_read),
-    TEST(untagged_write),
-    TEST(madvise_dontneed),
-#undef TEST
-};
-
-void segv_handler(int, siginfo_t *si, void *) {
-    if (expected_segv_code == 0 || expected_segv_code != si->si_code) {
-        fprintf(stderr, "received unexpected SEGV_CODE %i", si->si_code);
-        exit(139); // standard exit code for SIGSEGV
-    }
-
-    exit(0);
-}
-
-int main(int argc, char **argv) {
-    setbuf(stdout, NULL);
-    assert(argc == 2);
-
-    auto test_name = string(argv[1]);
-    auto test_fn = tests[test_name];
-    assert(test_fn != nullptr);
-
-    assert(mallopt(M_BIONIC_SET_HEAP_TAGGING_LEVEL, M_HEAP_TAGGING_LEVEL_ASYNC) == 1);
-
-    struct sigaction sa = {
-        .sa_sigaction = segv_handler,
-        .sa_flags = SA_SIGINFO,
-    };
-
-    assert(sigaction(SIGSEGV, &sa, nullptr) == 0);
-
-    test_fn();
-    do_context_switch();
-
-    return 0;
-}
diff --git a/androidtest/src/grapheneos/hmalloc/MemtagTest.java b/androidtest/src/grapheneos/hmalloc/MemtagTest.java
deleted file mode 100644
index be04bd9..0000000
--- a/androidtest/src/grapheneos/hmalloc/MemtagTest.java
+++ /dev/null
@@ -1,79 +0,0 @@
-package grapheneos.hmalloc;
-
-import com.android.tradefed.device.DeviceNotAvailableException;
-import com.android.tradefed.testtype.DeviceJUnit4ClassRunner;
-import com.android.tradefed.testtype.junit4.BaseHostJUnit4Test;
-
-import org.junit.Test;
-import org.junit.runner.RunWith;
-
-import java.util.ArrayList;
-
-import static org.junit.Assert.assertEquals;
-
-@RunWith(DeviceJUnit4ClassRunner.class)
-public class MemtagTest extends BaseHostJUnit4Test {
-    private static final String TEST_BINARY = "/data/local/tmp/memtag_test";
-
-    private void runTest(String name) throws DeviceNotAvailableException {
-        var args = new ArrayList<String>();
-        args.add(TEST_BINARY);
-        args.add(name);
-        String cmdLine = String.join(" ", args);
-
-        var result = getDevice().executeShellV2Command(cmdLine);
-
-        assertEquals("stderr", "", result.getStderr());
-        assertEquals("process exit code", 0, result.getExitCode().intValue());
-    }
-
-    @Test
-    public void tag_distinctness() throws DeviceNotAvailableException {
-        runTest("tag_distinctness");
-    }
-
-    @Test
-    public void read_after_free() throws DeviceNotAvailableException {
-        runTest("read_after_free");
-    }
-
-    @Test
-    public void write_after_free() throws DeviceNotAvailableException {
-        runTest("write_after_free");
-    }
-
-    @Test
-    public void underflow_read() throws DeviceNotAvailableException {
-        runTest("underflow_read");
-    }
-
-    @Test
-    public void underflow_write() throws DeviceNotAvailableException {
-        runTest("underflow_write");
-    }
-
-    @Test
-    public void overflow_read() throws DeviceNotAvailableException {
-        runTest("overflow_read");
-    }
-
-    @Test
-    public void overflow_write() throws DeviceNotAvailableException {
-        runTest("overflow_write");
-    }
-
-    @Test
-    public void untagged_read() throws DeviceNotAvailableException {
-        runTest("untagged_read");
-    }
-
-    @Test
-    public void untagged_write() throws DeviceNotAvailableException {
-        runTest("untagged_write");
-    }
-
-    @Test
-    public void madvise_dontneed() throws DeviceNotAvailableException {
-        runTest("madvise_dontneed");
-    }
-}
diff --git a/arm_mte.h b/arm_mte.h
deleted file mode 100644
index 5ed900d..0000000
--- a/arm_mte.h
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef ARM_MTE_H
-#define ARM_MTE_H
-
-#include <arm_acle.h>
-#include <stdint.h>
-
-// Returns a tagged pointer.
-// See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions/IRG--Insert-Random-Tag-
-static inline void *arm_mte_create_random_tag(void *p, uint64_t exclusion_mask) {
-    return __arm_mte_create_random_tag(p, exclusion_mask);
-}
-
-// Tag the memory region with the tag specified in the tag bits of tagged_ptr. The memory region
-// itself is zeroed.
-// tagged_ptr has to be aligned by 16, and len has to be a multiple of 16 (tag granule size).
-//
-// Arm's software optimization guide says:
-// "it is recommended to use STZGM (or DCZGVA) to set tag if data is not a concern." (STZGM and
-// DC GZVA are zeroing variants of tagging instructions).
-// -// Contents of this function were copied from scudo: -// https://android.googlesource.com/platform/external/scudo/+/refs/tags/android-14.0.0_r1/standalone/memtag.h#167 -// -// scudo is licensed under the Apache License v2.0 with LLVM Exceptions, which is compatible with -// the hardened_malloc's MIT license -static inline void arm_mte_tag_and_clear_mem(void *tagged_ptr, size_t len) { - uintptr_t Begin = (uintptr_t) tagged_ptr; - uintptr_t End = Begin + len; - uintptr_t LineSize, Next, Tmp; - __asm__ __volatile__( - ".arch_extension memtag \n\t" - - // Compute the cache line size in bytes (DCZID_EL0 stores it as the log2 - // of the number of 4-byte words) and bail out to the slow path if DCZID_EL0 - // indicates that the DC instructions are unavailable. - "DCZID .req %[Tmp] \n\t" - "mrs DCZID, dczid_el0 \n\t" - "tbnz DCZID, #4, 3f \n\t" - "and DCZID, DCZID, #15 \n\t" - "mov %[LineSize], #4 \n\t" - "lsl %[LineSize], %[LineSize], DCZID \n\t" - ".unreq DCZID \n\t" - - // Our main loop doesn't handle the case where we don't need to perform any - // DC GZVA operations. If the size of our tagged region is less than - // twice the cache line size, bail out to the slow path since it's not - // guaranteed that we'll be able to do a DC GZVA. - "Size .req %[Tmp] \n\t" - "sub Size, %[End], %[Cur] \n\t" - "cmp Size, %[LineSize], lsl #1 \n\t" - "b.lt 3f \n\t" - ".unreq Size \n\t" - - "LineMask .req %[Tmp] \n\t" - "sub LineMask, %[LineSize], #1 \n\t" - - // STZG until the start of the next cache line. - "orr %[Next], %[Cur], LineMask \n\t" - - "1:\n\t" - "stzg %[Cur], [%[Cur]], #16 \n\t" - "cmp %[Cur], %[Next] \n\t" - "b.lt 1b \n\t" - - // DC GZVA cache lines until we have no more full cache lines. - "bic %[Next], %[End], LineMask \n\t" - ".unreq LineMask \n\t" - - "2: \n\t" - "dc gzva, %[Cur] \n\t" - "add %[Cur], %[Cur], %[LineSize] \n\t" - "cmp %[Cur], %[Next] \n\t" - "b.lt 2b \n\t" - - // STZG until the end of the tagged region. This loop is also used to handle - // slow path cases. 
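-        // (regions smaller than twice the cache line size branch directly to
-        // this loop from the size check above)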
- - "3: \n\t" - "cmp %[Cur], %[End] \n\t" - "b.ge 4f \n\t" - "stzg %[Cur], [%[Cur]], #16 \n\t" - "b 3b \n\t" - - "4: \n\t" - - : [Cur] "+&r"(Begin), [LineSize] "=&r"(LineSize), [Next] "=&r"(Next), [Tmp] "=&r"(Tmp) - : [End] "r"(End) - : "memory" - ); -} -#endif diff --git a/calculate-waste b/calculate_waste.py similarity index 100% rename from calculate-waste rename to calculate_waste.py diff --git a/h_malloc.c b/h_malloc.c index 6221d0b..2dc0bde 100644 --- a/h_malloc.c +++ b/h_malloc.c @@ -14,7 +14,6 @@ #include "h_malloc.h" #include "memory.h" -#include "memtag.h" #include "mutex.h" #include "pages.h" #include "random.h" @@ -76,9 +75,6 @@ static union { struct region_metadata *regions[2]; #ifdef USE_PKEY int metadata_pkey; -#endif -#ifdef MEMTAG - bool is_memtag_disabled; #endif }; char padding[PAGE_SIZE]; @@ -88,30 +84,6 @@ static inline void *get_slab_region_end(void) { return atomic_load_explicit(&ro.slab_region_end, memory_order_acquire); } -#ifdef MEMTAG -static inline bool is_memtag_enabled(void) { - return !ro.is_memtag_disabled; -} -#endif - -static void *memory_map_tagged(size_t size) { -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - return memory_map_mte(size); - } -#endif - return memory_map(size); -} - -static bool memory_map_fixed_tagged(void *ptr, size_t size) { -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - return memory_map_fixed_mte(ptr, size); - } -#endif - return memory_map_fixed(ptr, size); -} - #define SLAB_METADATA_COUNT struct slab_metadata { @@ -127,18 +99,6 @@ struct slab_metadata { #if SLAB_QUARANTINE u64 quarantine_bitmap[4]; #endif -#ifdef HAS_ARM_MTE - // arm_mte_tags is used as a u4 array (MTE tags are 4-bit wide) - // - // Its size is calculated by the following formula: - // (MAX_SLAB_SLOT_COUNT + 2) / 2 - // MAX_SLAB_SLOT_COUNT is currently 256, 2 extra slots are needed for branchless handling of - // edge slots in tag_and_clear_slab_slot() - // - // It's intentionally placed at the end of struct to improve locality: for most size classes, - // slot count is far lower than MAX_SLAB_SLOT_COUNT. - u8 arm_mte_tags[129]; -#endif }; static const size_t min_align = 16; @@ -487,12 +447,6 @@ static void write_after_free_check(const char *p, size_t size) { return; } -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - return; - } -#endif - for (size_t i = 0; i < size; i += sizeof(u64)) { if (unlikely(*(const u64 *)(const void *)(p + i))) { fatal_error("detected write after free"); @@ -507,48 +461,19 @@ static void set_slab_canary_value(UNUSED struct slab_metadata *metadata, UNUSED 0x00ffffffffffffffUL; metadata->canary_value = get_random_u64(rng) & canary_mask; -#ifdef HAS_ARM_MTE - if (unlikely(metadata->canary_value == 0)) { - // 0 is reserved to support disabling MTE at runtime (this is required on Android). - // When MTE is enabled, writing and reading of canaries is disabled, i.e. canary remains zeroed. - // After MTE is disabled, canaries that are set to 0 are ignored, since they wouldn't match - // slab's metadata->canary_value. - // 0x100 was chosen arbitrarily, and can be encoded as an immediate value on ARM by the compiler. 
-        metadata->canary_value = 0x100;
-    }
-#endif
 #endif
 }
 
 static void set_canary(UNUSED const struct slab_metadata *metadata, UNUSED void *p, UNUSED size_t size) {
 #if SLAB_CANARY
-#ifdef HAS_ARM_MTE
-    if (likely51(is_memtag_enabled())) {
-        return;
-    }
-#endif
-
     memcpy((char *)p + size - canary_size, &metadata->canary_value, canary_size);
 #endif
 }
 
 static void check_canary(UNUSED const struct slab_metadata *metadata, UNUSED const void *p, UNUSED size_t size) {
 #if SLAB_CANARY
-#ifdef HAS_ARM_MTE
-    if (likely51(is_memtag_enabled())) {
-        return;
-    }
-#endif
-
     u64 canary_value;
     memcpy(&canary_value, (const char *)p + size - canary_size, canary_size);
-
-#ifdef HAS_ARM_MTE
-    if (unlikely(canary_value == 0)) {
-        return;
-    }
-#endif
-
     if (unlikely(canary_value != metadata->canary_value)) {
         fatal_error("canary corrupted");
     }
@@ -581,38 +506,6 @@ static inline void stats_slab_deallocate(UNUSED struct size_class *c, UNUSED siz
 #endif
 }
 
-#ifdef HAS_ARM_MTE
-static void *tag_and_clear_slab_slot(struct slab_metadata *metadata, void *slot_ptr, size_t slot_idx, size_t slot_size) {
-    // arm_mte_tags is an array of 4-bit unsigned integers stored as a u8 array (MTE tags are 4-bit wide)
-    //
-    // It stores the most recent tag for each slab slot, or 0 if the slot was never used.
-    // Slab indices in the arm_mte_tags array are shifted to the right by 1, and the size of this
-    // array is (MAX_SLAB_SLOT_COUNT + 2). This means that the first and last values of the
-    // arm_mte_tags array are always 0, which allows edge slots to be handled in a branchless way
-    // when the tag exclusion mask is constructed.
-    u8 *slot_tags = metadata->arm_mte_tags;
-
-    // tag exclusion mask
-    u64 tem = (1 << RESERVED_TAG);
-
-    // current or previous tag of left neighbor or 0 if there's no left neighbor or if it was never used
-    tem |= (1 << u4_arr_get(slot_tags, slot_idx));
-    // previous tag of this slot or 0 if it was never used
-    tem |= (1 << u4_arr_get(slot_tags, slot_idx + 1));
-    // current or previous tag of right neighbor or 0 if there's no right neighbor or if it was never used
-    tem |= (1 << u4_arr_get(slot_tags, slot_idx + 2));
-
-    void *tagged_ptr = arm_mte_create_random_tag(slot_ptr, tem);
-    // slot addresses and sizes are always aligned by 16
-    arm_mte_tag_and_clear_mem(tagged_ptr, slot_size);
-
-    // store new tag of this slot
-    u4_arr_set(slot_tags, slot_idx + 1, get_pointer_tag(tagged_ptr));
-
-    return tagged_ptr;
-}
-#endif
-
 static inline void *allocate_small(unsigned arena, size_t requested_size) {
     struct size_info info = get_size_info(requested_size);
     size_t size = likely(info.size) ?
info.size : 16; @@ -641,11 +534,6 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { if (requested_size) { write_after_free_check(p, size - canary_size); set_canary(metadata, p, size); -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - p = tag_and_clear_slab_slot(metadata, p, slot, size); - } -#endif } stats_small_allocate(c, size); @@ -678,11 +566,6 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { void *p = slot_pointer(size, slab, slot); if (requested_size) { set_canary(metadata, p, size); -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - p = tag_and_clear_slab_slot(metadata, p, slot, size); - } -#endif } stats_slab_allocate(c, slab_size); stats_small_allocate(c, size); @@ -705,11 +588,6 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { void *p = slot_pointer(size, slab, slot); if (requested_size) { set_canary(metadata, p, size); -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - p = tag_and_clear_slab_slot(metadata, p, slot, size); - } -#endif } stats_slab_allocate(c, slab_size); stats_small_allocate(c, size); @@ -734,11 +612,6 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) { if (requested_size) { write_after_free_check(p, size - canary_size); set_canary(metadata, p, size); -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - p = tag_and_clear_slab_slot(metadata, p, slot, size); - } -#endif } stats_small_allocate(c, size); @@ -821,16 +694,7 @@ static inline void deallocate_small(void *p, const size_t *expected_size) { if (likely(!is_zero_size)) { check_canary(metadata, p, size); - bool skip_zero = false; -#ifdef HAS_ARM_MTE - if (likely51(is_memtag_enabled())) { - arm_mte_tag_and_clear_mem(set_pointer_tag(p, RESERVED_TAG), size); - // metadata->arm_mte_tags is intentionally not updated, see tag_and_clear_slab_slot() - skip_zero = true; - } -#endif - - if (ZERO_ON_FREE && !skip_zero) { + if (ZERO_ON_FREE) { memset(p, 0, size - canary_size); } } @@ -908,7 +772,7 @@ static inline void deallocate_small(void *p, const size_t *expected_size) { if (c->empty_slabs_total + slab_size > max_empty_slabs_total) { int saved_errno = errno; - if (!memory_map_fixed_tagged(slab, slab_size)) { + if (!memory_map_fixed(slab, slab_size)) { label_slab(slab, slab_size, class); stats_slab_deallocate(c, slab_size); enqueue_free_slab(c, metadata); @@ -1210,14 +1074,13 @@ static inline void enforce_init(void) { } } -static struct mutex init_lock = MUTEX_INITIALIZER; - COLD static void init_slow_path(void) { + static struct mutex lock = MUTEX_INITIALIZER; - mutex_lock(&init_lock); + mutex_lock(&lock); if (unlikely(is_init())) { - mutex_unlock(&init_lock); + mutex_unlock(&lock); return; } @@ -1260,7 +1123,8 @@ COLD static void init_slow_path(void) { if (unlikely(memory_protect_rw_metadata(ra->regions, ra->total * sizeof(struct region_metadata)))) { fatal_error("failed to unprotect memory for regions table"); } - ro.slab_region_start = memory_map_tagged(slab_region_size); + + ro.slab_region_start = memory_map(slab_region_size); if (unlikely(ro.slab_region_start == NULL)) { fatal_error("failed to allocate slab region"); } @@ -1300,7 +1164,7 @@ COLD static void init_slow_path(void) { } memory_set_name(&ro, sizeof(ro), "malloc read-only after init"); - mutex_unlock(&init_lock); + mutex_unlock(&lock); // may allocate, so wait until the allocator is initialized to avoid deadlocking if (unlikely(pthread_atfork(full_lock, full_unlock, post_fork_child))) { @@ -1504,11 
+1368,6 @@ EXPORT void *h_calloc(size_t nmemb, size_t size) { if (!ZERO_ON_FREE && likely(p != NULL) && total_size && total_size <= max_slab_size_class) { memset(p, 0, total_size - canary_size); } -#ifdef HAS_ARM_MTE - // use an assert instead of adding a conditional to memset() above (freed memory is always - // zeroed when MTE is enabled) - static_assert(ZERO_ON_FREE, "disabling ZERO_ON_FREE reduces performance when ARM MTE is enabled"); -#endif return p; } @@ -1526,14 +1385,11 @@ EXPORT void *h_realloc(void *old, size_t size) { } } - void *old_orig = old; - old = untag_pointer(old); - size_t old_size; if (old < get_slab_region_end() && old >= ro.slab_region_start) { old_size = slab_usable_size(old); if (size <= max_slab_size_class && get_size_info(size).size == old_size) { - return old_orig; + return old; } thread_unseal_metadata(); } else { @@ -1646,7 +1502,7 @@ EXPORT void *h_realloc(void *old, size_t size) { if (copy_size > 0 && copy_size <= max_slab_size_class) { copy_size -= canary_size; } - memcpy(new, old_orig, copy_size); + memcpy(new, old, copy_size); if (old_size <= max_slab_size_class) { deallocate_small(old, NULL); } else { @@ -1687,8 +1543,6 @@ EXPORT void h_free(void *p) { return; } - p = untag_pointer(p); - if (p < get_slab_region_end() && p >= ro.slab_region_start) { thread_unseal_metadata(); deallocate_small(p, NULL); @@ -1712,8 +1566,6 @@ EXPORT void h_free_sized(void *p, size_t expected_size) { return; } - p = untag_pointer(p); - expected_size = adjust_size_for_canary(expected_size); if (p < get_slab_region_end() && p >= ro.slab_region_start) { @@ -1767,13 +1619,11 @@ static inline void memory_corruption_check_small(const void *p) { mutex_unlock(&c->lock); } -EXPORT size_t h_malloc_usable_size(H_MALLOC_USABLE_SIZE_CONST void *arg) { - if (arg == NULL) { +EXPORT size_t h_malloc_usable_size(H_MALLOC_USABLE_SIZE_CONST void *p) { + if (p == NULL) { return 0; } - const void *p = untag_const_pointer(arg); - if (p < get_slab_region_end() && p >= ro.slab_region_start) { thread_unseal_metadata(); memory_corruption_check_small(p); @@ -1905,7 +1755,7 @@ EXPORT int h_malloc_trim(UNUSED size_t pad) { struct slab_metadata *iterator = c->empty_slabs; while (iterator) { void *slab = get_slab(c, slab_size, iterator); - if (memory_map_fixed_tagged(slab, slab_size)) { + if (memory_map_fixed(slab, slab_size)) { break; } label_slab(slab, slab_size, class); @@ -2175,26 +2025,3 @@ COLD EXPORT int h_malloc_set_state(UNUSED void *state) { return -2; } #endif - -#ifdef __ANDROID__ -COLD EXPORT void h_malloc_disable_memory_tagging(void) { -#ifdef HAS_ARM_MTE - mutex_lock(&init_lock); - if (!ro.is_memtag_disabled) { - if (is_init()) { - if (unlikely(memory_protect_rw(&ro, sizeof(ro)))) { - fatal_error("failed to unprotect allocator data"); - } - ro.is_memtag_disabled = true; - if (unlikely(memory_protect_ro(&ro, sizeof(ro)))) { - fatal_error("failed to protect allocator data"); - } - } else { - // bionic calls this function very early in some cases - ro.is_memtag_disabled = true; - } - } - mutex_unlock(&init_lock); -#endif -} -#endif diff --git a/include/h_malloc.h b/include/h_malloc.h index 0eee395..5824abb 100644 --- a/include/h_malloc.h +++ b/include/h_malloc.h @@ -99,7 +99,6 @@ int h_malloc_iterate(uintptr_t base, size_t size, void (*callback)(uintptr_t ptr void *arg); void h_malloc_disable(void); void h_malloc_enable(void); -void h_malloc_disable_memory_tagging(void); #endif // hardened_malloc extensions diff --git a/memory.c b/memory.c index 2e54f6d..04afc23 100644 --- a/memory.c +++ 
b/memory.c @@ -17,8 +17,8 @@ #include "memory.h" #include "util.h" -static void *memory_map_prot(size_t size, int prot) { - void *p = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); +void *memory_map(size_t size) { + void *p = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); if (unlikely(p == MAP_FAILED)) { if (errno != ENOMEM) { fatal_error("non-ENOMEM mmap failure"); @@ -28,19 +28,8 @@ static void *memory_map_prot(size_t size, int prot) { return p; } -void *memory_map(size_t size) { - return memory_map_prot(size, PROT_NONE); -} - -#ifdef HAS_ARM_MTE -// Note that PROT_MTE can't be cleared via mprotect -void *memory_map_mte(size_t size) { - return memory_map_prot(size, PROT_MTE); -} -#endif - -static bool memory_map_fixed_prot(void *ptr, size_t size, int prot) { - void *p = mmap(ptr, size, prot, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); +bool memory_map_fixed(void *ptr, size_t size) { + void *p = mmap(ptr, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); bool ret = p == MAP_FAILED; if (unlikely(ret) && errno != ENOMEM) { fatal_error("non-ENOMEM MAP_FIXED mmap failure"); @@ -48,17 +37,6 @@ static bool memory_map_fixed_prot(void *ptr, size_t size, int prot) { return ret; } -bool memory_map_fixed(void *ptr, size_t size) { - return memory_map_fixed_prot(ptr, size, PROT_NONE); -} - -#ifdef HAS_ARM_MTE -// Note that PROT_MTE can't be cleared via mprotect -bool memory_map_fixed_mte(void *ptr, size_t size) { - return memory_map_fixed_prot(ptr, size, PROT_MTE); -} -#endif - bool memory_unmap(void *ptr, size_t size) { bool ret = munmap(ptr, size); if (unlikely(ret) && errno != ENOMEM) { diff --git a/memory.h b/memory.h index d5e336b..c04bfd9 100644 --- a/memory.h +++ b/memory.h @@ -11,13 +11,7 @@ int get_metadata_key(void); void *memory_map(size_t size); -#ifdef HAS_ARM_MTE -void *memory_map_mte(size_t size); -#endif bool memory_map_fixed(void *ptr, size_t size); -#ifdef HAS_ARM_MTE -bool memory_map_fixed_mte(void *ptr, size_t size); -#endif bool memory_unmap(void *ptr, size_t size); bool memory_protect_ro(void *ptr, size_t size); bool memory_protect_rw(void *ptr, size_t size); diff --git a/memtag.h b/memtag.h deleted file mode 100644 index e431283..0000000 --- a/memtag.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef MEMTAG_H -#define MEMTAG_H - -#include "util.h" - -#ifdef HAS_ARM_MTE -#include "arm_mte.h" -#define MEMTAG 1 -// Note that bionic libc always reserves tag 0 via PR_MTE_TAG_MASK prctl -#define RESERVED_TAG 0 -#define TAG_WIDTH 4 -#endif - -static inline void *untag_pointer(void *ptr) { -#ifdef HAS_ARM_MTE - const uintptr_t mask = UINTPTR_MAX >> 8; - return (void *) ((uintptr_t) ptr & mask); -#else - return ptr; -#endif -} - -static inline const void *untag_const_pointer(const void *ptr) { -#ifdef HAS_ARM_MTE - const uintptr_t mask = UINTPTR_MAX >> 8; - return (const void *) ((uintptr_t) ptr & mask); -#else - return ptr; -#endif -} - -static inline void *set_pointer_tag(void *ptr, u8 tag) { -#ifdef HAS_ARM_MTE - return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr)); -#else - (void) tag; - return ptr; -#endif -} - -static inline u8 get_pointer_tag(void *ptr) { -#ifdef HAS_ARM_MTE - return (((uintptr_t) ptr) >> 56) & 0xf; -#else - (void) ptr; - return 0; -#endif -} - -#endif diff --git a/test/malloc_info.c b/test/malloc_info.c index 3b99ead..50b256f 100644 --- a/test/malloc_info.c +++ b/test/malloc_info.c @@ -1,6 +1,5 @@ #include #include -#include #if defined(__GLIBC__) || defined(__ANDROID__) #include diff --git 
a/third_party/libdivide.h b/third_party/libdivide.h
index bddc763..e9a31d1 100644
--- a/third_party/libdivide.h
+++ b/third_party/libdivide.h
@@ -1,8 +1,8 @@
 // libdivide.h - Optimized integer division
 // https://libdivide.com
 //
-// Copyright (C) 2010 - 2022 ridiculous_fish,
-// Copyright (C) 2016 - 2022 Kim Walisch,
+// Copyright (C) 2010 - 2021 ridiculous_fish,
+// Copyright (C) 2016 - 2021 Kim Walisch,
 //
 // libdivide is dual-licensed under the Boost or zlib licenses.
 // You may use libdivide under the terms of either of these.
@@ -11,14 +11,11 @@
 #ifndef LIBDIVIDE_H
 #define LIBDIVIDE_H
 
-// *** Version numbers are auto generated - do not edit ***
-#define LIBDIVIDE_VERSION "5.2.0"
+#define LIBDIVIDE_VERSION "5.0"
 #define LIBDIVIDE_VERSION_MAJOR 5
-#define LIBDIVIDE_VERSION_MINOR 2
-#define LIBDIVIDE_VERSION_PATCH 0
+#define LIBDIVIDE_VERSION_MINOR 0
 
 #include <stdint.h>
-
 #if !defined(__AVR__)
 #include <stdio.h>
 #include <stdlib.h>
 #endif
@@ -27,29 +24,20 @@
 #if defined(LIBDIVIDE_SSE2)
 #include <emmintrin.h>
 #endif
-
 #if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)
 #include <immintrin.h>
 #endif
-
 #if defined(LIBDIVIDE_NEON)
 #include <arm_neon.h>
 #endif
 
-// Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics
-#if defined(_MSC_VER) && defined(LIBDIVIDE_X86_64) && (!defined(__clang__) || _MSC_VER>1930)
-#define LIBDIVIDE_X64_INTRINSICS
-#endif
-
 #if defined(_MSC_VER)
-#if defined(LIBDIVIDE_X64_INTRINSICS)
 #include <intrin.h>
-#endif
 #pragma warning(push)
 // disable warning C4146: unary minus operator applied
 // to unsigned type, result still unsigned
 #pragma warning(disable : 4146)
-// disable warning C4204: nonstandard extension used : non-constant aggregate
+// disable warning C4204: nonstandard extension used : non-constant aggregate 
 // initializer
 //
 // It's valid C99
@@ -250,32 +238,24 @@ static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfr
 static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
     int16_t numer, int16_t magic, uint8_t more);
 static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
-    int16_t numer, const struct libdivide_s16_t *denom);
+    int16_t numer, const struct libdivide_s16_t* denom);
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
-    uint16_t numer, uint16_t magic, uint8_t more);
+    uint16_t numer, uint16_t magic, uint8_t more); 
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
-    uint16_t numer, const struct libdivide_u16_t *denom);
-static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(
-    int32_t numer, int32_t magic, uint8_t more);
+    uint16_t numer, const struct libdivide_u16_t* denom);
 static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
     int32_t numer, const struct libdivide_s32_t *denom);
-static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(
-    uint32_t numer, uint32_t magic, uint8_t more);
 static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(
     uint32_t numer, const struct libdivide_u32_t *denom);
-static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(
-    int64_t numer, int64_t magic, uint8_t more);
 static LIBDIVIDE_INLINE int64_t libdivide_s64_do(
     int64_t numer, const struct libdivide_s64_t *denom);
-static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(
-    uint64_t numer, uint64_t magic, uint8_t more);
 static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(
     uint64_t numer, const struct libdivide_u64_t *denom);
 static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
-    int16_t numer, const struct libdivide_s16_branchfree_t *denom);
+    int16_t numer, const struct libdivide_s16_branchfree_t* denom);
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
-    uint16_t numer, const struct
diff --git a/test/malloc_info.c b/test/malloc_info.c
index 3b99ead..50b256f 100644
--- a/test/malloc_info.c
+++ b/test/malloc_info.c
@@ -1,6 +1,5 @@
 #include <pthread.h>
 #include <stdio.h>
-#include <stdlib.h>
 
 #if defined(__GLIBC__) || defined(__ANDROID__)
 #include <malloc.h>
diff --git a/third_party/libdivide.h b/third_party/libdivide.h
index bddc763..e9a31d1 100644
--- a/third_party/libdivide.h
+++ b/third_party/libdivide.h
@@ -1,8 +1,8 @@
 // libdivide.h - Optimized integer division
 // https://libdivide.com
 //
-// Copyright (C) 2010 - 2022 ridiculous_fish, <libdivide@ridiculousfish.com>
-// Copyright (C) 2016 - 2022 Kim Walisch, <kim.walisch@gmail.com>
+// Copyright (C) 2010 - 2021 ridiculous_fish, <libdivide@ridiculousfish.com>
+// Copyright (C) 2016 - 2021 Kim Walisch, <kim.walisch@gmail.com>
 //
 // libdivide is dual-licensed under the Boost or zlib licenses.
 // You may use libdivide under the terms of either of these.
@@ -11,14 +11,11 @@
 #ifndef LIBDIVIDE_H
 #define LIBDIVIDE_H
 
-// *** Version numbers are auto generated - do not edit ***
-#define LIBDIVIDE_VERSION "5.2.0"
+#define LIBDIVIDE_VERSION "5.0"
 #define LIBDIVIDE_VERSION_MAJOR 5
-#define LIBDIVIDE_VERSION_MINOR 2
-#define LIBDIVIDE_VERSION_PATCH 0
+#define LIBDIVIDE_VERSION_MINOR 0
 
 #include <stdint.h>
-
 #if !defined(__AVR__)
 #include <stdio.h>
 #include <stdlib.h>
@@ -27,29 +24,20 @@
 #if defined(LIBDIVIDE_SSE2)
 #include <emmintrin.h>
 #endif
-
 #if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)
 #include <immintrin.h>
 #endif
-
 #if defined(LIBDIVIDE_NEON)
 #include <arm_neon.h>
 #endif
 
-// Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics
-#if defined(_MSC_VER) && defined(LIBDIVIDE_X86_64) && (!defined(__clang__) || _MSC_VER>1930)
-#define LIBDIVIDE_X64_INTRINSICS
-#endif
-
 #if defined(_MSC_VER)
-#if defined(LIBDIVIDE_X64_INTRINSICS)
 #include <intrin.h>
-#endif
 #pragma warning(push)
 // disable warning C4146: unary minus operator applied
 // to unsigned type, result still unsigned
 #pragma warning(disable : 4146)
-// disable warning C4204: nonstandard extension used : non-constant aggregate
+// disable warning C4204: nonstandard extension used : non-constant aggregate 
 // initializer
 //
 // It's valid C99
@@ -250,32 +238,24 @@ static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfr
 static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
     int16_t numer, int16_t magic, uint8_t more);
 static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
-    int16_t numer, const struct libdivide_s16_t *denom);
+    int16_t numer, const struct libdivide_s16_t* denom);
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
-    uint16_t numer, uint16_t magic, uint8_t more);
+    uint16_t numer, uint16_t magic, uint8_t more); 
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
-    uint16_t numer, const struct libdivide_u16_t *denom);
-static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(
-    int32_t numer, int32_t magic, uint8_t more);
+    uint16_t numer, const struct libdivide_u16_t* denom);
 static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
     int32_t numer, const struct libdivide_s32_t *denom);
-static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(
-    uint32_t numer, uint32_t magic, uint8_t more);
 static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(
     uint32_t numer, const struct libdivide_u32_t *denom);
-static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(
-    int64_t numer, int64_t magic, uint8_t more);
 static LIBDIVIDE_INLINE int64_t libdivide_s64_do(
     int64_t numer, const struct libdivide_s64_t *denom);
-static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(
-    uint64_t numer, uint64_t magic, uint8_t more);
 static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(
     uint64_t numer, const struct libdivide_u64_t *denom);
 
 static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
-    int16_t numer, const struct libdivide_s16_branchfree_t *denom);
+    int16_t numer, const struct libdivide_s16_branchfree_t* denom);
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
-    uint16_t numer, const struct libdivide_u16_branchfree_t *denom);
+    uint16_t numer, const struct libdivide_u16_branchfree_t* denom);
 static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(
     int32_t numer, const struct libdivide_s32_branchfree_t *denom);
 static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
@@ -285,17 +265,17 @@ static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(
 static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
     uint64_t numer, const struct libdivide_u64_branchfree_t *denom);
 
-static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom);
-static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom);
+static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t* denom);
+static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t* denom);
 static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
 static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
 static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
 static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);
 
 static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(
-    const struct libdivide_s16_branchfree_t *denom);
+    const struct libdivide_s16_branchfree_t* denom);
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(
-    const struct libdivide_u16_branchfree_t *denom);
+    const struct libdivide_u16_branchfree_t* denom);
 static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(
     const struct libdivide_s32_branchfree_t *denom);
 static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(
@@ -334,7 +314,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) {
 }
 
 static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
-#if defined(LIBDIVIDE_X64_INTRINSICS)
+#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64)
     return __umulh(x, y);
 #elif defined(HAS_INT128_T)
     __uint128_t xl = x, yl = y;
@@ -360,7 +340,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
 }
 
 static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {
-#if defined(LIBDIVIDE_X64_INTRINSICS)
+#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64)
     return __mulh(x, y);
 #elif defined(HAS_INT128_T)
     __int128_t xl = x, yl = y;
@@ -413,7 +393,7 @@ static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) {
 
 static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) {
 #if defined(__AVR__)
-    // Fast way to count leading zeros
+    // Fast way to count leading zeros 
     return __builtin_clzl(val);
 #elif defined(__GNUC__) || __has_builtin(__builtin_clz)
     // Fast way to count leading zeros
@@ -462,7 +442,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) {
 // uint {v}. The result must fit in 16 bits.
 // Returns the quotient directly and the remainder in *r
 static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16(
-    uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) {
+    uint16_t u1, uint16_t u0, uint16_t v, uint16_t* r) {
     uint32_t n = ((uint32_t)u1 << 16) | u0;
     uint16_t result = (uint16_t)(n / v);
     *r = (uint16_t)(n - result * (uint32_t)v);
@@ -532,7 +512,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
 
     // Check for overflow and divide by 0.
     if (numhi >= den) {
-        if (r) *r = ~0ull;
+        if (r != NULL) *r = ~0ull;
         return ~0ull;
     }
 
@@ -578,14 +558,11 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
     q0 = (uint32_t)qhat;
 
     // Return remainder if requested.
-    if (r) *r = (rem * b + num0 - q0 * den) >> shift;
+    if (r != NULL) *r = (rem * b + num0 - q0 * den) >> shift;
     return ((uint64_t)q1 << 32) | q0;
 #endif
 }
 
-#if !(defined(HAS_INT128_T) && \
-    defined(HAS_INT128_DIV))
-
 // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)
 static LIBDIVIDE_INLINE void libdivide_u128_shift(
     uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
@@ -602,8 +579,6 @@ static LIBDIVIDE_INLINE void libdivide_u128_shift(
     }
 }
 
-#endif
-
 // Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
 static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64(
     uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
@@ -721,7 +696,8 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
         // 1 in its recovery algorithm.
         result.magic = 0;
         result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
-    } else {
+    }
+    else {
         uint8_t more;
         uint16_t rem, proposed_m;
         proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem);
@@ -733,7 +709,8 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
         if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) {
             // This power works
             more = floor_log_2_d;
-        } else {
+        }
+        else {
            // We have to use the general 17-bit algorithm.  We need to compute
            // (2**power) / d. However, we already have (2**(power-1))/d and
            // its remainder. By doubling both, and then correcting the
@@ -765,7 +742,7 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
     }
     struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1);
     struct libdivide_u16_branchfree_t ret = {
-        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)};
+        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK) };
     return ret;
 }
 
@@ -775,25 +752,27 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
 uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
     if (!magic) {
         return numer >> more;
-    } else {
+    }
+    else {
         uint16_t q = libdivide_mullhi_u16(magic, numer);
         if (more & LIBDIVIDE_ADD_MARKER) {
             uint16_t t = ((numer - q) >> 1) + q;
             return t >> (more & LIBDIVIDE_16_SHIFT_MASK);
-        } else {
+        }
+        else {
             // All upper bits are 0,
             // don't need to mask them off.
             return q >> more;
         }
-    }
+    } 
 }
 
-uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) {
+uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) {
     return libdivide_u16_do_raw(numer, denom->magic, denom->more);
 }
 
 uint16_t libdivide_u16_branchfree_do(
-    uint16_t numer, const struct libdivide_u16_branchfree_t *denom) {
+    uint16_t numer, const struct libdivide_u16_branchfree_t* denom) {
     uint16_t q = libdivide_mullhi_u16(denom->magic, numer);
     uint16_t t = ((numer - q) >> 1) + q;
     return t >> denom->more;
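
Note: the scalar routines above implement division by a constant as a multiply by a precomputed magic number plus shifts; when ADD_MARKER is set, t = ((numer - q) >> 1) + q averages numer and q without overflowing 16 bits before the final shift. A worked sketch for d = 7 — the magic 0x2493 and shift 2 are hand-derived from the doubling step in libdivide_internal_u16_gen above, so treat them as illustrative rather than authoritative:

#include <stdint.h>
#include <stdio.h>

static uint16_t mulhi_u16(uint16_t x, uint16_t y) {
    return (uint16_t)(((uint32_t)x * y) >> 16);
}

int main(void) {
    // For d = 7: floor(log2(7)) = 2, and the general 17-bit path yields a
    // 17-bit magic whose low 16 bits are 0x2493, shift 2, ADD_MARKER set.
    const uint16_t magic = 0x2493, shift = 2;
    for (uint16_t n = 0; n < 1000; n++) {
        uint16_t q = mulhi_u16(magic, n);
        uint16_t t = (uint16_t)(((n - q) >> 1) + q);  // ADD_MARKER correction
        if ((uint16_t)(t >> shift) != n / 7) {
            printf("mismatch at %u\n", n);
            return 1;
        }
    }
    printf("u16 magic path matches n/7\n");
    return 0;
}
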
@@ -821,7 +800,7 @@ uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) {
         // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
         // then double the quotient and remainder.
         uint32_t half_n = (uint32_t)1 << (16 + shift);
-        uint32_t d = ((uint32_t)1 << 16) | denom->magic;
+        uint32_t d = ( (uint32_t)1 << 16) | denom->magic;
         // Note that the quotient is guaranteed <= 16 bits, but the remainder
         // may need 17!
         uint16_t half_q = (uint16_t)(half_n / d);
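
Note: recover reverses gen — it reconstructs the divisor from magic and shift, doubling the quotient and remainder of 2^(16+shift)/(magic + 2^16) as the comment above describes. A small round-trip through the public scalar API touched by these hunks (a sketch, not part of the patch; the include path assumes this repository's layout):

#include <stdint.h>
#include <stdio.h>

#include "third_party/libdivide.h"

int main(void) {
    uint32_t d = 7;
    struct libdivide_u32_t denom = libdivide_u32_gen(d);
    uint32_t q = libdivide_u32_do(1000000u, &denom);  // 1000000 / 7 = 142857
    printf("q=%u recovered=%u\n", q, libdivide_u32_recover(&denom));
    return 0;
}
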
 uint16_t half_q = (uint16_t)(half_n / d);
@@ -933,11 +912,12 @@ struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
     return ret;
 }
 
-uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) {
-    if (!magic) {
+uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
+    uint8_t more = denom->more;
+    if (!denom->magic) {
         return numer >> more;
     } else {
-        uint32_t q = libdivide_mullhi_u32(magic, numer);
+        uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
         if (more & LIBDIVIDE_ADD_MARKER) {
             uint32_t t = ((numer - q) >> 1) + q;
             return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
@@ -949,10 +929,6 @@ uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) {
     }
 }
 
-uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
-    return libdivide_u32_do_raw(numer, denom->magic, denom->more);
-}
-
 uint32_t libdivide_u32_branchfree_do(
     uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
     uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
@@ -1096,11 +1072,12 @@ struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) {
     return ret;
 }
 
-uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) {
-    if (!magic) {
+uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
+    uint8_t more = denom->more;
+    if (!denom->magic) {
         return numer >> more;
     } else {
-        uint64_t q = libdivide_mullhi_u64(magic, numer);
+        uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
         if (more & LIBDIVIDE_ADD_MARKER) {
             uint64_t t = ((numer - q) >> 1) + q;
             return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
@@ -1112,10 +1089,6 @@ uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) {
     }
 }
 
-uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
-    return libdivide_u64_do_raw(numer, denom->magic, denom->more);
-}
-
 uint64_t libdivide_u64_branchfree_do(
     uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {
     uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
@@ -1455,10 +1428,11 @@ struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
     return result;
 }
 
-int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
+int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
+    uint8_t more = denom->more;
     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
 
-    if (!magic) {
+    if (!denom->magic) {
         uint32_t sign = (int8_t)more >> 7;
         uint32_t mask = ((uint32_t)1 << shift) - 1;
         uint32_t uq = numer + ((numer >> 31) & mask);
@@ -1467,7 +1441,7 @@ int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
         q = (q ^ sign) - sign;
         return q;
     } else {
-        uint32_t uq = (uint32_t)libdivide_mullhi_s32(magic, numer);
+        uint32_t uq = (uint32_t)libdivide_mullhi_s32(denom->magic, numer);
         if (more & LIBDIVIDE_ADD_MARKER) {
             // must be arithmetic shift and then sign extend
             int32_t sign = (int8_t)more >> 7;
@@ -1482,10 +1456,6 @@ int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
     }
 }
 
-int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
-    return libdivide_s32_do_raw(numer, denom->magic, denom->more);
-}
-
 int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
     uint8_t more = denom->more;
     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
@@ -1627,10 +1597,11 @@ struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
     return ret;
 }
 
-int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
+int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
+    uint8_t more = denom->more;
     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
 
-    if (!magic) { // shift path
+    if (!denom->magic) { // shift path
         uint64_t mask = ((uint64_t)1 << shift) - 1;
         uint64_t uq = numer + ((numer >> 63) & mask);
         int64_t q = (int64_t)uq;
@@ -1640,7 +1611,7 @@ int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
         q = (q ^ sign) - sign;
         return q;
     } else {
-        uint64_t uq = (uint64_t)libdivide_mullhi_s64(magic, numer);
+        uint64_t uq = (uint64_t)libdivide_mullhi_s64(denom->magic, numer);
         if (more & LIBDIVIDE_ADD_MARKER) {
             // must be arithmetic shift and then sign extend
             int64_t sign = (int8_t)more >> 7;
@@ -1655,10 +1626,6 @@ int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
     }
 }
 
-int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
-    return libdivide_s64_do_raw(numer, denom->magic, denom->more);
-}
-
 int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
     uint8_t more = denom->more;
     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
@@ -1715,22 +1682,15 @@ int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t
 
 // Simplest possible vector type division: treat the vector type as an array
 // of underlying native type.
-//
-// Use a union to read a vector via pointer-to-integer, without violating strict
-// aliasing.
-#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \
-    const size_t count = sizeof(VecT) / sizeof(IntT); \
-    union type_pun_vec { \
-        VecT vec; \
-        IntT arr[sizeof(VecT) / sizeof(IntT)]; \
-    }; \
-    union type_pun_vec result; \
-    union type_pun_vec input; \
-    input.vec = numers; \
-    for (size_t loop = 0; loop < count; ++loop) { \
-        result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \
-    } \
-    return result.vec;
+#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \
+    const size_t count = sizeof(VecT) / sizeof(IntT); \
+    VecT result; \
+    IntT *pSource = (IntT *)&numers; \
+    IntT *pTarget = (IntT *)&result; \
+    for (size_t loop=0; loop<count; ++loop) { \
+        pTarget[loop] = libdivide_##Algo##_do(pSource[loop], denom); \
+    } \
+    return result;
 
 #if defined(LIBDIVIDE_NEON)
 
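
Note: the hunk above swaps the 5.2 union-based lane loop for the 5.0 pointer-cast version. Reading vector lanes through IntT * casts violates strict aliasing, which is why upstream later moved to the union. A standalone sketch of that union trick, with a plain struct standing in for the intrinsic vector type (hypothetical vec128 type; scalar stand-in, no intrinsics):

#include <stdint.h>
#include <stdio.h>

typedef struct { uint16_t lane[8]; } vec128;  // stand-in for __m128i

int main(void) {
    union type_pun_vec {
        vec128 vec;
        uint16_t arr[8];
    } input, result;
    input.vec = (vec128){{700, 71, 7, 14, 21, 28, 35, 42}};
    for (size_t i = 0; i < 8; i++) {
        result.arr[i] = input.arr[i] / 7;  // per-lane scalar fallback
    }
    printf("lane0=%u lane1=%u\n", result.vec.lane[0], result.vec.lane[1]);
    return 0;
}
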
@@ -2405,25 +2369,11 @@ static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y
 ////////// UINT16
 
 __m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) {
-    uint8_t more = denom->more;
-    if (!denom->magic) {
-        return _mm256_srli_epi16(numers, more);
-    } else {
-        __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
-        if (more & LIBDIVIDE_ADD_MARKER) {
-            __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
-            return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
-        } else {
-            return _mm256_srli_epi16(q, more);
-        }
-    }
+    SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16)
 }
 
-__m256i libdivide_u16_branchfree_do_vec256(
-    __m256i numers, const struct libdivide_u16_branchfree_t *denom) {
-    __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
-    __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
-    return _mm256_srli_epi16(t, denom->more);
+__m256i libdivide_u16_branchfree_do_vec256(__m256i numers, const struct libdivide_u16_branchfree_t *denom) {
+    SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16_branchfree)
 }
 
 ////////// UINT32
@@ -2479,54 +2429,11 @@ __m256i libdivide_u64_branchfree_do_vec256(
 ////////// SINT16
 
 __m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) {
-    uint8_t more = denom->more;
-    if (!denom->magic) {
-        uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
-        uint16_t mask = ((uint16_t)1 << shift) - 1;
-        __m256i roundToZeroTweak = _mm256_set1_epi16(mask);
-        // q = numer + ((numer >> 15) & roundToZeroTweak);
-        __m256i q = _mm256_add_epi16(
-            numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak));
-        q = _mm256_srai_epi16(q, shift);
-        __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
-        // q = (q ^ sign) - sign;
-        q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);
-        return q;
-    } else {
-        __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic));
-        if (more & LIBDIVIDE_ADD_MARKER) {
-            // must be arithmetic shift
-            __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
-            // q += ((numer ^ sign) - sign);
-            q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign));
-        }
-        // q >>= shift
-        q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
-        q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15));  // q += (q < 0)
-        return q;
-    }
+    SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16)
 }
 
-__m256i libdivide_s16_branchfree_do_vec256(
-    __m256i numers, const struct libdivide_s16_branchfree_t *denom) {
-    int16_t magic = denom->magic;
-    uint8_t more = denom->more;
-    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
-    // must be arithmetic shift
-    __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
-    __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic));
-    q = _mm256_add_epi16(q, numers);  // q += numers
-
-    // If q is non-negative, we have nothing to do
-    // If q is negative, we want to add either (2**shift)-1 if d is
-    // a power of 2, or (2**shift) if it is not a power of 2
-    uint16_t is_power_of_2 = (magic == 0);
-    __m256i q_sign = _mm256_srai_epi16(q, 15);  // q_sign = q >> 15
-    __m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
-    q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask));  // q = q + (q_sign & mask)
-    q = _mm256_srai_epi16(q, shift);  // q >>= shift
-    q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);  // q = (q ^ sign) - sign
-    return q;
+__m256i libdivide_s16_branchfree_do_vec256(__m256i numers, const struct libdivide_s16_branchfree_t *denom) {
+    SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16_branchfree)
 }
 
 ////////// SINT32
@@ -2754,25 +2661,11 @@ static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y
 ////////// UINT16
 
 __m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {
-    uint8_t more = denom->more;
-    if (!denom->magic) {
-        return _mm_srli_epi16(numers, more);
-    } else {
-        __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
-        if (more & LIBDIVIDE_ADD_MARKER) {
-            __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
-            return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
-        } else {
-            return _mm_srli_epi16(q, more);
-        }
-    }
+    SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16)
 }
 
-__m128i libdivide_u16_branchfree_do_vec128(
-    __m128i numers, const struct libdivide_u16_branchfree_t *denom) {
-    __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
-    __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
-    return _mm_srli_epi16(t, denom->more);
+__m128i libdivide_u16_branchfree_do_vec128(__m128i numers, const struct libdivide_u16_branchfree_t *denom) {
+    SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16_branchfree)
 }
 
 ////////// UINT32
@@ -2832,54 +2725,11 @@ __m128i libdivide_u64_branchfree_do_vec128(
 ////////// SINT16
 
 __m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) {
-    uint8_t more = denom->more;
-    if (!denom->magic) {
-        uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
-        uint16_t mask = ((uint16_t)1 << shift) - 1;
-        __m128i roundToZeroTweak = _mm_set1_epi16(mask);
-        // q = numer + ((numer >> 15) & roundToZeroTweak);
-        __m128i q =
-            _mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak));
-        q = _mm_srai_epi16(q, shift);
-        __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
-        // q = (q ^ sign) - sign;
-        q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);
-        return q;
-    } else {
-        __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic));
-        if (more & LIBDIVIDE_ADD_MARKER) {
-            // must be arithmetic shift
-            __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
-            // q += ((numer ^ sign) - sign);
-            q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign));
-        }
-        // q >>= shift
-        q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
-        q = _mm_add_epi16(q, _mm_srli_epi16(q, 15));  // q += (q < 0)
-        return q;
-    }
+    SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16)
 }
 
-__m128i libdivide_s16_branchfree_do_vec128(
-    __m128i numers, const struct libdivide_s16_branchfree_t *denom) {
-    int16_t magic = denom->magic;
-    uint8_t more = denom->more;
-    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
-    // must be arithmetic shift
-    __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
-    __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic));
-    q = _mm_add_epi16(q, numers);  // q += numers
-
-    // If q is non-negative, we have nothing to do
-    // If q is negative, we want to add either (2**shift)-1 if d is
-    // a power of 2, or (2**shift) if it is not a power of 2
-    uint16_t is_power_of_2 = (magic == 0);
-    __m128i q_sign = _mm_srai_epi16(q, 15);  // q_sign = q >> 15
-    __m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
-    q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)
-    q = _mm_srai_epi16(q, shift);  // q >>= shift
-    q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);  // q = (q ^ sign) - sign
-    return q;
+__m128i libdivide_s16_branchfree_do_vec128(__m128i numers, const struct libdivide_s16_branchfree_t *denom) {
+    SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16_branchfree)
 }
 
 ////////// SINT32
@@ -2945,8 +2795,8 @@ __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *de
         uint64_t mask = ((uint64_t)1 << shift) - 1;
         __m128i roundToZeroTweak = _mm_set1_epi64x(mask);
         // q = numer + ((numer >> 63) & roundToZeroTweak);
-        __m128i q = _mm_add_epi64(
-            numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
+        __m128i q =
+            _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
         q = libdivide_s64_shift_right_vec128(q, shift);
         __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
         // q = (q ^ sign) - sign;
@@ -2997,80 +2847,49 @@ __m128i libdivide_s64_branchfree_do_vec128(
 
 #ifdef __cplusplus
 
-//for constexpr zero initialization,
-//c++11 might handle things ok,
-//but just limit to at least c++14 to ensure
-//we don't break anyone's code:
-
-// for gcc and clang, use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr
-#if (defined(__GNUC__) || defined(__clang__)) && (__cpp_constexpr >= 201304L)
-#define LIBDIVIDE_CONSTEXPR constexpr
-
-// supposedly, MSVC might not implement feature test macros right (https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c)
-// so check that _MSVC_LANG corresponds to at least c++14, and _MSC_VER corresponds to at least VS 2017 15.0 (for extended constexpr support https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170)
-#elif defined(_MSC_VER) && _MSC_VER >= 1910 && defined(_MSVC_LANG) && _MSVC_LANG >=201402L
-#define LIBDIVIDE_CONSTEXPR constexpr
-
-// in case some other obscure compiler has the right __cpp_constexpr :
-#elif defined(__cpp_constexpr) && __cpp_constexpr >= 201304L
-#define LIBDIVIDE_CONSTEXPR constexpr
-
-#else
-#define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE
-#endif
-
 enum Branching {
     BRANCHFULL,  // use branching algorithms
     BRANCHFREE   // use branchfree algorithms
 };
 
-namespace detail {
-enum Signedness {
-    SIGNED,
-    UNSIGNED,
-};
-
 #if defined(LIBDIVIDE_NEON)
 // Helper to deduce NEON vector type for integral type.
-template <int _WIDTH, Signedness _SIGN>
-struct NeonVec {};
+template <typename T>
+struct NeonVecFor {};
 
 template <>
-struct NeonVec<16, UNSIGNED> {
+struct NeonVecFor<uint16_t> {
     typedef uint16x8_t type;
 };
 
 template <>
-struct NeonVec<16, SIGNED> {
+struct NeonVecFor<int16_t> {
     typedef int16x8_t type;
 };
 
 template <>
-struct NeonVec<32, UNSIGNED> {
+struct NeonVecFor<uint32_t> {
    typedef uint32x4_t type;
 };
 
 template <>
-struct NeonVec<32, SIGNED> {
+struct NeonVecFor<int32_t> {
    typedef int32x4_t type;
 };
 
 template <>
-struct NeonVec<64, UNSIGNED> {
+struct NeonVecFor<uint64_t> {
    typedef uint64x2_t type;
 };
 
 template <>
-struct NeonVec<64, SIGNED> {
+struct NeonVecFor<int64_t> {
     typedef int64x2_t type;
 };
+#endif
 
-template <typename T>
-struct NeonVecFor {
-    // See 'class divider' for an explanation of these template parameters.
-    typedef typename NeonVec<sizeof(T) * 8, (((T)0 >> 0) > (T)(-1) ? SIGNED : UNSIGNED)>::type type;
-};
-
+// Versions of our algorithms for SIMD.
+#if defined(LIBDIVIDE_NEON)
 #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \
     LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \
         typename NeonVecFor<INT_TYPE>::type n) const { \
@@ -3079,7 +2898,6 @@ struct NeonVecFor {
 #else
 #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)
 #endif
-
 #if defined(LIBDIVIDE_SSE2)
 #define LIBDIVIDE_DIVIDE_SSE2(ALGO) \
     LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \
@@ -3112,7 +2930,6 @@ struct NeonVecFor {
 #define DISPATCHER_GEN(T, ALGO) \
     libdivide_##ALGO##_t denom; \
     LIBDIVIDE_INLINE dispatcher() {} \
-    explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {} \
     LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \
     LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \
     LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \
@@ -3122,81 +2939,66 @@ struct NeonVecFor {
     LIBDIVIDE_DIVIDE_AVX512(ALGO)
 
 // The dispatcher selects a specific division algorithm for a given
-// width, signedness, and ALGO using partial template specialization.
-template <int _WIDTH, Signedness _SIGN, Branching _ALGO>
+// type and ALGO using partial template specialization.
+template <typename _IntT, Branching ALGO>
 struct dispatcher {};
 
 template <>
-struct dispatcher<16, SIGNED, BRANCHFULL> {
+struct dispatcher<int16_t, BRANCHFULL> {
     DISPATCHER_GEN(int16_t, s16)
 };
 template <>
-struct dispatcher<16, SIGNED, BRANCHFREE> {
+struct dispatcher<int16_t, BRANCHFREE> {
     DISPATCHER_GEN(int16_t, s16_branchfree)
 };
 template <>
-struct dispatcher<16, UNSIGNED, BRANCHFULL> {
+struct dispatcher<uint16_t, BRANCHFULL> {
     DISPATCHER_GEN(uint16_t, u16)
 };
 template <>
-struct dispatcher<16, UNSIGNED, BRANCHFREE> {
+struct dispatcher<uint16_t, BRANCHFREE> {
     DISPATCHER_GEN(uint16_t, u16_branchfree)
 };
 template <>
-struct dispatcher<32, SIGNED, BRANCHFULL> {
+struct dispatcher<int32_t, BRANCHFULL> {
     DISPATCHER_GEN(int32_t, s32)
 };
 template <>
-struct dispatcher<32, SIGNED, BRANCHFREE> {
+struct dispatcher<int32_t, BRANCHFREE> {
     DISPATCHER_GEN(int32_t, s32_branchfree)
 };
 template <>
-struct dispatcher<32, UNSIGNED, BRANCHFULL> {
+struct dispatcher<uint32_t, BRANCHFULL> {
     DISPATCHER_GEN(uint32_t, u32)
 };
 template <>
-struct dispatcher<32, UNSIGNED, BRANCHFREE> {
+struct dispatcher<uint32_t, BRANCHFREE> {
     DISPATCHER_GEN(uint32_t, u32_branchfree)
 };
 template <>
-struct dispatcher<64, SIGNED, BRANCHFULL> {
+struct dispatcher<int64_t, BRANCHFULL> {
     DISPATCHER_GEN(int64_t, s64)
 };
 template <>
-struct dispatcher<64, SIGNED, BRANCHFREE> {
+struct dispatcher<int64_t, BRANCHFREE> {
     DISPATCHER_GEN(int64_t, s64_branchfree)
 };
 template <>
-struct dispatcher<64, UNSIGNED, BRANCHFULL> {
+struct dispatcher<uint64_t, BRANCHFULL> {
     DISPATCHER_GEN(uint64_t, u64)
 };
 template <>
-struct dispatcher<64, UNSIGNED, BRANCHFREE> {
+struct dispatcher<uint64_t, BRANCHFREE> {
     DISPATCHER_GEN(uint64_t, u64_branchfree)
 };
-} // namespace detail
-
-#if defined(LIBDIVIDE_NEON)
-// Allow NeonVecFor outside of detail namespace.
-template <typename T>
-struct NeonVecFor {
-    typedef typename detail::NeonVecFor<T>::type type;
-};
-#endif
 
 // This is the main divider class for use by the user (C++ API).
 // The actual division algorithm is selected using the dispatcher struct
-// based on the integer width and algorithm template parameters.
+// based on the integer and algorithm template parameters.
 template <typename T, Branching ALGO = BRANCHFULL>
 class divider {
 private:
-    // Dispatch based on the size and signedness.
-    // We avoid using type_traits as it's not available in AVR.
-    // Detect signedness by checking if T(-1) is less than T(0).
-    // Also throw in a shift by 0, which prevents floating point types from being passed.
-    typedef detail::dispatcher<sizeof(T) * 8,
-        (((T)0 >> 0) > (T)(-1) ? detail::SIGNED : detail::UNSIGNED), ALGO>
-        dispatcher_t;
+    typedef dispatcher<T, ALGO> dispatcher_t;
 
 public:
     // We leave the default constructor empty so that creating
@@ -3204,9 +3006,6 @@ class divider {
     // later doesn't slow us down.
     divider() {}
 
-    // constexpr zero-initialization to allow for use w/ static constinit
-    explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {}
-
     // Constructor that takes the divisor as a parameter
     LIBDIVIDE_INLINE divider(T d) : div(d) {}
 
@@ -3218,7 +3017,7 @@ class divider {
     T recover() const { return div.recover(); }
 
     bool operator==(const divider<T, ALGO> &other) const {
-        return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more;
+        return div.denom.magic == other.denom.magic && div.denom.more == other.denom.more;
     }
 
     bool operator!=(const divider<T, ALGO> &other) const { return !(*this == other); }
@@ -3299,14 +3098,12 @@ LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) {
 
 #if defined(LIBDIVIDE_NEON)
 template <typename T, Branching ALGO>
-LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(
-    typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
+LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
     return div.divide(n);
 }
 
 template <typename T, Branching ALGO>
-LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(
-    typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
+LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
     n = div.divide(n);
     return n;
 }
diff --git a/util.c b/util.c
index a43679c..a3d6f0c 100644
--- a/util.c
+++ b/util.c
@@ -6,8 +6,6 @@
 
 #ifdef __ANDROID__
 #include <async_safe/log.h>
-int mallopt(int param, int value);
-#define M_BIONIC_RESTORE_DEFAULT_SIGABRT_HANDLER (-1003)
 #endif
 
 #include "util.h"
@@ -32,7 +30,6 @@ static int write_full(int fd, const char *buf, size_t length) {
 
 COLD noreturn void fatal_error(const char *s) {
 #ifdef __ANDROID__
-    mallopt(M_BIONIC_RESTORE_DEFAULT_SIGABRT_HANDLER, 0);
     async_safe_fatal("hardened_malloc: fatal allocator error: %s", s);
 #else
     const char *prefix = "fatal allocator error: ";
diff --git a/util.h b/util.h
index 6b1a390..9a4a7af 100644
--- a/util.h
+++ b/util.h
@@ -9,9 +9,7 @@
 
 #define noreturn __attribute__((noreturn))
 #define likely(x) __builtin_expect(!!(x), 1)
-#define likely51(x) __builtin_expect_with_probability(!!(x), 1, 0.51)
 #define unlikely(x) __builtin_expect(!!(x), 0)
-#define unlikely51(x) __builtin_expect_with_probability(!!(x), 0, 0.51)
 
 #define min(x, y) ({ \
     __typeof__(x) _x = (x); \
@@ -59,22 +57,6 @@ static inline size_t align(size_t size, size_t align) {
     return (size + mask) & ~mask;
 }
 
-// u4_arr_{set,get} are helper functions for using u8 array as an array of unsigned 4-bit values.
-
-// val is treated as a 4-bit value
-static inline void u4_arr_set(u8 *arr, size_t idx, u8 val) {
-    size_t off = idx >> 1;
-    size_t shift = (idx & 1) << 2;
-    u8 mask = (u8) (0xf0 >> shift);
-    arr[off] = (arr[off] & mask) | (val << shift);
-}
-
-static inline u8 u4_arr_get(const u8 *arr, size_t idx) {
-    size_t off = idx >> 1;
-    size_t shift = (idx & 1) << 2;
-    return (u8) ((arr[off] >> shift) & 0xf);
-}
-
 COLD noreturn void fatal_error(const char *s);
 
 #if CONFIG_SEAL_METADATA
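
Note: the removed u4_arr helpers stored two 4-bit values per byte — even indices in the low nibble, odd indices in the high nibble. A standalone sketch of the same packing arithmetic (demo code mirroring the deleted functions, with uint8_t standing in for the project's u8 typedef):

#include <stdint.h>
#include <stdio.h>

static void u4_set(uint8_t *arr, size_t idx, uint8_t val) {
    size_t off = idx >> 1;
    size_t shift = (idx & 1) << 2;            // 0 for even idx, 4 for odd
    uint8_t mask = (uint8_t)(0xf0 >> shift);  // keeps the other nibble
    arr[off] = (arr[off] & mask) | (uint8_t)(val << shift);
}

static uint8_t u4_get(const uint8_t *arr, size_t idx) {
    return (uint8_t)((arr[idx >> 1] >> ((idx & 1) << 2)) & 0xf);
}

int main(void) {
    uint8_t arr[2] = {0};
    u4_set(arr, 0, 0x3);
    u4_set(arr, 1, 0xe);
    u4_set(arr, 2, 0x9);
    // Prints "3 e 9": three 4-bit values packed into two bytes.
    printf("%x %x %x\n", u4_get(arr, 0), u4_get(arr, 1), u4_get(arr, 2));
    return 0;
}
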