Mirror of https://github.com/GrapheneOS/hardened_malloc.git
Synced 2025-04-19 15:06:07 -04:00

Compare commits: 13...2025032100 (53 commits)
53 commits (SHA1; author and date columns omitted):

4fe9018b6f, 3ab23f7ebf, c894f3ec1d, c97263ef0c, a7302add63, b1d9571fec,
e03579253a, 9739cb4690, aa950244f8, 6402e2b0d4, e86192e7fe, 6ce663a8bd,
9ca9d2d925, 3f07acfab1, 749640c274, 7268189933, 3c1f40aff0, 5fbbdc2ef8,
7d2151e40c, 4756716904, a3bf742c3e, 53a45b4661, abe54dba27, 365ee6900d,
7093fdc482, 61821b02c8, 3c274731ba, 4171bd164e, 352c083f65, 88b3c1acf9,
f793a3edf6, fd75fc1ba8, 72dc236d5f, be08eeee2d, 25f0fe9c69, c75cb4c3f3,
b560431c01, 009f2dad76, 03883eb2ce, 7a6dbd8152, f16ef601d4, 155800526a,
28d5d394cf, 577d9583eb, 93aa9eefe4, 01a199e19e, 576328b1b4, 5137d2da4d,
f042a6b9b0, 001fc86585, 70c91f4c3e, e3686ae457, 19a46e0f96
.github/workflows/build-and-test.yml (vendored): 16 lines changed
@@ -9,14 +9,30 @@ on:

jobs:
  build-ubuntu-gcc:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        version: [12, 13, 14]
    steps:
      - uses: actions/checkout@v4
      - name: Setting up gcc version
        run: |
          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${{ matrix.version }} 100
          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${{ matrix.version }} 100
      - name: Build
        run: make test

  build-ubuntu-clang:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        version: [14, 15, 16, 17, 18]
    steps:
      - uses: actions/checkout@v4
      - name: Install dependencies
        run: sudo apt-get update && sudo apt-get install -y --no-install-recommends clang-14 clang-15
      - name: Setting up clang version
        run: |
          sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{ matrix.version }} 100
          sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-${{ matrix.version }} 100
      - name: Build
        run: CC=clang CXX=clang++ make test

  build-musl:
Android.bp

@@ -5,8 +5,6 @@ common_cflags = [
    "-fPIC",
    "-fvisibility=hidden",
    //"-fno-plt",
    "-Wall",
    "-Wextra",
    "-Wcast-align",
    "-Wcast-qual",
    "-Wwrite-strings",

@@ -73,6 +71,9 @@ cc_library {
        debuggable: {
            cflags: ["-DLABEL_MEMORY"],
        },
+       device_has_arm_mte: {
+           cflags: ["-DHAS_ARM_MTE", "-march=armv8-a+dotprod+memtag"]
+       },
    },
    apex_available: [
        "com.android.runtime",
CREDITS: 227 lines changed
@@ -54,3 +54,230 @@ libdivide:

random.c get_random_{type}_uniform functions are based on Fast Random Integer
Generation in an Interval by Daniel Lemire
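For context, Lemire's method maps one random word into a bounded range with a single widening multiply, rejecting only the few values that would bias the result. A minimal C sketch of a 32-bit variant follows; the get_random_u32 source function is a stand-in assumption, not the allocator's actual RNG plumbing:

    #include <stdint.h>

    // Assumed stand-in for the allocator's RNG; any uniform 32-bit source works here.
    extern uint32_t get_random_u32(void);

    // Sketch of "Fast Random Integer Generation in an Interval" (Lemire):
    // returns a uniformly distributed value in [0, bound) without modulo bias.
    static uint32_t random_u32_uniform_sketch(uint32_t bound) {
        uint64_t product = (uint64_t)get_random_u32() * bound;
        uint32_t low = (uint32_t)product;
        if (low < bound) {
            // Reject the small biased region; threshold is 2^32 mod bound.
            uint32_t threshold = -bound % bound;
            while (low < threshold) {
                product = (uint64_t)get_random_u32() * bound;
                low = (uint32_t)product;
            }
        }
        return (uint32_t)(product >> 32);
    }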
arm_mte.h arm_mte_tag_and_clear_mem function contents were copied from storeTags function in scudo:

==============================================================================
The LLVM Project is under the Apache License v2.0 with LLVM Exceptions:
==============================================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

    1. Definitions.

    "License" shall mean the terms and conditions for use, reproduction,
    and distribution as defined by Sections 1 through 9 of this document.

    "Licensor" shall mean the copyright owner or entity authorized by
    the copyright owner that is granting the License.

    "Legal Entity" shall mean the union of the acting entity and all
    other entities that control, are controlled by, or are under common
    control with that entity. For the purposes of this definition,
    "control" means (i) the power, direct or indirect, to cause the
    direction or management of such entity, whether by contract or
    otherwise, or (ii) ownership of fifty percent (50%) or more of the
    outstanding shares, or (iii) beneficial ownership of such entity.

    "You" (or "Your") shall mean an individual or Legal Entity
    exercising permissions granted by this License.

    "Source" form shall mean the preferred form for making modifications,
    including but not limited to software source code, documentation
    source, and configuration files.

    "Object" form shall mean any form resulting from mechanical
    transformation or translation of a Source form, including but
    not limited to compiled object code, generated documentation,
    and conversions to other media types.

    "Work" shall mean the work of authorship, whether in Source or
    Object form, made available under the License, as indicated by a
    copyright notice that is included in or attached to the work
    (an example is provided in the Appendix below).

    "Derivative Works" shall mean any work, whether in Source or Object
    form, that is based on (or derived from) the Work and for which the
    editorial revisions, annotations, elaborations, or other modifications
    represent, as a whole, an original work of authorship. For the purposes
    of this License, Derivative Works shall not include works that remain
    separable from, or merely link (or bind by name) to the interfaces of,
    the Work and Derivative Works thereof.

    "Contribution" shall mean any work of authorship, including
    the original version of the Work and any modifications or additions
    to that Work or Derivative Works thereof, that is intentionally
    submitted to Licensor for inclusion in the Work by the copyright owner
    or by an individual or Legal Entity authorized to submit on behalf of
    the copyright owner. For the purposes of this definition, "submitted"
    means any form of electronic, verbal, or written communication sent
    to the Licensor or its representatives, including but not limited to
    communication on electronic mailing lists, source code control systems,
    and issue tracking systems that are managed by, or on behalf of, the
    Licensor for the purpose of discussing and improving the Work, but
    excluding communication that is conspicuously marked or otherwise
    designated in writing by the copyright owner as "Not a Contribution."

    "Contributor" shall mean Licensor and any individual or Legal Entity
    on behalf of whom a Contribution has been received by Licensor and
    subsequently incorporated within the Work.

    2. Grant of Copyright License. Subject to the terms and conditions of
    this License, each Contributor hereby grants to You a perpetual,
    worldwide, non-exclusive, no-charge, royalty-free, irrevocable
    copyright license to reproduce, prepare Derivative Works of,
    publicly display, publicly perform, sublicense, and distribute the
    Work and such Derivative Works in Source or Object form.

    3. Grant of Patent License. Subject to the terms and conditions of
    this License, each Contributor hereby grants to You a perpetual,
    worldwide, non-exclusive, no-charge, royalty-free, irrevocable
    (except as stated in this section) patent license to make, have made,
    use, offer to sell, sell, import, and otherwise transfer the Work,
    where such license applies only to those patent claims licensable
    by such Contributor that are necessarily infringed by their
    Contribution(s) alone or by combination of their Contribution(s)
    with the Work to which such Contribution(s) was submitted. If You
    institute patent litigation against any entity (including a
    cross-claim or counterclaim in a lawsuit) alleging that the Work
    or a Contribution incorporated within the Work constitutes direct
    or contributory patent infringement, then any patent licenses
    granted to You under this License for that Work shall terminate
    as of the date such litigation is filed.

    4. Redistribution. You may reproduce and distribute copies of the
    Work or Derivative Works thereof in any medium, with or without
    modifications, and in Source or Object form, provided that You
    meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

    You may add Your own copyright statement to Your modifications and
    may provide additional or different license terms and conditions
    for use, reproduction, or distribution of Your modifications, or
    for any such Derivative Works as a whole, provided Your use,
    reproduction, and distribution of the Work otherwise complies with
    the conditions stated in this License.

    5. Submission of Contributions. Unless You explicitly state otherwise,
    any Contribution intentionally submitted for inclusion in the Work
    by You to the Licensor shall be under the terms and conditions of
    this License, without any additional terms or conditions.
    Notwithstanding the above, nothing herein shall supersede or modify
    the terms of any separate license agreement you may have executed
    with Licensor regarding such Contributions.

    6. Trademarks. This License does not grant permission to use the trade
    names, trademarks, service marks, or product names of the Licensor,
    except as required for reasonable and customary use in describing the
    origin of the Work and reproducing the content of the NOTICE file.

    7. Disclaimer of Warranty. Unless required by applicable law or
    agreed to in writing, Licensor provides the Work (and each
    Contributor provides its Contributions) on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
    implied, including, without limitation, any warranties or conditions
    of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
    PARTICULAR PURPOSE. You are solely responsible for determining the
    appropriateness of using or redistributing the Work and assume any
    risks associated with Your exercise of permissions under this License.

    8. Limitation of Liability. In no event and under no legal theory,
    whether in tort (including negligence), contract, or otherwise,
    unless required by applicable law (such as deliberate and grossly
    negligent acts) or agreed to in writing, shall any Contributor be
    liable to You for damages, including any direct, indirect, special,
    incidental, or consequential damages of any character arising as a
    result of this License or out of the use or inability to use the
    Work (including but not limited to damages for loss of goodwill,
    work stoppage, computer failure or malfunction, or any and all
    other commercial damages or losses), even if such Contributor
    has been advised of the possibility of such damages.

    9. Accepting Warranty or Additional Liability. While redistributing
    the Work or Derivative Works thereof, You may choose to offer,
    and charge a fee for, acceptance of support, warranty, indemnity,
    or other liability obligations and/or rights consistent with this
    License. However, in accepting such obligations, You may act only
    on Your own behalf and on Your sole responsibility, not on behalf
    of any other Contributor, and only if You agree to indemnify,
    defend, and hold each Contributor harmless for any liability
    incurred by, or claims asserted against, such Contributor by reason
    of your accepting any such warranty or additional liability.

    END OF TERMS AND CONDITIONS

    APPENDIX: How to apply the Apache License to your work.

    To apply the Apache License to your work, attach the following
    boilerplate notice, with the fields enclosed by brackets "[]"
    replaced with your own identifying information. (Don't include
    the brackets!) The text should be enclosed in the appropriate
    comment syntax for the file format. We also recommend that a
    file or class name and description of purpose be included on the
    same "printed page" as the copyright notice for easier
    identification within third-party archives.

    Copyright [yyyy] [name of copyright owner]

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.


---- LLVM Exceptions to the Apache 2.0 License ----

As an exception, if, as a result of your compiling your source code, portions
of this Software are embedded into an Object form of such source code, you
may redistribute such embedded portions in such Object form without complying
with the conditions of Sections 4(a), 4(b) and 4(d) of the License.

In addition, if you combine or link compiled forms of this Software with
software that is licensed under the GPLv2 ("Combined Software") and if a
court of competent jurisdiction determines that the patent provision (Section
3), the indemnity provision (Section 9) or other Section of the License
conflicts with the conditions of the GPLv2, you may retroactively and
prospectively choose to deem waived or otherwise exclude such Section(s) of
the License, but only in their entirety and only with respect to the Combined
Software.

==============================================================================
LICENSE: 2 lines changed
@@ -1,4 +1,4 @@
-Copyright © 2018-2023 GrapheneOS
+Copyright © 2018-2024 GrapheneOS

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
README.md: 108 lines changed
@@ -83,7 +83,7 @@ there will be custom integration offering better performance in the future
 along with other hardening for the C standard library implementation.

 For Android, only the current generation, actively developed maintenance branch of the Android
-Open Source Project will be supported, which currently means `android13-qpr2-release`.
+Open Source Project will be supported, which currently means `android15-release`.

 ## Testing

@@ -159,6 +159,9 @@ line to the `/etc/ld.so.preload` configuration file:
 The format of this configuration file is a whitespace-separated list, so it's
 good practice to put each library on a separate line.

+On Debian systems `libhardened_malloc.so` should be installed into `/usr/lib/`
+to avoid preload failures caused by AppArmor profile restrictions.
+
 Using the `LD_PRELOAD` environment variable to load it on a case-by-case basis
 will not work when `AT_SECURE` is set such as with setuid binaries. It's also
 generally not a recommended approach for production usage. The recommendation

@@ -470,16 +473,16 @@ was a bit less important and if a core goal was finding latent bugs.
 * Errors other than ENOMEM from mmap, munmap, mprotect and mremap treated
   as fatal, which can help to detect memory management gone wrong elsewhere
   in the process.
-* [future] Memory tagging for slab allocations via MTE on ARMv8.5+
+* Memory tagging for slab allocations via MTE on ARMv8.5+
     * random memory tags as the baseline, providing probabilistic protection
      against various forms of memory corruption
     * dedicated tag for free slots, set on free, for deterministic protection
      against accessing freed memory
+    * store previous random tag within freed slab allocations, and increment it
+      to get the next tag for that slot to provide deterministic use-after-free
+      detection through multiple cycles of memory reuse
     * guarantee distinct tags for adjacent memory allocations by incrementing
      past matching values for deterministic detection of linear overflows
-    * [future] store previous random tag and increment it to get the next tag
-      for that slot to provide deterministic use-after-free detection through
-      multiple cycles of memory reuse

 ## Randomness

@@ -721,77 +724,46 @@ freeing as there would be if the kernel supported these features directly.

 ## Memory tagging

-Integrating extensive support for ARMv8.5 memory tagging is planned and this
-section will be expanded to cover the details on the chosen design. The approach
-for slab allocations is currently covered, but it can also be used for the
-allocator metadata region and large allocations.
+Random tags are set for all slab allocations when allocated, with 4 excluded values:

-Memory allocations are already always multiples of naturally aligned 16 byte
-units, so memory tags are a natural fit into a malloc implementation due to the
-16 byte alignment requirement. The only extra memory consumption will come from
-the hardware supported storage for the tag values (4 bits per 16 bytes).
+1. the reserved `0` tag
+2. the previous tag used for the slot
+3. the current (or previous) tag used for the slot to the left
+4. the current (or previous) tag used for the slot to the right

-The baseline policy will be to generate random tags for each slab allocation
-slot on first use. The highest value will be reserved for marking freed memory
-allocations to detect any accesses to freed memory so it won't be part of the
-generated range. Adjacent slots will be guaranteed to have distinct memory tags
-in order to guarantee that linear overflows are detected. There are a few ways
-of implementing this and it will end up depending on the performance costs of
-different approaches. If there's an efficient way to fetch the adjacent tag
-values without wasting extra memory, it will be possible to check for them and
-skip them either by generating a new random value in a loop or incrementing
-past them since the tiny bit of bias wouldn't matter. Another approach would be
-alternating odd and even tag values but that would substantially reduce the
-overall randomness of the tags and there's very little entropy from the start.
+When a slab allocation is freed, the reserved `0` tag is set for the slot.
+Slab allocation slots are cleared before reuse when memory tagging is enabled.

-Once a slab allocation has been freed, the tag will be set to the reserved
-value for free memory and the previous tag value will be stored inside the
-allocation itself. The next time the slot is allocated, the chosen tag value
-will be the previous value incremented by one to provide use-after-free
-detection between generations of allocations. The stored tag will be wiped
-before retagging the memory, to avoid leaking it and as part of preserving the
-security property of newly allocated memory being zeroed due to zero-on-free.
-It will eventually wrap all the way around, but this ends up providing a strong
-guarantee for many allocation cycles due to the combination of 4 bit tags with
-the FIFO quarantine feature providing delayed free. It also benefits from
-random slot allocation and the randomized portion of delayed free, which result
-in a further delay along with preventing a deterministic bypass by forcing a
-reuse after a certain number of allocation cycles. Similarly to the initial tag
-generation, tag values for adjacent allocations will be skipped by incrementing
-past them.
+This ensures the following properties:

-For example, consider this slab of allocations that are not yet used with 15
-representing the tag for free memory. For the sake of simplicity, there will be
-no quarantine or other slabs for this example:
+- Linear overflows are deterministically detected.
+- Use-after-free is deterministically detected until the freed slot goes through
+  both the random and FIFO quarantines, gets allocated again, goes through both
+  quarantines again and then finally gets allocated again for a 2nd time.
+- Since the default `0` tag is reserved, untagged pointers can't access slab
+  allocations and vice versa.

-| 15 | 15 | 15 | 15 | 15 | 15 |
+Slab allocations are done in a statically reserved region for each size class
+and all metadata is in a statically reserved region, so interactions between
+different uses of the same address space are not applicable.

-Three slots are randomly chosen for allocations, with random tags assigned (2,
-7, 14) since these slots haven't ever been used and don't have saved values:
+Large allocations beyond the largest slab allocation size class (128k by
+default) are guaranteed to have randomly sized guard regions to the left and
+right. Random and FIFO address space quarantines provide use-after-free
+detection. We need to test whether the cost of random tags is acceptable to enable them by default,
+since they would be useful for:

-| 15 | 2 | 15 | 7 | 14 | 15 |
+- probabilistic detection of overflows
+- probabilistic detection of use-after-free once the address space is
+  out of the quarantine and reused for another allocation
+- deterministic detection of use-after-free for reuse by another allocator.

-The 2nd allocation slot is freed, and is set back to the tag for free memory
-(15), but with the previous tag value stored in the freed space:
-
-| 15 | 15 | 15 | 7 | 14 | 15 |
-
-The first slot is allocated for the first time, receiving the random value 3:
-
-| 3 | 15 | 15 | 7 | 14 | 15 |
-
-The 2nd slot is randomly chosen again, so the previous tag (2) is retrieved and
-incremented to 3 as part of the use-after-free mitigation. An adjacent
-allocation already uses the tag 3, so the tag is further incremented to 4 (it
-would be incremented to 5 if one of the adjacent tags was 4):
-
-| 3 | 4 | 15 | 7 | 14 | 15 |
-
-The last slot is randomly chosen for the next allocation, and is assigned the
-random value 14. However, it's placed next to an allocation with the tag 14 so
-the tag is incremented and wraps around to 0:
-
-| 3 | 4 | 15 | 7 | 14 | 0 |
+When memory tagging is enabled, checking for write-after-free at allocation
+time and checking canaries are both disabled. Canaries will be more thoroughly
+disabled when using memory tagging in the future, but Android currently has
+[very dynamic memory tagging support](https://source.android.com/docs/security/test/memory-safety/arm-mte)
+where it can be disabled at any time which creates a barrier to optimizing
+by disabling redundant features.
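A minimal C sketch of the four-way exclusion described above, which the tag_and_clear_slab_slot() function in the h_malloc.c diff further down this page implements against a packed 4-bit tag array. The slot_tags layout is simplified here to one byte per tag, with one always-zero padding entry on each side so edge slots need no branches; that simplification is an illustrative assumption:

    #include <stddef.h>
    #include <stdint.h>

    // Build the mask of tags to exclude for slot `slot_idx`. `slot_tags` holds the
    // most recent tag of each slot (0 if never used), shifted right by one entry.
    static uint64_t build_tag_exclusion_mask(const uint8_t *slot_tags, size_t slot_idx) {
        uint64_t mask = 1u << 0;                 // 1. the reserved 0 tag
        mask |= 1u << slot_tags[slot_idx + 1];   // 2. the previous tag used for this slot
        mask |= 1u << slot_tags[slot_idx];       // 3. the tag of the slot to the left
        mask |= 1u << slot_tags[slot_idx + 2];   // 4. the tag of the slot to the right
        return mask; // passed to IRG, which picks a random tag outside the mask
    }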

## API extensions
androidtest/Android.bp (new file): 25 lines
@@ -0,0 +1,25 @@
java_test_host {
    name: "HMallocTest",
    srcs: [
        "src/**/*.java",
    ],

    libs: [
        "tradefed",
        "compatibility-tradefed",
        "compatibility-host-util",
    ],

    static_libs: [
        "cts-host-utils",
        "frameworks-base-hostutils",
    ],

    test_suites: [
        "general-tests",
    ],

    data_device_bins_64: [
        "memtag_test",
    ],
}
androidtest/AndroidTest.xml (new file): 13 lines
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="utf-8"?>
<configuration description="hardened_malloc test">

    <target_preparer class="com.android.compatibility.common.tradefed.targetprep.FilePusher">
        <option name="cleanup" value="true" />
        <option name="push" value="memtag_test->/data/local/tmp/memtag_test" />
    </target_preparer>

    <test class="com.android.compatibility.common.tradefed.testtype.JarHostTest" >
        <option name="jar" value="HMallocTest.jar" />
    </test>

</configuration>
androidtest/memtag/Android.bp (new file): 17 lines
@@ -0,0 +1,17 @@
cc_test {
    name: "memtag_test",
    srcs: ["memtag_test.cc"],
    cflags: [
        "-Wall",
        "-Werror",
        "-Wextra",
        "-O0",
        "-march=armv9-a+memtag",
    ],

    compile_multilib: "64",

    sanitize: {
        memtag_heap: true,
    },
}
androidtest/memtag/memtag_test.cc (new file): 351 lines
@@ -0,0 +1,351 @@
// needed to unconditionally enable assertions
#undef NDEBUG
#include <assert.h>
#include <limits.h>       // INT_MAX
#include <malloc.h>
#include <math.h>         // log(), exp()
#include <signal.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>       // memset()
#include <sys/mman.h>
#include <sys/utsname.h>
#include <unistd.h>

#include <functional>     // std::function
#include <map>
#include <set>
#include <string>
#include <unordered_map>

#include "../../arm_mte.h"

using namespace std;

using u8 = uint8_t;
using uptr = uintptr_t;
using u64 = uint64_t;

const size_t DEFAULT_ALLOC_SIZE = 8;
const size_t CANARY_SIZE = 8;

void do_context_switch() {
    utsname s;
    uname(&s);
}

u8 get_pointer_tag(void *ptr) {
    return (((uptr) ptr) >> 56) & 0xf;
}

void *untag_pointer(void *ptr) {
    const uintptr_t mask = UINTPTR_MAX >> 8;
    return (void *) ((uintptr_t) ptr & mask);
}

void *set_pointer_tag(void *ptr, u8 tag) {
    return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr));
}

// This test checks that slab slot allocation uses a tag that is distinct from the tags of its
// neighbors and from the tag of the previous allocation that used the same slot
void tag_distinctness() {
    // tag 0 is reserved
    const int min_tag = 1;
    const int max_tag = 0xf;

    struct SizeClass {
        int size;
        int slot_cnt;
    };

    // values from size_classes[] and size_class_slots[] in h_malloc.c
    SizeClass size_classes[] = {
        { .size = 16, .slot_cnt = 256, },
        { .size = 32, .slot_cnt = 128, },
        // this size class is used by allocations that are made by the addr_tag_map, which breaks
        // tag distinctness checks
        // { .size = 48, .slot_cnt = 85, },
        { .size = 64, .slot_cnt = 64, },
        { .size = 80, .slot_cnt = 51, },
        { .size = 96, .slot_cnt = 42, },
        { .size = 112, .slot_cnt = 36, },
        { .size = 128, .slot_cnt = 64, },
        { .size = 160, .slot_cnt = 51, },
        { .size = 192, .slot_cnt = 64, },
        { .size = 224, .slot_cnt = 54, },
        { .size = 10240, .slot_cnt = 6, },
        { .size = 20480, .slot_cnt = 1, },
    };

    int tag_usage[max_tag + 1] = {};

    for (size_t sc_idx = 0; sc_idx < sizeof(size_classes) / sizeof(SizeClass); ++sc_idx) {
        SizeClass &sc = size_classes[sc_idx];

        const size_t full_alloc_size = sc.size;
        const size_t alloc_size = full_alloc_size - CANARY_SIZE;

        // "tdc" is short for "tag distinctness check"
        int left_neighbor_tdc_cnt = 0;
        int right_neighbor_tdc_cnt = 0;
        int prev_alloc_tdc_cnt = 0;

        int iter_cnt = 600;

        unordered_map<uptr, u8> addr_tag_map;
        addr_tag_map.reserve(iter_cnt * sc.slot_cnt);

        u64 seen_tags = 0;

        for (int iter = 0; iter < iter_cnt; ++iter) {
            uptr allocations[256]; // 256 is max slot count

            for (int i = 0; i < sc.slot_cnt; ++i) {
                u8 *p = (u8 *) malloc(alloc_size);
                assert(p);
                uptr addr = (uptr) untag_pointer(p);
                u8 tag = get_pointer_tag(p);

                assert(tag >= min_tag && tag <= max_tag);
                seen_tags |= 1 << tag;
                ++tag_usage[tag];

                // check most recent tags of left and right neighbors

                auto left = addr_tag_map.find(addr - full_alloc_size);
                if (left != addr_tag_map.end()) {
                    assert(left->second != tag);
                    ++left_neighbor_tdc_cnt;
                }

                auto right = addr_tag_map.find(addr + full_alloc_size);
                if (right != addr_tag_map.end()) {
                    assert(right->second != tag);
                    ++right_neighbor_tdc_cnt;
                }

                // check previous tag of this slot
                auto prev = addr_tag_map.find(addr);
                if (prev != addr_tag_map.end()) {
                    assert(prev->second != tag);
                    ++prev_alloc_tdc_cnt;
                    addr_tag_map.erase(addr);
                }

                addr_tag_map.emplace(addr, tag);

                for (size_t j = 0; j < alloc_size; ++j) {
                    // check that slot is zeroed
                    assert(p[j] == 0);
                    // check that slot is readable and writable
                    p[j]++;
                }

                allocations[i] = addr;
            }

            // free some of the allocations to allow their slots to be reused
            for (int i = sc.slot_cnt - 1; i >= 0; i -= 2) {
                free((void *) allocations[i]);
            }
        }

        // check that all of the tags were used, except for the reserved tag 0
        assert(seen_tags == (0xffff & ~(1 << 0)));

        printf("size_class\t%i\t" "tdc_left %i\t" "tdc_right %i\t" "tdc_prev_alloc %i\n",
               sc.size, left_neighbor_tdc_cnt, right_neighbor_tdc_cnt, prev_alloc_tdc_cnt);

        // make sure tag distinctness checks were actually performed
        int min_tdc_cnt = sc.slot_cnt * iter_cnt / 5;

        assert(prev_alloc_tdc_cnt > min_tdc_cnt);

        if (sc.slot_cnt > 1) {
            assert(left_neighbor_tdc_cnt > min_tdc_cnt);
            assert(right_neighbor_tdc_cnt > min_tdc_cnt);
        }

        // async tag check failures are reported on context switch
        do_context_switch();
    }

    printf("\nTag use counters:\n");

    int min = INT_MAX;
    int max = 0;
    double geomean = 0.0;
    for (int i = min_tag; i <= max_tag; ++i) {
        int v = tag_usage[i];
        geomean += log(v);
        min = std::min(min, v);
        max = std::max(max, v);
        printf("%i\t%i\n", i, tag_usage[i]);
    }
    int tag_cnt = 1 + max_tag - min_tag;
    geomean = exp(geomean / tag_cnt);

    double max_deviation = std::max((double) max - geomean, geomean - min);

    printf("geomean: %.2f, max deviation from geomean: %.2f%%\n", geomean, (100.0 * max_deviation) / geomean);
}

u8* alloc_default() {
    const size_t full_alloc_size = DEFAULT_ALLOC_SIZE + CANARY_SIZE;
    set<uptr> addrs;

    // make sure the allocation has both left and right neighbors, otherwise overflow/underflow
    // tests will fail when the allocation is at the end/beginning of a slab
    for (;;) {
        u8 *p = (u8 *) malloc(DEFAULT_ALLOC_SIZE);
        assert(p);

        uptr addr = (uptr) untag_pointer(p);
        uptr left = addr - full_alloc_size;
        if (addrs.find(left) != addrs.end()) {
            uptr right = addr + full_alloc_size;
            if (addrs.find(right) != addrs.end()) {
                return p;
            }
        }

        addrs.emplace(addr);
    }
}

int expected_segv_code;

#define expect_segv(exp, segv_code) ({\
    expected_segv_code = segv_code; \
    volatile auto val = exp; \
    (void) val; \
    do_context_switch(); \
    fprintf(stderr, "didn't receive SEGV code %i", segv_code); \
    exit(1); })

// it's expected that the device is configured to use the asymm MTE tag checking mode
// (sync read checks, async write checks)
#define expect_read_segv(exp) expect_segv(exp, SEGV_MTESERR)
#define expect_write_segv(exp) expect_segv(exp, SEGV_MTEAERR)

void read_after_free() {
    u8 *p = alloc_default();
    free(p);
    expect_read_segv(p[0]);
}

void write_after_free() {
    u8 *p = alloc_default();
    free(p);
    expect_write_segv(p[0] = 1);
}

void underflow_read() {
    u8 *p = alloc_default();
    expect_read_segv(p[-1]);
}

void underflow_write() {
    u8 *p = alloc_default();
    expect_write_segv(p[-1] = 1);
}

void overflow_read() {
    u8 *p = alloc_default();
    expect_read_segv(p[DEFAULT_ALLOC_SIZE + CANARY_SIZE]);
}

void overflow_write() {
    u8 *p = alloc_default();
    expect_write_segv(p[DEFAULT_ALLOC_SIZE + CANARY_SIZE] = 1);
}

void untagged_read() {
    u8 *p = alloc_default();
    p = (u8 *) untag_pointer(p);
    expect_read_segv(p[0]);
}

void untagged_write() {
    u8 *p = alloc_default();
    p = (u8 *) untag_pointer(p);
    expect_write_segv(p[0] = 1);
}

// checks that each of the memory locations inside the buffer is tagged with expected_tag
void check_tag(void *buf, size_t len, u8 expected_tag) {
    for (size_t i = 0; i < len; ++i) {
        assert(get_pointer_tag(__arm_mte_get_tag((void *) ((uintptr_t) buf + i))) == expected_tag);
    }
}

void madvise_dontneed() {
    const size_t len = 100'000;
    void *ptr = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_MTE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
    assert(ptr != MAP_FAILED);

    // check that 0 is the initial tag
    check_tag(ptr, len, 0);

    arm_mte_tag_and_clear_mem(set_pointer_tag(ptr, 1), len);
    check_tag(ptr, len, 1);

    memset(set_pointer_tag(ptr, 1), 1, len);

    assert(madvise(ptr, len, MADV_DONTNEED) == 0);
    // check that MADV_DONTNEED resets the tag
    check_tag(ptr, len, 0);

    // check that MADV_DONTNEED clears the memory
    for (size_t i = 0; i < len; ++i) {
        assert(((u8 *) ptr)[i] == 0);
    }

    // check that a mistagged read after MADV_DONTNEED fails
    expect_read_segv(*((u8 *) set_pointer_tag(ptr, 1)));
}

map<string, function<void()>> tests = {
#define TEST(s) { #s, s }
    TEST(tag_distinctness),
    TEST(read_after_free),
    TEST(write_after_free),
    TEST(overflow_read),
    TEST(overflow_write),
    TEST(underflow_read),
    TEST(underflow_write),
    TEST(untagged_read),
    TEST(untagged_write),
    TEST(madvise_dontneed),
#undef TEST
};

void segv_handler(int, siginfo_t *si, void *) {
    if (expected_segv_code == 0 || expected_segv_code != si->si_code) {
        fprintf(stderr, "received unexpected SEGV_CODE %i", si->si_code);
        exit(139); // standard exit code for SIGSEGV
    }

    exit(0);
}

int main(int argc, char **argv) {
    setbuf(stdout, NULL);
    assert(argc == 2);

    auto test_name = string(argv[1]);
    auto test_fn = tests[test_name];
    assert(test_fn != nullptr);

    assert(mallopt(M_BIONIC_SET_HEAP_TAGGING_LEVEL, M_HEAP_TAGGING_LEVEL_ASYNC) == 1);

    struct sigaction sa = {
        .sa_sigaction = segv_handler,
        .sa_flags = SA_SIGINFO,
    };

    assert(sigaction(SIGSEGV, &sa, nullptr) == 0);

    test_fn();
    do_context_switch();

    return 0;
}
androidtest/src/grapheneos/hmalloc/MemtagTest.java (new file): 79 lines
@@ -0,0 +1,79 @@
package grapheneos.hmalloc;

import com.android.tradefed.device.DeviceNotAvailableException;
import com.android.tradefed.testtype.DeviceJUnit4ClassRunner;
import com.android.tradefed.testtype.junit4.BaseHostJUnit4Test;

import org.junit.Test;
import org.junit.runner.RunWith;

import java.util.ArrayList;

import static org.junit.Assert.assertEquals;

@RunWith(DeviceJUnit4ClassRunner.class)
public class MemtagTest extends BaseHostJUnit4Test {
    private static final String TEST_BINARY = "/data/local/tmp/memtag_test";

    private void runTest(String name) throws DeviceNotAvailableException {
        var args = new ArrayList<String>();
        args.add(TEST_BINARY);
        args.add(name);
        String cmdLine = String.join(" ", args);

        var result = getDevice().executeShellV2Command(cmdLine);

        assertEquals("stderr", "", result.getStderr());
        assertEquals("process exit code", 0, result.getExitCode().intValue());
    }

    @Test
    public void tag_distinctness() throws DeviceNotAvailableException {
        runTest("tag_distinctness");
    }

    @Test
    public void read_after_free() throws DeviceNotAvailableException {
        runTest("read_after_free");
    }

    @Test
    public void write_after_free() throws DeviceNotAvailableException {
        runTest("write_after_free");
    }

    @Test
    public void underflow_read() throws DeviceNotAvailableException {
        runTest("underflow_read");
    }

    @Test
    public void underflow_write() throws DeviceNotAvailableException {
        runTest("underflow_write");
    }

    @Test
    public void overflow_read() throws DeviceNotAvailableException {
        runTest("overflow_read");
    }

    @Test
    public void overflow_write() throws DeviceNotAvailableException {
        runTest("overflow_write");
    }

    @Test
    public void untagged_read() throws DeviceNotAvailableException {
        runTest("untagged_read");
    }

    @Test
    public void untagged_write() throws DeviceNotAvailableException {
        runTest("untagged_write");
    }

    @Test
    public void madvise_dontneed() throws DeviceNotAvailableException {
        runTest("madvise_dontneed");
    }
}
arm_mte.h (new file): 91 lines
@@ -0,0 +1,91 @@
#ifndef ARM_MTE_H
#define ARM_MTE_H

#include <arm_acle.h>
#include <stddef.h>
#include <stdint.h>

// Returns a tagged pointer.
// See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions/IRG--Insert-Random-Tag-
static inline void *arm_mte_create_random_tag(void *p, uint64_t exclusion_mask) {
    return __arm_mte_create_random_tag(p, exclusion_mask);
}

// Tag the memory region with the tag specified in the tag bits of tagged_ptr. The memory region
// itself is zeroed.
// tagged_ptr has to be aligned by 16, and len has to be a multiple of 16 (tag granule size).
//
// Arm's software optimization guide says:
// "it is recommended to use STZGM (or DCZGVA) to set tag if data is not a concern." (STZGM and
// DCGZVA are zeroing variants of tagging instructions).
//
// Contents of this function were copied from scudo:
// https://android.googlesource.com/platform/external/scudo/+/refs/tags/android-14.0.0_r1/standalone/memtag.h#167
//
// scudo is licensed under the Apache License v2.0 with LLVM Exceptions, which is compatible with
// hardened_malloc's MIT license
static inline void arm_mte_tag_and_clear_mem(void *tagged_ptr, size_t len) {
    uintptr_t Begin = (uintptr_t) tagged_ptr;
    uintptr_t End = Begin + len;
    uintptr_t LineSize, Next, Tmp;
    __asm__ __volatile__(
        ".arch_extension memtag \n\t"

        // Compute the cache line size in bytes (DCZID_EL0 stores it as the log2
        // of the number of 4-byte words) and bail out to the slow path if DCZID_EL0
        // indicates that the DC instructions are unavailable.
        "DCZID .req %[Tmp] \n\t"
        "mrs DCZID, dczid_el0 \n\t"
        "tbnz DCZID, #4, 3f \n\t"
        "and DCZID, DCZID, #15 \n\t"
        "mov %[LineSize], #4 \n\t"
        "lsl %[LineSize], %[LineSize], DCZID \n\t"
        ".unreq DCZID \n\t"

        // Our main loop doesn't handle the case where we don't need to perform any
        // DC GZVA operations. If the size of our tagged region is less than
        // twice the cache line size, bail out to the slow path since it's not
        // guaranteed that we'll be able to do a DC GZVA.
        "Size .req %[Tmp] \n\t"
        "sub Size, %[End], %[Cur] \n\t"
        "cmp Size, %[LineSize], lsl #1 \n\t"
        "b.lt 3f \n\t"
        ".unreq Size \n\t"

        "LineMask .req %[Tmp] \n\t"
        "sub LineMask, %[LineSize], #1 \n\t"

        // STZG until the start of the next cache line.
        "orr %[Next], %[Cur], LineMask \n\t"

        "1:\n\t"
        "stzg %[Cur], [%[Cur]], #16 \n\t"
        "cmp %[Cur], %[Next] \n\t"
        "b.lt 1b \n\t"

        // DC GZVA cache lines until we have no more full cache lines.
        "bic %[Next], %[End], LineMask \n\t"
        ".unreq LineMask \n\t"

        "2: \n\t"
        "dc gzva, %[Cur] \n\t"
        "add %[Cur], %[Cur], %[LineSize] \n\t"
        "cmp %[Cur], %[Next] \n\t"
        "b.lt 2b \n\t"

        // STZG until the end of the tagged region. This loop is also used to handle
        // slow path cases.

        "3: \n\t"
        "cmp %[Cur], %[End] \n\t"
        "b.ge 4f \n\t"
        "stzg %[Cur], [%[Cur]], #16 \n\t"
        "b 3b \n\t"

        "4: \n\t"

        : [Cur] "+&r"(Begin), [LineSize] "=&r"(LineSize), [Next] "=&r"(Next), [Tmp] "=&r"(Tmp)
        : [End] "r"(End)
        : "memory"
    );
}
#endif
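As a usage sketch (not part of the diff), the two helpers above compose like this: pick a random tag excluding some values, then retag and zero the granules through the tagged pointer. The buffer and the exclusion mask below are illustrative assumptions; the real callers are tag_and_clear_slab_slot() in the h_malloc.c diff below and the MTE test above:

    #include <stddef.h>
    #include <stdint.h>

    #include "arm_mte.h"

    // Illustrative only: retag and zero a 16-byte-aligned buffer whose length is a
    // multiple of 16. The mask arbitrarily excludes tags 0 and 7; real callers build
    // it from the reserved tag and the neighboring slots' tags.
    static void *retag_buffer_sketch(void *buf, size_t len) {
        uint64_t exclusion_mask = (1u << 0) | (1u << 7);
        void *tagged = arm_mte_create_random_tag(buf, exclusion_mask);
        arm_mte_tag_and_clear_mem(tagged, len); // zeroes the region as it tags it
        return tagged; // accesses must now go through this tagged pointer
    }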
h_malloc.c: 201 lines changed
@@ -14,6 +14,7 @@

 #include "h_malloc.h"
 #include "memory.h"
+#include "memtag.h"
 #include "mutex.h"
 #include "pages.h"
 #include "random.h"

@@ -75,6 +76,9 @@ static union {
         struct region_metadata *regions[2];
 #ifdef USE_PKEY
         int metadata_pkey;
 #endif
+#ifdef MEMTAG
+        bool is_memtag_disabled;
+#endif
     };
     char padding[PAGE_SIZE];

@@ -84,6 +88,30 @@ static inline void *get_slab_region_end(void) {
     return atomic_load_explicit(&ro.slab_region_end, memory_order_acquire);
 }

+#ifdef MEMTAG
+static inline bool is_memtag_enabled(void) {
+    return !ro.is_memtag_disabled;
+}
+#endif
+
+static void *memory_map_tagged(size_t size) {
+#ifdef HAS_ARM_MTE
+    if (likely51(is_memtag_enabled())) {
+        return memory_map_mte(size);
+    }
+#endif
+    return memory_map(size);
+}
+
+static bool memory_map_fixed_tagged(void *ptr, size_t size) {
+#ifdef HAS_ARM_MTE
+    if (likely51(is_memtag_enabled())) {
+        return memory_map_fixed_mte(ptr, size);
+    }
+#endif
+    return memory_map_fixed(ptr, size);
+}
+
 #define SLAB_METADATA_COUNT

 struct slab_metadata {

@@ -99,6 +127,18 @@ struct slab_metadata {
 #if SLAB_QUARANTINE
     u64 quarantine_bitmap[4];
 #endif
+#ifdef HAS_ARM_MTE
+    // arm_mte_tags is used as a u4 array (MTE tags are 4-bit wide)
+    //
+    // Its size is calculated by the following formula:
+    // (MAX_SLAB_SLOT_COUNT + 2) / 2
+    // MAX_SLAB_SLOT_COUNT is currently 256; 2 extra slots are needed for branchless handling of
+    // edge slots in tag_and_clear_slab_slot()
+    //
+    // It's intentionally placed at the end of the struct to improve locality: for most size
+    // classes, the slot count is far lower than MAX_SLAB_SLOT_COUNT.
+    u8 arm_mte_tags[129];
+#endif
 };

 static const size_t min_align = 16;

@@ -447,6 +487,12 @@ static void write_after_free_check(const char *p, size_t size) {
         return;
     }

+#ifdef HAS_ARM_MTE
+    if (likely51(is_memtag_enabled())) {
+        return;
+    }
+#endif
+
     for (size_t i = 0; i < size; i += sizeof(u64)) {
         if (unlikely(*(const u64 *)(const void *)(p + i))) {
             fatal_error("detected write after free");

@@ -461,19 +507,48 @@ static void set_slab_canary_value(UNUSED struct slab_metadata *metadata, UNUSED
         0x00ffffffffffffffUL;

     metadata->canary_value = get_random_u64(rng) & canary_mask;
+#ifdef HAS_ARM_MTE
+    if (unlikely(metadata->canary_value == 0)) {
+        // 0 is reserved to support disabling MTE at runtime (this is required on Android).
+        // When MTE is enabled, writing and reading of canaries is disabled, i.e. the canary remains zeroed.
+        // After MTE is disabled, canaries that are set to 0 are ignored, since they wouldn't match
+        // the slab's metadata->canary_value.
+        // 0x100 was chosen arbitrarily, and can be encoded as an immediate value on ARM by the compiler.
+        metadata->canary_value = 0x100;
+    }
+#endif
 #endif
 }

 static void set_canary(UNUSED const struct slab_metadata *metadata, UNUSED void *p, UNUSED size_t size) {
 #if SLAB_CANARY
+#ifdef HAS_ARM_MTE
+    if (likely51(is_memtag_enabled())) {
+        return;
+    }
+#endif
+
     memcpy((char *)p + size - canary_size, &metadata->canary_value, canary_size);
 #endif
 }

 static void check_canary(UNUSED const struct slab_metadata *metadata, UNUSED const void *p, UNUSED size_t size) {
 #if SLAB_CANARY
+#ifdef HAS_ARM_MTE
+    if (likely51(is_memtag_enabled())) {
+        return;
+    }
+#endif
+
     u64 canary_value;
     memcpy(&canary_value, (const char *)p + size - canary_size, canary_size);

+#ifdef HAS_ARM_MTE
+    if (unlikely(canary_value == 0)) {
+        return;
+    }
+#endif
+
     if (unlikely(canary_value != metadata->canary_value)) {
         fatal_error("canary corrupted");
     }

@@ -506,6 +581,38 @@ static inline void stats_slab_deallocate(UNUSED struct size_class *c, UNUSED siz
 #endif
 }

+#ifdef HAS_ARM_MTE
+static void *tag_and_clear_slab_slot(struct slab_metadata *metadata, void *slot_ptr, size_t slot_idx, size_t slot_size) {
+    // arm_mte_tags is an array of 4-bit unsigned integers stored as a u8 array (MTE tags are 4-bit wide)
+    //
+    // It stores the most recent tag for each slab slot, or 0 if the slot was never used.
+    // Slab indices in the arm_mte_tags array are shifted to the right by 1, and the size of this
+    // array is (MAX_SLAB_SLOT_COUNT + 2). This means that the first and last values of the
+    // arm_mte_tags array are always 0, which allows edge slots to be handled in a branchless way
+    // when the tag exclusion mask is constructed.
+    u8 *slot_tags = metadata->arm_mte_tags;
+
+    // tag exclusion mask
+    u64 tem = (1 << RESERVED_TAG);
+
+    // current or previous tag of the left neighbor, or 0 if there's no left neighbor or if it was never used
+    tem |= (1 << u4_arr_get(slot_tags, slot_idx));
+    // previous tag of this slot, or 0 if it was never used
+    tem |= (1 << u4_arr_get(slot_tags, slot_idx + 1));
+    // current or previous tag of the right neighbor, or 0 if there's no right neighbor or if it was never used
+    tem |= (1 << u4_arr_get(slot_tags, slot_idx + 2));
+
+    void *tagged_ptr = arm_mte_create_random_tag(slot_ptr, tem);
+    // slot addresses and sizes are always aligned by 16
+    arm_mte_tag_and_clear_mem(tagged_ptr, slot_size);
+
+    // store the new tag of this slot
+    u4_arr_set(slot_tags, slot_idx + 1, get_pointer_tag(tagged_ptr));
+
+    return tagged_ptr;
+}
+#endif
+
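The u4_arr_get/u4_arr_set helpers used above are not shown in this diff. A plausible packed-nibble implementation is sketched below; the low-nibble-first packing order is an assumption for illustration and may differ from the project's actual util.h:

    #include <stddef.h>
    #include <stdint.h>

    typedef uint8_t u8;

    // Two 4-bit tags per byte, low nibble first (assumed layout).
    static inline u8 u4_arr_get(const u8 *arr, size_t idx) {
        u8 packed = arr[idx >> 1];
        return (idx & 1) ? (u8) (packed >> 4) : (u8) (packed & 0xf);
    }

    static inline void u4_arr_set(u8 *arr, size_t idx, u8 value) {
        u8 *slot = &arr[idx >> 1];
        if (idx & 1) {
            *slot = (u8) ((*slot & 0x0f) | (value << 4));
        } else {
            *slot = (u8) ((*slot & 0xf0) | (value & 0xf));
        }
    }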
|
||||
static inline void *allocate_small(unsigned arena, size_t requested_size) {
|
||||
struct size_info info = get_size_info(requested_size);
|
||||
size_t size = likely(info.size) ? info.size : 16;
|
||||
@ -534,6 +641,11 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) {
|
||||
if (requested_size) {
|
||||
write_after_free_check(p, size - canary_size);
|
||||
set_canary(metadata, p, size);
|
||||
#ifdef HAS_ARM_MTE
|
||||
if (likely51(is_memtag_enabled())) {
|
||||
p = tag_and_clear_slab_slot(metadata, p, slot, size);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
stats_small_allocate(c, size);
|
||||
|
||||
@ -566,6 +678,11 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) {
|
||||
void *p = slot_pointer(size, slab, slot);
|
||||
if (requested_size) {
|
||||
set_canary(metadata, p, size);
|
||||
#ifdef HAS_ARM_MTE
|
||||
if (likely51(is_memtag_enabled())) {
|
||||
p = tag_and_clear_slab_slot(metadata, p, slot, size);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
stats_slab_allocate(c, slab_size);
|
||||
stats_small_allocate(c, size);
|
||||
@ -588,6 +705,11 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) {
|
||||
void *p = slot_pointer(size, slab, slot);
|
||||
if (requested_size) {
|
||||
set_canary(metadata, p, size);
|
||||
#ifdef HAS_ARM_MTE
|
||||
if (likely51(is_memtag_enabled())) {
|
||||
p = tag_and_clear_slab_slot(metadata, p, slot, size);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
stats_slab_allocate(c, slab_size);
|
||||
stats_small_allocate(c, size);
|
||||
@ -612,6 +734,11 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) {
|
||||
if (requested_size) {
|
||||
write_after_free_check(p, size - canary_size);
|
||||
set_canary(metadata, p, size);
|
||||
#ifdef HAS_ARM_MTE
|
||||
if (likely51(is_memtag_enabled())) {
|
||||
p = tag_and_clear_slab_slot(metadata, p, slot, size);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
stats_small_allocate(c, size);
|
||||
|
||||
@ -694,7 +821,16 @@ static inline void deallocate_small(void *p, const size_t *expected_size) {
|
||||
if (likely(!is_zero_size)) {
|
||||
check_canary(metadata, p, size);
|
||||
|
||||
if (ZERO_ON_FREE) {
|
||||
bool skip_zero = false;
|
||||
#ifdef HAS_ARM_MTE
|
||||
if (likely51(is_memtag_enabled())) {
|
||||
arm_mte_tag_and_clear_mem(set_pointer_tag(p, RESERVED_TAG), size);
|
||||
// metadata->arm_mte_tags is intentionally not updated, see tag_and_clear_slab_slot()
|
||||
skip_zero = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (ZERO_ON_FREE && !skip_zero) {
|
||||
memset(p, 0, size - canary_size);
|
||||
}
|
||||
}
@ -772,7 +908,7 @@ static inline void deallocate_small(void *p, const size_t *expected_size) {

if (c->empty_slabs_total + slab_size > max_empty_slabs_total) {
int saved_errno = errno;
if (!memory_map_fixed(slab, slab_size)) {
if (!memory_map_fixed_tagged(slab, slab_size)) {
label_slab(slab, slab_size, class);
stats_slab_deallocate(c, slab_size);
enqueue_free_slab(c, metadata);
@ -1074,13 +1210,14 @@ static inline void enforce_init(void) {
}
}

COLD static void init_slow_path(void) {
static struct mutex lock = MUTEX_INITIALIZER;
static struct mutex init_lock = MUTEX_INITIALIZER;

mutex_lock(&lock);
COLD static void init_slow_path(void) {

mutex_lock(&init_lock);

if (unlikely(is_init())) {
mutex_unlock(&lock);
mutex_unlock(&init_lock);
return;
}

@ -1123,8 +1260,7 @@ COLD static void init_slow_path(void) {
if (unlikely(memory_protect_rw_metadata(ra->regions, ra->total * sizeof(struct region_metadata)))) {
fatal_error("failed to unprotect memory for regions table");
}

ro.slab_region_start = memory_map(slab_region_size);
ro.slab_region_start = memory_map_tagged(slab_region_size);
if (unlikely(ro.slab_region_start == NULL)) {
fatal_error("failed to allocate slab region");
}
@ -1164,7 +1300,7 @@ COLD static void init_slow_path(void) {
}
memory_set_name(&ro, sizeof(ro), "malloc read-only after init");

mutex_unlock(&lock);
mutex_unlock(&init_lock);

// may allocate, so wait until the allocator is initialized to avoid deadlocking
if (unlikely(pthread_atfork(full_lock, full_unlock, post_fork_child))) {
@ -1368,6 +1504,11 @@ EXPORT void *h_calloc(size_t nmemb, size_t size) {
if (!ZERO_ON_FREE && likely(p != NULL) && total_size && total_size <= max_slab_size_class) {
memset(p, 0, total_size - canary_size);
}
#ifdef HAS_ARM_MTE
// use an assert instead of adding a conditional to memset() above (freed memory is always
// zeroed when MTE is enabled)
static_assert(ZERO_ON_FREE, "disabling ZERO_ON_FREE reduces performance when ARM MTE is enabled");
#endif
return p;
}

@ -1385,11 +1526,14 @@ EXPORT void *h_realloc(void *old, size_t size) {
}
}

void *old_orig = old;
old = untag_pointer(old);

size_t old_size;
if (old < get_slab_region_end() && old >= ro.slab_region_start) {
old_size = slab_usable_size(old);
if (size <= max_slab_size_class && get_size_info(size).size == old_size) {
return old;
return old_orig;
}
thread_unseal_metadata();
} else {
@ -1502,7 +1646,7 @@ EXPORT void *h_realloc(void *old, size_t size) {
if (copy_size > 0 && copy_size <= max_slab_size_class) {
copy_size -= canary_size;
}
memcpy(new, old, copy_size);
memcpy(new, old_orig, copy_size);
if (old_size <= max_slab_size_class) {
deallocate_small(old, NULL);
} else {
@ -1543,6 +1687,8 @@ EXPORT void h_free(void *p) {
return;
}

p = untag_pointer(p);

if (p < get_slab_region_end() && p >= ro.slab_region_start) {
thread_unseal_metadata();
deallocate_small(p, NULL);
@ -1566,6 +1712,8 @@ EXPORT void h_free_sized(void *p, size_t expected_size) {
return;
}

p = untag_pointer(p);

expected_size = adjust_size_for_canary(expected_size);

if (p < get_slab_region_end() && p >= ro.slab_region_start) {
@ -1619,11 +1767,13 @@ static inline void memory_corruption_check_small(const void *p) {
mutex_unlock(&c->lock);
}

EXPORT size_t h_malloc_usable_size(H_MALLOC_USABLE_SIZE_CONST void *p) {
if (p == NULL) {
EXPORT size_t h_malloc_usable_size(H_MALLOC_USABLE_SIZE_CONST void *arg) {
if (arg == NULL) {
return 0;
}

const void *p = untag_const_pointer(arg);

if (p < get_slab_region_end() && p >= ro.slab_region_start) {
thread_unseal_metadata();
memory_corruption_check_small(p);
@ -1755,7 +1905,7 @@ EXPORT int h_malloc_trim(UNUSED size_t pad) {
struct slab_metadata *iterator = c->empty_slabs;
while (iterator) {
void *slab = get_slab(c, slab_size, iterator);
if (memory_map_fixed(slab, slab_size)) {
if (memory_map_fixed_tagged(slab, slab_size)) {
break;
}
label_slab(slab, slab_size, class);
@ -2025,3 +2175,26 @@ COLD EXPORT int h_malloc_set_state(UNUSED void *state) {
return -2;
}
#endif

#ifdef __ANDROID__
COLD EXPORT void h_malloc_disable_memory_tagging(void) {
#ifdef HAS_ARM_MTE
mutex_lock(&init_lock);
if (!ro.is_memtag_disabled) {
if (is_init()) {
if (unlikely(memory_protect_rw(&ro, sizeof(ro)))) {
fatal_error("failed to unprotect allocator data");
}
ro.is_memtag_disabled = true;
if (unlikely(memory_protect_ro(&ro, sizeof(ro)))) {
fatal_error("failed to protect allocator data");
}
} else {
// bionic calls this function very early in some cases
ro.is_memtag_disabled = true;
}
}
mutex_unlock(&init_lock);
#endif
}
#endif
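h_malloc_disable_memory_tagging() follows the allocator's read-only-after-init discipline: `ro` normally sits on read-only pages and is flipped writable only around the store. A condensed sketch of that pattern, assuming a page-aligned state struct (ro_state and set_flag are illustrative names, not allocator API):

#include <stdbool.h>
#include <sys/mman.h>

// Sketch of the protect/write/reprotect discipline used around `ro` above;
// assumes the state occupies its own page so mprotect() can target it alone.
static struct { _Alignas(4096) bool is_memtag_disabled; } ro_state;

static void set_flag(void) {
    if (mprotect(&ro_state, sizeof(ro_state), PROT_READ | PROT_WRITE)) {
        return; // the real code calls fatal_error() here
    }
    ro_state.is_memtag_disabled = true;
    mprotect(&ro_state, sizeof(ro_state), PROT_READ); // likewise fatal on failure
}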

@ -99,6 +99,7 @@ int h_malloc_iterate(uintptr_t base, size_t size, void (*callback)(uintptr_t ptr
void *arg);
void h_malloc_disable(void);
void h_malloc_enable(void);
void h_malloc_disable_memory_tagging(void);
#endif

// hardened_malloc extensions

30
memory.c
@ -17,8 +17,8 @@
#include "memory.h"
#include "util.h"

void *memory_map(size_t size) {
void *p = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
static void *memory_map_prot(size_t size, int prot) {
void *p = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
if (unlikely(p == MAP_FAILED)) {
if (errno != ENOMEM) {
fatal_error("non-ENOMEM mmap failure");
@ -28,8 +28,19 @@ void *memory_map(size_t size) {
return p;
}

bool memory_map_fixed(void *ptr, size_t size) {
void *p = mmap(ptr, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
void *memory_map(size_t size) {
return memory_map_prot(size, PROT_NONE);
}

#ifdef HAS_ARM_MTE
// Note that PROT_MTE can't be cleared via mprotect
void *memory_map_mte(size_t size) {
return memory_map_prot(size, PROT_MTE);
}
#endif

static bool memory_map_fixed_prot(void *ptr, size_t size, int prot) {
void *p = mmap(ptr, size, prot, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
bool ret = p == MAP_FAILED;
if (unlikely(ret) && errno != ENOMEM) {
fatal_error("non-ENOMEM MAP_FIXED mmap failure");
@ -37,6 +48,17 @@ bool memory_map_fixed(void *ptr, size_t size) {
return ret;
}

bool memory_map_fixed(void *ptr, size_t size) {
return memory_map_fixed_prot(ptr, size, PROT_NONE);
}

#ifdef HAS_ARM_MTE
// Note that PROT_MTE can't be cleared via mprotect
bool memory_map_fixed_mte(void *ptr, size_t size) {
return memory_map_fixed_prot(ptr, size, PROT_MTE);
}
#endif

bool memory_unmap(void *ptr, size_t size) {
bool ret = munmap(ptr, size);
if (unlikely(ret) && errno != ENOMEM) {

6
memory.h
@ -11,7 +11,13 @@
int get_metadata_key(void);

void *memory_map(size_t size);
#ifdef HAS_ARM_MTE
void *memory_map_mte(size_t size);
#endif
bool memory_map_fixed(void *ptr, size_t size);
#ifdef HAS_ARM_MTE
bool memory_map_fixed_mte(void *ptr, size_t size);
#endif
bool memory_unmap(void *ptr, size_t size);
bool memory_protect_ro(void *ptr, size_t size);
bool memory_protect_rw(void *ptr, size_t size);
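The h_malloc.c hunks above call memory_map_tagged() and memory_map_fixed_tagged(), whose definitions fall outside this excerpt. Given the MTE variants declared here, a plausible sketch is a runtime dispatch on is_memtag_enabled() as used in the first hunk (the dispatch shown is an assumption, not part of the diff):

#include <stdbool.h>
#include <stddef.h>
#include "memory.h"

// Sketch (assumption): use a PROT_MTE mapping only when MTE support is
// compiled in and enabled at runtime; otherwise fall back to PROT_NONE.
static void *memory_map_tagged(size_t size) {
#ifdef HAS_ARM_MTE
    if (is_memtag_enabled()) {
        return memory_map_mte(size);
    }
#endif
    return memory_map(size);
}

static bool memory_map_fixed_tagged(void *ptr, size_t size) {
#ifdef HAS_ARM_MTE
    if (is_memtag_enabled()) {
        return memory_map_fixed_mte(ptr, size);
    }
#endif
    return memory_map_fixed(ptr, size);
}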

50
memtag.h
Normal file
@ -0,0 +1,50 @@
#ifndef MEMTAG_H
#define MEMTAG_H

#include "util.h"

#ifdef HAS_ARM_MTE
#include "arm_mte.h"
#define MEMTAG 1
// Note that bionic libc always reserves tag 0 via PR_MTE_TAG_MASK prctl
#define RESERVED_TAG 0
#define TAG_WIDTH 4
#endif

static inline void *untag_pointer(void *ptr) {
#ifdef HAS_ARM_MTE
const uintptr_t mask = UINTPTR_MAX >> 8;
return (void *) ((uintptr_t) ptr & mask);
#else
return ptr;
#endif
}

static inline const void *untag_const_pointer(const void *ptr) {
#ifdef HAS_ARM_MTE
const uintptr_t mask = UINTPTR_MAX >> 8;
return (const void *) ((uintptr_t) ptr & mask);
#else
return ptr;
#endif
}

static inline void *set_pointer_tag(void *ptr, u8 tag) {
#ifdef HAS_ARM_MTE
return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr));
#else
(void) tag;
return ptr;
#endif
}

static inline u8 get_pointer_tag(void *ptr) {
#ifdef HAS_ARM_MTE
return (((uintptr_t) ptr) >> 56) & 0xf;
#else
(void) ptr;
return 0;
#endif
}

#endif
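A quick round trip through the new helpers: the tag lives in bits 56-59 of the pointer (TAG_WIDTH 4), and on builds without HAS_ARM_MTE every helper degrades to a no-op (tag_round_trip and the tag value 3 are illustrative):

#include <assert.h>
#include "memtag.h"

static void tag_round_trip(void *p) {
    void *tagged = set_pointer_tag(p, 3); // store tag 3 in the pointer's top byte
#ifdef HAS_ARM_MTE
    assert(get_pointer_tag(tagged) == 3);
    assert(untag_pointer(tagged) == p);   // assumes p's top byte was clear to begin with
#else
    assert(tagged == p);                  // no-op without MTE
#endif
}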
@ -1,5 +1,6 @@
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#if defined(__GLIBC__) || defined(__ANDROID__)
#include <malloc.h>

455
third_party/libdivide.h
vendored
@ -1,8 +1,8 @@
// libdivide.h - Optimized integer division
// https://libdivide.com
//
// Copyright (C) 2010 - 2021 ridiculous_fish, <libdivide@ridiculousfish.com>
// Copyright (C) 2016 - 2021 Kim Walisch, <kim.walisch@gmail.com>
// Copyright (C) 2010 - 2022 ridiculous_fish, <libdivide@ridiculousfish.com>
// Copyright (C) 2016 - 2022 Kim Walisch, <kim.walisch@gmail.com>
//
// libdivide is dual-licensed under the Boost or zlib licenses.
// You may use libdivide under the terms of either of these.
@ -11,11 +11,14 @@
#ifndef LIBDIVIDE_H
#define LIBDIVIDE_H

#define LIBDIVIDE_VERSION "5.0"
// *** Version numbers are auto generated - do not edit ***
#define LIBDIVIDE_VERSION "5.2.0"
#define LIBDIVIDE_VERSION_MAJOR 5
#define LIBDIVIDE_VERSION_MINOR 0
#define LIBDIVIDE_VERSION_MINOR 2
#define LIBDIVIDE_VERSION_PATCH 0

#include <stdint.h>

#if !defined(__AVR__)
#include <stdio.h>
#include <stdlib.h>
@ -24,20 +27,29 @@
#if defined(LIBDIVIDE_SSE2)
#include <emmintrin.h>
#endif

#if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)
#include <immintrin.h>
#endif

#if defined(LIBDIVIDE_NEON)
#include <arm_neon.h>
#endif

// Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics
#if defined(_MSC_VER) && defined(LIBDIVIDE_X86_64) && (!defined(__clang__) || _MSC_VER>1930)
#define LIBDIVIDE_X64_INTRINSICS
#endif

#if defined(_MSC_VER)
#if defined(LIBDIVIDE_X64_INTRINSICS)
#include <intrin.h>
#endif
#pragma warning(push)
// disable warning C4146: unary minus operator applied
// to unsigned type, result still unsigned
#pragma warning(disable : 4146)
// disable warning C4204: nonstandard extension used : non-constant aggregate
// initializer
//
// It's valid C99
@ -238,24 +250,32 @@ static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfr
static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
int16_t numer, int16_t magic, uint8_t more);
static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
int16_t numer, const struct libdivide_s16_t* denom);
int16_t numer, const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
uint16_t numer, uint16_t magic, uint8_t more);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
uint16_t numer, const struct libdivide_u16_t* denom);
uint16_t numer, const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(
int32_t numer, int32_t magic, uint8_t more);
static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
int32_t numer, const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(
uint32_t numer, uint32_t magic, uint8_t more);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(
uint32_t numer, const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(
int64_t numer, int64_t magic, uint8_t more);
static LIBDIVIDE_INLINE int64_t libdivide_s64_do(
int64_t numer, const struct libdivide_s64_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(
uint64_t numer, uint64_t magic, uint8_t more);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(
uint64_t numer, const struct libdivide_u64_t *denom);

static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
int16_t numer, const struct libdivide_s16_branchfree_t* denom);
int16_t numer, const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
uint16_t numer, const struct libdivide_u16_branchfree_t* denom);
uint16_t numer, const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(
int32_t numer, const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
@ -265,17 +285,17 @@ static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(
static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
uint64_t numer, const struct libdivide_u64_branchfree_t *denom);

static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t* denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t* denom);
static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);

static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(
const struct libdivide_s16_branchfree_t* denom);
const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(
const struct libdivide_u16_branchfree_t* denom);
const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(
const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(
@ -314,7 +334,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) {
}

static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64)
#if defined(LIBDIVIDE_X64_INTRINSICS)
return __umulh(x, y);
#elif defined(HAS_INT128_T)
__uint128_t xl = x, yl = y;
@ -340,7 +360,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
}

static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {
#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64)
#if defined(LIBDIVIDE_X64_INTRINSICS)
return __mulh(x, y);
#elif defined(HAS_INT128_T)
__int128_t xl = x, yl = y;
@ -393,7 +413,7 @@ static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) {

static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) {
#if defined(__AVR__)
// Fast way to count leading zeros
return __builtin_clzl(val);
#elif defined(__GNUC__) || __has_builtin(__builtin_clz)
// Fast way to count leading zeros
@ -442,7 +462,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) {
// uint {v}. The result must fit in 16 bits.
// Returns the quotient directly and the remainder in *r
static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16(
uint16_t u1, uint16_t u0, uint16_t v, uint16_t* r) {
uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) {
uint32_t n = ((uint32_t)u1 << 16) | u0;
uint16_t result = (uint16_t)(n / v);
*r = (uint16_t)(n - result * (uint32_t)v);
@ -512,7 +532,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(

// Check for overflow and divide by 0.
if (numhi >= den) {
if (r != NULL) *r = ~0ull;
if (r) *r = ~0ull;
return ~0ull;
}

@ -558,11 +578,14 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
q0 = (uint32_t)qhat;

// Return remainder if requested.
if (r != NULL) *r = (rem * b + num0 - q0 * den) >> shift;
if (r) *r = (rem * b + num0 - q0 * den) >> shift;
return ((uint64_t)q1 << 32) | q0;
#endif
}

#if !(defined(HAS_INT128_T) && \
defined(HAS_INT128_DIV))

// Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)
static LIBDIVIDE_INLINE void libdivide_u128_shift(
uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
@ -579,6 +602,8 @@ static LIBDIVIDE_INLINE void libdivide_u128_shift(
}
}

#endif

// Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64(
uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
@ -696,8 +721,7 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
// 1 in its recovery algorithm.
result.magic = 0;
result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
}
else {
} else {
uint8_t more;
uint16_t rem, proposed_m;
proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem);
@ -709,8 +733,7 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) {
// This power works
more = floor_log_2_d;
}
else {
} else {
// We have to use the general 17-bit algorithm. We need to compute
// (2**power) / d. However, we already have (2**(power-1))/d and
// its remainder. By doubling both, and then correcting the
@ -742,7 +765,7 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
}
struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1);
struct libdivide_u16_branchfree_t ret = {
tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK) };
tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)};
return ret;
}

@ -752,27 +775,25 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
if (!magic) {
return numer >> more;
}
else {
} else {
uint16_t q = libdivide_mullhi_u16(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) {
uint16_t t = ((numer - q) >> 1) + q;
return t >> (more & LIBDIVIDE_16_SHIFT_MASK);
}
else {
} else {
// All upper bits are 0,
// don't need to mask them off.
return q >> more;
}
}
}

uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) {
uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) {
return libdivide_u16_do_raw(numer, denom->magic, denom->more);
}

uint16_t libdivide_u16_branchfree_do(
uint16_t numer, const struct libdivide_u16_branchfree_t* denom) {
uint16_t numer, const struct libdivide_u16_branchfree_t *denom) {
uint16_t q = libdivide_mullhi_u16(denom->magic, numer);
uint16_t t = ((numer - q) >> 1) + q;
return t >> denom->more;
@ -800,7 +821,7 @@ uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) {
// overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
// then double the quotient and remainder.
uint32_t half_n = (uint32_t)1 << (16 + shift);
uint32_t d = ( (uint32_t)1 << 16) | denom->magic;
uint32_t d = ((uint32_t)1 << 16) | denom->magic;
// Note that the quotient is guaranteed <= 16 bits, but the remainder
// may need 17!
uint16_t half_q = (uint16_t)(half_n / d);
@ -912,12 +933,11 @@ struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
return ret;
}

uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
uint8_t more = denom->more;
if (!denom->magic) {
uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) {
if (!magic) {
return numer >> more;
} else {
uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
uint32_t q = libdivide_mullhi_u32(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) {
uint32_t t = ((numer - q) >> 1) + q;
return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
@ -929,6 +949,10 @@ uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
}
}

uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
return libdivide_u32_do_raw(numer, denom->magic, denom->more);
}
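The refactor splits each scalar "do" function into a _do_raw form taking the magic/more pair directly, with the original entry point reduced to a thin wrapper, so for a precomputed divider both calls agree (demo_do_raw is an illustrative harness):

#include <assert.h>
#include <stdint.h>
#include "third_party/libdivide.h"

static void demo_do_raw(void) {
    struct libdivide_u32_t d = libdivide_u32_gen(7);         // precompute magic and shift for /7
    uint32_t a = libdivide_u32_do(100, &d);                  // 14, via the wrapper
    uint32_t b = libdivide_u32_do_raw(100, d.magic, d.more); // 14, via the raw form
    assert(a == b);
}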

uint32_t libdivide_u32_branchfree_do(
uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
@ -1072,12 +1096,11 @@ struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) {
return ret;
}

uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
uint8_t more = denom->more;
if (!denom->magic) {
uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) {
if (!magic) {
return numer >> more;
} else {
uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
uint64_t q = libdivide_mullhi_u64(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) {
uint64_t t = ((numer - q) >> 1) + q;
return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
@ -1089,6 +1112,10 @@ uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
}
}

uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
return libdivide_u64_do_raw(numer, denom->magic, denom->more);
}

uint64_t libdivide_u64_branchfree_do(
uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {
uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
@ -1428,11 +1455,10 @@ struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
return result;
}

int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
uint8_t more = denom->more;
int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;

if (!denom->magic) {
if (!magic) {
uint32_t sign = (int8_t)more >> 7;
uint32_t mask = ((uint32_t)1 << shift) - 1;
uint32_t uq = numer + ((numer >> 31) & mask);
@ -1441,7 +1467,7 @@ int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
q = (q ^ sign) - sign;
return q;
} else {
uint32_t uq = (uint32_t)libdivide_mullhi_s32(denom->magic, numer);
uint32_t uq = (uint32_t)libdivide_mullhi_s32(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) {
// must be arithmetic shift and then sign extend
int32_t sign = (int8_t)more >> 7;
@ -1456,6 +1482,10 @@ int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
}
}

int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
return libdivide_s32_do_raw(numer, denom->magic, denom->more);
}

int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
@ -1597,11 +1627,10 @@ struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
return ret;
}

int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
uint8_t more = denom->more;
int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;

if (!denom->magic) { // shift path
if (!magic) { // shift path
uint64_t mask = ((uint64_t)1 << shift) - 1;
uint64_t uq = numer + ((numer >> 63) & mask);
int64_t q = (int64_t)uq;
@ -1611,7 +1640,7 @@ int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
q = (q ^ sign) - sign;
return q;
} else {
uint64_t uq = (uint64_t)libdivide_mullhi_s64(denom->magic, numer);
uint64_t uq = (uint64_t)libdivide_mullhi_s64(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) {
// must be arithmetic shift and then sign extend
int64_t sign = (int8_t)more >> 7;
@ -1626,6 +1655,10 @@ int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
}
}

int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
return libdivide_s64_do_raw(numer, denom->magic, denom->more);
}

int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
@ -1682,15 +1715,22 @@ int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t

// Simplest possible vector type division: treat the vector type as an array
// of underlying native type.
#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \
const size_t count = sizeof(VecT) / sizeof(IntT); \
VecT result; \
IntT *pSource = (IntT *)&numers; \
IntT *pTarget = (IntT *)&result; \
for (size_t loop=0; loop<count; ++loop) { \
pTarget[loop] = libdivide_##Algo##_do(pSource[loop], denom); \
} \
return result; \
//
// Use a union to read a vector via pointer-to-integer, without violating strict
// aliasing.
#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \
const size_t count = sizeof(VecT) / sizeof(IntT); \
union type_pun_vec { \
VecT vec; \
IntT arr[sizeof(VecT) / sizeof(IntT)]; \
}; \
union type_pun_vec result; \
union type_pun_vec input; \
input.vec = numers; \
for (size_t loop = 0; loop < count; ++loop) { \
result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \
} \
return result.vec;
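A stand-alone illustration of the union trick adopted above: lanes are read through a union member rather than by casting the vector's address to an integer pointer, which is what violated strict aliasing in the old macro (vec8x16_t and sum_lanes are stand-ins, not libdivide names):

#include <stddef.h>
#include <stdint.h>

typedef struct { uint16_t lane[8]; } vec8x16_t; // stand-in for uint16x8_t / __m128i

static uint16_t sum_lanes(vec8x16_t v) {
    union { vec8x16_t vec; uint16_t arr[8]; } pun; // reading arr after writing vec is defined in C
    pun.vec = v;
    uint16_t sum = 0;
    for (size_t i = 0; i < 8; ++i) {
        sum += pun.arr[i];
    }
    return sum;
}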

#if defined(LIBDIVIDE_NEON)

@ -1804,13 +1844,12 @@ static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64

////////// UINT16

uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom) {
SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)
}
uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom){
SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)}

uint16x8_t libdivide_u16_branchfree_do_vec128(uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom) {
SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)
}
uint16x8_t libdivide_u16_branchfree_do_vec128(
uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom){
SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)}

////////// UINT32

@ -1870,13 +1909,12 @@ uint64x2_t libdivide_u64_branchfree_do_vec128(

////////// SINT16

int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom) {
SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)
}
int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom){
SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)}

int16x8_t libdivide_s16_branchfree_do_vec128(int16x8_t numers, const struct libdivide_s16_branchfree_t *denom) {
SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)
}
int16x8_t libdivide_s16_branchfree_do_vec128(
int16x8_t numers, const struct libdivide_s16_branchfree_t *denom){
SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)}

////////// SINT32

@ -2082,13 +2120,12 @@ static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y

////////// UINT16

__m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom) {
SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)
}
__m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom){
SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)}

__m512i libdivide_u16_branchfree_do_vec512(__m512i numers, const struct libdivide_u16_branchfree_t *denom) {
SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)
}
__m512i libdivide_u16_branchfree_do_vec512(
__m512i numers, const struct libdivide_u16_branchfree_t *denom){
SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)}

////////// UINT32

@ -2146,13 +2183,12 @@ __m512i libdivide_u64_branchfree_do_vec512(

////////// SINT16

__m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom) {
SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)
}
__m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom){
SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)}

__m512i libdivide_s16_branchfree_do_vec512(__m512i numers, const struct libdivide_s16_branchfree_t *denom) {
SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)
}
__m512i libdivide_s16_branchfree_do_vec512(
__m512i numers, const struct libdivide_s16_branchfree_t *denom){
SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)}

////////// SINT32

@ -2365,11 +2401,25 @@ static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y
////////// UINT16

__m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) {
SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16)
uint8_t more = denom->more;
if (!denom->magic) {
return _mm256_srli_epi16(numers, more);
} else {
__m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
if (more & LIBDIVIDE_ADD_MARKER) {
__m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
} else {
return _mm256_srli_epi16(q, more);
}
}
}

__m256i libdivide_u16_branchfree_do_vec256(__m256i numers, const struct libdivide_u16_branchfree_t *denom) {
SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16_branchfree)
__m256i libdivide_u16_branchfree_do_vec256(
__m256i numers, const struct libdivide_u16_branchfree_t *denom) {
__m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
__m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
return _mm256_srli_epi16(t, denom->more);
}

////////// UINT32
@ -2429,11 +2479,54 @@ __m256i libdivide_u64_branchfree_do_vec256(
////////// SINT16

__m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) {
SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16)
uint8_t more = denom->more;
if (!denom->magic) {
uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
uint16_t mask = ((uint16_t)1 << shift) - 1;
__m256i roundToZeroTweak = _mm256_set1_epi16(mask);
// q = numer + ((numer >> 15) & roundToZeroTweak);
__m256i q = _mm256_add_epi16(
numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak));
q = _mm256_srai_epi16(q, shift);
__m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
// q = (q ^ sign) - sign;
q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);
return q;
} else {
__m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic));
if (more & LIBDIVIDE_ADD_MARKER) {
// must be arithmetic shift
__m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
// q += ((numer ^ sign) - sign);
q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign));
}
// q >>= shift
q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15)); // q += (q < 0)
return q;
}
}

__m256i libdivide_s16_branchfree_do_vec256(__m256i numers, const struct libdivide_s16_branchfree_t *denom) {
SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16_branchfree)
__m256i libdivide_s16_branchfree_do_vec256(
__m256i numers, const struct libdivide_s16_branchfree_t *denom) {
int16_t magic = denom->magic;
uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
// must be arithmetic shift
__m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
__m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic));
q = _mm256_add_epi16(q, numers); // q += numers

// If q is non-negative, we have nothing to do
// If q is negative, we want to add either (2**shift)-1 if d is
// a power of 2, or (2**shift) if it is not a power of 2
uint16_t is_power_of_2 = (magic == 0);
__m256i q_sign = _mm256_srai_epi16(q, 15); // q_sign = q >> 15
__m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask)
q = _mm256_srai_epi16(q, shift); // q >>= shift
q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign
return q;
}

////////// SINT32
@ -2661,11 +2754,25 @@ static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y
////////// UINT16

__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {
SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16)
uint8_t more = denom->more;
if (!denom->magic) {
return _mm_srli_epi16(numers, more);
} else {
__m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
if (more & LIBDIVIDE_ADD_MARKER) {
__m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
} else {
return _mm_srli_epi16(q, more);
}
}
}

__m128i libdivide_u16_branchfree_do_vec128(__m128i numers, const struct libdivide_u16_branchfree_t *denom) {
SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16_branchfree)
__m128i libdivide_u16_branchfree_do_vec128(
__m128i numers, const struct libdivide_u16_branchfree_t *denom) {
__m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
__m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
return _mm_srli_epi16(t, denom->more);
}

////////// UINT32
@ -2725,11 +2832,54 @@ __m128i libdivide_u64_branchfree_do_vec128(
////////// SINT16

__m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) {
SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16)
uint8_t more = denom->more;
if (!denom->magic) {
uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
uint16_t mask = ((uint16_t)1 << shift) - 1;
__m128i roundToZeroTweak = _mm_set1_epi16(mask);
// q = numer + ((numer >> 15) & roundToZeroTweak);
__m128i q =
_mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak));
q = _mm_srai_epi16(q, shift);
__m128i sign = _mm_set1_epi16((int8_t)more >> 7);
// q = (q ^ sign) - sign;
q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);
return q;
} else {
__m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic));
if (more & LIBDIVIDE_ADD_MARKER) {
// must be arithmetic shift
__m128i sign = _mm_set1_epi16((int8_t)more >> 7);
// q += ((numer ^ sign) - sign);
q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign));
}
// q >>= shift
q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
q = _mm_add_epi16(q, _mm_srli_epi16(q, 15)); // q += (q < 0)
return q;
}
}

__m128i libdivide_s16_branchfree_do_vec128(__m128i numers, const struct libdivide_s16_branchfree_t *denom) {
SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16_branchfree)
__m128i libdivide_s16_branchfree_do_vec128(
__m128i numers, const struct libdivide_s16_branchfree_t *denom) {
int16_t magic = denom->magic;
uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
// must be arithmetic shift
__m128i sign = _mm_set1_epi16((int8_t)more >> 7);
__m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic));
q = _mm_add_epi16(q, numers); // q += numers

// If q is non-negative, we have nothing to do
// If q is negative, we want to add either (2**shift)-1 if d is
// a power of 2, or (2**shift) if it is not a power of 2
uint16_t is_power_of_2 = (magic == 0);
__m128i q_sign = _mm_srai_epi16(q, 15); // q_sign = q >> 15
__m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
q = _mm_srai_epi16(q, shift); // q >>= shift
q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
return q;
}

////////// SINT32
@ -2795,8 +2945,8 @@ __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *de
uint64_t mask = ((uint64_t)1 << shift) - 1;
__m128i roundToZeroTweak = _mm_set1_epi64x(mask);
// q = numer + ((numer >> 63) & roundToZeroTweak);
__m128i q =
_mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
__m128i q = _mm_add_epi64(
numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
q = libdivide_s64_shift_right_vec128(q, shift);
__m128i sign = _mm_set1_epi32((int8_t)more >> 7);
// q = (q ^ sign) - sign;
@ -2847,49 +2997,80 @@ __m128i libdivide_s64_branchfree_do_vec128(

#ifdef __cplusplus

//for constexpr zero initialization,
//c++11 might handle things ok,
//but just limit to at least c++14 to ensure
//we don't break anyone's code:

// for gcc and clang, use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr
#if (defined(__GNUC__) || defined(__clang__)) && (__cpp_constexpr >= 201304L)
#define LIBDIVIDE_CONSTEXPR constexpr

// supposedly, MSVC might not implement feature test macros right (https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c)
// so check that _MSVC_LANG corresponds to at least c++14, and _MSC_VER corresponds to at least VS 2017 15.0 (for extended constexpr support https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170)
#elif defined(_MSC_VER) && _MSC_VER >= 1910 && defined(_MSVC_LANG) && _MSVC_LANG >=201402L
#define LIBDIVIDE_CONSTEXPR constexpr

// in case some other obscure compiler has the right __cpp_constexpr :
#elif defined(__cpp_constexpr) && __cpp_constexpr >= 201304L
#define LIBDIVIDE_CONSTEXPR constexpr

#else
#define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE
#endif

enum Branching {
BRANCHFULL, // use branching algorithms
BRANCHFREE // use branchfree algorithms
};

namespace detail {
enum Signedness {
SIGNED,
UNSIGNED,
};

#if defined(LIBDIVIDE_NEON)
// Helper to deduce NEON vector type for integral type.
template <typename T>
struct NeonVecFor {};
template <int _WIDTH, Signedness _SIGN>
struct NeonVec {};

template <>
struct NeonVecFor<uint16_t> {
struct NeonVec<16, UNSIGNED> {
typedef uint16x8_t type;
};

template <>
struct NeonVecFor<int16_t> {
struct NeonVec<16, SIGNED> {
typedef int16x8_t type;
};

template <>
struct NeonVecFor<uint32_t> {
struct NeonVec<32, UNSIGNED> {
typedef uint32x4_t type;
};

template <>
struct NeonVecFor<int32_t> {
struct NeonVec<32, SIGNED> {
typedef int32x4_t type;
};

template <>
struct NeonVecFor<uint64_t> {
struct NeonVec<64, UNSIGNED> {
typedef uint64x2_t type;
};

template <>
struct NeonVecFor<int64_t> {
struct NeonVec<64, SIGNED> {
typedef int64x2_t type;
};
#endif

// Versions of our algorithms for SIMD.
#if defined(LIBDIVIDE_NEON)
template <typename T>
struct NeonVecFor {
// See 'class divider' for an explanation of these template parameters.
typedef typename NeonVec<sizeof(T) * 8, (((T)0 >> 0) > (T)(-1) ? SIGNED : UNSIGNED)>::type type;
};

#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \
LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \
typename NeonVecFor<INT_TYPE>::type n) const { \
@ -2898,6 +3079,7 @@ struct NeonVecFor<int64_t> {
#else
#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)
#endif

#if defined(LIBDIVIDE_SSE2)
#define LIBDIVIDE_DIVIDE_SSE2(ALGO) \
LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \
@ -2930,6 +3112,7 @@ struct NeonVecFor<int64_t> {
#define DISPATCHER_GEN(T, ALGO) \
libdivide_##ALGO##_t denom; \
LIBDIVIDE_INLINE dispatcher() {} \
explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {} \
LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \
LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \
LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \
@ -2939,66 +3122,81 @@ struct NeonVecFor<int64_t> {
LIBDIVIDE_DIVIDE_AVX512(ALGO)

// The dispatcher selects a specific division algorithm for a given
// type and ALGO using partial template specialization.
template <typename _IntT, Branching ALGO>
// width, signedness, and ALGO using partial template specialization.
template <int _WIDTH, Signedness _SIGN, Branching _ALGO>
struct dispatcher {};

template <>
struct dispatcher<int16_t, BRANCHFULL> {
struct dispatcher<16, SIGNED, BRANCHFULL> {
DISPATCHER_GEN(int16_t, s16)
};
template <>
struct dispatcher<int16_t, BRANCHFREE> {
struct dispatcher<16, SIGNED, BRANCHFREE> {
DISPATCHER_GEN(int16_t, s16_branchfree)
};
template <>
struct dispatcher<uint16_t, BRANCHFULL> {
struct dispatcher<16, UNSIGNED, BRANCHFULL> {
DISPATCHER_GEN(uint16_t, u16)
};
template <>
struct dispatcher<uint16_t, BRANCHFREE> {
struct dispatcher<16, UNSIGNED, BRANCHFREE> {
DISPATCHER_GEN(uint16_t, u16_branchfree)
};
template <>
struct dispatcher<int32_t, BRANCHFULL> {
struct dispatcher<32, SIGNED, BRANCHFULL> {
DISPATCHER_GEN(int32_t, s32)
};
template <>
struct dispatcher<int32_t, BRANCHFREE> {
struct dispatcher<32, SIGNED, BRANCHFREE> {
DISPATCHER_GEN(int32_t, s32_branchfree)
};
template <>
struct dispatcher<uint32_t, BRANCHFULL> {
struct dispatcher<32, UNSIGNED, BRANCHFULL> {
DISPATCHER_GEN(uint32_t, u32)
};
template <>
struct dispatcher<uint32_t, BRANCHFREE> {
struct dispatcher<32, UNSIGNED, BRANCHFREE> {
DISPATCHER_GEN(uint32_t, u32_branchfree)
};
template <>
struct dispatcher<int64_t, BRANCHFULL> {
struct dispatcher<64, SIGNED, BRANCHFULL> {
DISPATCHER_GEN(int64_t, s64)
};
template <>
struct dispatcher<int64_t, BRANCHFREE> {
struct dispatcher<64, SIGNED, BRANCHFREE> {
DISPATCHER_GEN(int64_t, s64_branchfree)
};
template <>
struct dispatcher<uint64_t, BRANCHFULL> {
struct dispatcher<64, UNSIGNED, BRANCHFULL> {
DISPATCHER_GEN(uint64_t, u64)
};
template <>
struct dispatcher<uint64_t, BRANCHFREE> {
struct dispatcher<64, UNSIGNED, BRANCHFREE> {
DISPATCHER_GEN(uint64_t, u64_branchfree)
};
} // namespace detail

#if defined(LIBDIVIDE_NEON)
// Allow NeonVecFor outside of detail namespace.
template <typename T>
struct NeonVecFor {
typedef typename detail::NeonVecFor<T>::type type;
};
#endif

// This is the main divider class for use by the user (C++ API).
// The actual division algorithm is selected using the dispatcher struct
// based on the integer and algorithm template parameters.
// based on the integer width and algorithm template parameters.
template <typename T, Branching ALGO = BRANCHFULL>
class divider {
private:
typedef dispatcher<T, ALGO> dispatcher_t;
// Dispatch based on the size and signedness.
// We avoid using type_traits as it's not available in AVR.
// Detect signedness by checking if T(-1) is less than T(0).
// Also throw in a shift by 0, which prevents floating point types from being passed.
typedef detail::dispatcher<sizeof(T) * 8,
(((T)0 >> 0) > (T)(-1) ? detail::SIGNED : detail::UNSIGNED), ALGO>
dispatcher_t;

public:
// We leave the default constructor empty so that creating
@ -3006,6 +3204,9 @@ class divider {
// later doesn't slow us down.
divider() {}

// constexpr zero-initialization to allow for use w/ static constinit
explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {}

// Constructor that takes the divisor as a parameter
LIBDIVIDE_INLINE divider(T d) : div(d) {}

@ -3017,7 +3218,7 @@ class divider {
T recover() const { return div.recover(); }

bool operator==(const divider<T, ALGO> &other) const {
return div.denom.magic == other.denom.magic && div.denom.more == other.denom.more;
return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more;
}

bool operator!=(const divider<T, ALGO> &other) const { return !(*this == other); }
@ -3098,12 +3299,14 @@ LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) {

#if defined(LIBDIVIDE_NEON)
template <typename T, Branching ALGO>
LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(
typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
return div.divide(n);
}

template <typename T, Branching ALGO>
LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(
typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
n = div.divide(n);
return n;
}

3
util.c
@ -6,6 +6,8 @@

#ifdef __ANDROID__
#include <async_safe/log.h>
int mallopt(int param, int value);
#define M_BIONIC_RESTORE_DEFAULT_SIGABRT_HANDLER (-1003)
#endif

#include "util.h"
@ -30,6 +32,7 @@ static int write_full(int fd, const char *buf, size_t length) {

COLD noreturn void fatal_error(const char *s) {
#ifdef __ANDROID__
mallopt(M_BIONIC_RESTORE_DEFAULT_SIGABRT_HANDLER, 0);
async_safe_fatal("hardened_malloc: fatal allocator error: %s", s);
#else
const char *prefix = "fatal allocator error: ";

18
util.h
@ -9,7 +9,9 @@
#define noreturn __attribute__((noreturn))

#define likely(x) __builtin_expect(!!(x), 1)
#define likely51(x) __builtin_expect_with_probability(!!(x), 1, 0.51)
#define unlikely(x) __builtin_expect(!!(x), 0)
#define unlikely51(x) __builtin_expect_with_probability(!!(x), 0, 0.51)
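likely51/unlikely51 are what the MTE branches above use (e.g. `if (likely51(is_memtag_enabled()))`): a probability of 0.51 marks the branch as only barely biased toward taken, so the compiler keeps both paths in the hot layout instead of treating one as cold. A small sketch (pick_path is illustrative):

#include <stdbool.h>

#define likely51(x) __builtin_expect_with_probability(!!(x), 1, 0.51)

// 0.51 hints "taken slightly more often than not": unlike likely(), it avoids
// telling the compiler to move the other side out of line.
static int pick_path(bool memtag_enabled) {
    if (likely51(memtag_enabled)) {
        return 1; // tagged path
    }
    return 0;     // untagged path
}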

#define min(x, y) ({ \
__typeof__(x) _x = (x); \
@ -57,6 +59,22 @@ static inline size_t align(size_t size, size_t align) {
return (size + mask) & ~mask;
}

// u4_arr_{set,get} are helper functions for using u8 array as an array of unsigned 4-bit values.

// val is treated as a 4-bit value
static inline void u4_arr_set(u8 *arr, size_t idx, u8 val) {
size_t off = idx >> 1;
size_t shift = (idx & 1) << 2;
u8 mask = (u8) (0xf0 >> shift);
arr[off] = (arr[off] & mask) | (val << shift);
}

static inline u8 u4_arr_get(const u8 *arr, size_t idx) {
size_t off = idx >> 1;
size_t shift = (idx & 1) << 2;
return (u8) ((arr[off] >> shift) & 0xf);
}
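A quick check of the nibble layout these helpers implement, using u8 as defined in util.h: even indices occupy the low nibble of each byte and odd indices the high nibble, which is how per-slot 4-bit MTE tags (TAG_WIDTH 4) can be stored at half a byte each (u4_arr_demo is illustrative):

#include <assert.h>

static void u4_arr_demo(void) {
    u8 arr[2] = {0};         // holds four 4-bit entries
    u4_arr_set(arr, 0, 0x9); // low nibble of arr[0]
    u4_arr_set(arr, 1, 0x4); // high nibble of arr[0]
    assert(arr[0] == 0x49);
    assert(u4_arr_get(arr, 0) == 0x9);
    assert(u4_arr_get(arr, 1) == 0x4);
}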

COLD noreturn void fatal_error(const char *s);

#if CONFIG_SEAL_METADATA