-
Notifications
You must be signed in to change notification settings - Fork 13.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Fix nofp regressions in compiler-rt and libunwind #111235
Conversation
@llvm/pr-subscribers-libunwind Author: Keith Packard (keith-packard) ChangesThese two libraries don't build for Full diff: https://github.com/llvm/llvm-project/pull/111235.diff 4 Files Affected:
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
index 0318d9a6f1ebd2..72d87fb4fa8586 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
@@ -6,7 +6,7 @@
#include "../assembly.h"
-#ifdef __aarch64__
+#if defined(__aarch64__) && __ARM_FP != 0
#define L(l) .L ## l
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
index 315490e73ea2b1..92fb953c03a376 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
@@ -1,5 +1,82 @@
#include <stddef.h>
+#if __ARM_FP == 0
+// WARNING: When building the scalar versions of these functions you need to
+// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang
+// from recognising a loop idiom and planting calls to memcpy!
+
+static void *__arm_sc_memcpy_fwd(void *dest, const void *src,
+ size_t n) __arm_streaming_compatible {
+ unsigned char *destp = (unsigned char *)dest;
+ const unsigned char *srcp = (const unsigned char *)src;
+ for (size_t i = 0; i < n; ++i)
+ destp[i] = srcp[i];
+
+ return dest;
+}
+
+// If dest and src overlap then behaviour is undefined, hence we can add the
+// restrict keywords here. This also matches the definition of the libc memcpy
+// according to the man page.
+void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src,
+ size_t n) __arm_streaming_compatible {
+ return __arm_sc_memcpy_fwd(dest, src, n);
+}
+
+void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible {
+ unsigned char *destp = (unsigned char *)dest;
+ unsigned char c8 = (unsigned char)c;
+ for (size_t i = 0; i < n; ++i)
+ destp[i] = c8;
+
+ return dest;
+}
+
+static void *__arm_sc_memcpy_rev(void *dest, const void *src,
+ size_t n) __arm_streaming_compatible {
+ unsigned char *destp = (unsigned char *)dest;
+ const unsigned char *srcp = (const unsigned char *)src;
+ // TODO: Improve performance by copying larger chunks in reverse, or by
+ // using SVE.
+ while (n > 0) {
+ --n;
+ destp[n] = srcp[n];
+ }
+ return dest;
+}
+
+// Semantically a memmove is equivalent to the following:
+// 1. Copy the entire contents of src to a temporary array that does not
+// overlap with src or dest.
+// 2. Copy the contents of the temporary array into dest.
+void *__arm_sc_memmove(void *dest, const void *src,
+ size_t n) __arm_streaming_compatible {
+ unsigned char *destp = (unsigned char *)dest;
+ const unsigned char *srcp = (const unsigned char *)src;
+
+ // If src and dest don't overlap then just invoke memcpy
+ if ((srcp > (destp + n)) || (destp > (srcp + n)))
+ return __arm_sc_memcpy_fwd(dest, src, n);
+
+ // Overlap case 1:
+ // src: Low | -> | High
+ // dest: Low | -> | High
+ // Here src is always ahead of dest at a higher addres. If we first read a
+ // chunk of data from src we can safely write the same chunk to dest without
+ // corrupting future reads of src.
+ if (srcp > destp)
+ return __arm_sc_memcpy_fwd(dest, src, n);
+
+ // Overlap case 2:
+ // src: Low | -> | High
+ // dest: Low | -> | High
+ // While we're in the overlap region we're always corrupting future reads of
+ // src when writing to dest. An efficient way to do this is to copy the data
+ // in reverse by starting at the highest address.
+ return __arm_sc_memcpy_rev(dest, src, n);
+}
+#endif
+
const void *__arm_sc_memchr(const void *src, int c,
size_t n) __arm_streaming_compatible {
const unsigned char *srcp = (const unsigned char *)src;
diff --git a/libunwind/src/UnwindRegistersRestore.S b/libunwind/src/UnwindRegistersRestore.S
index 180a66582f41b5..13a40d080c4c2d 100644
--- a/libunwind/src/UnwindRegistersRestore.S
+++ b/libunwind/src/UnwindRegistersRestore.S
@@ -633,6 +633,12 @@ Lnovec:
.arch_extension gcs
#endif
+#if defined(__ARM_FP) && __ARM_FP != 0
+#define LDP(a,b,r,o,p) stp a, b, [r, o]
+#else
+#define LDP(a,b,r,o,p) ldr a, [r, o] ; ldr b, [r, p]
+#endif
+
//
// extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *);
//
@@ -642,23 +648,24 @@ Lnovec:
.p2align 2
DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto)
// skip restore of x0,x1 for now
- ldp x2, x3, [x0, #0x010]
- ldp x4, x5, [x0, #0x020]
- ldp x6, x7, [x0, #0x030]
- ldp x8, x9, [x0, #0x040]
- ldp x10,x11, [x0, #0x050]
- ldp x12,x13, [x0, #0x060]
- ldp x14,x15, [x0, #0x070]
+ LDP(x2, x3, x0, #0x010, #0x018)
+ LDP(x4, x5, x0, #0x020, #0x028)
+ LDP(x6, x7, x0, #0x030, #0x038)
+ LDP(x8, x9, x0, #0x040, #0x048)
+ LDP(x10, x11, x0, #0x050, #0x058)
+ LDP(x12, x13, x0, #0x060, #0x068)
+ LDP(x14, x15, x0, #0x070, #0x078)
// x16 and x17 were clobbered by the call into the unwinder, so no point in
// restoring them.
- ldp x18,x19, [x0, #0x090]
- ldp x20,x21, [x0, #0x0A0]
- ldp x22,x23, [x0, #0x0B0]
- ldp x24,x25, [x0, #0x0C0]
- ldp x26,x27, [x0, #0x0D0]
- ldp x28,x29, [x0, #0x0E0]
+ LDP(x18, x19, x0, #0x090, #0x098)
+ LDP(x20, x21, x0, #0x0A0, #0x0A8)
+ LDP(x22, x23, x0, #0x0B0, #0x0B8)
+ LDP(x24, x25, x0, #0x0C0, #0x0C8)
+ LDP(x26, x27, x0, #0x0D0, #0x0D8)
+ LDP(x28, x29, x0, #0x0E0, #0x0E8)
ldr x30, [x0, #0x100] // restore pc into lr
+#if defined(__ARM_FP) && __ARM_FP != 0
ldp d0, d1, [x0, #0x110]
ldp d2, d3, [x0, #0x120]
ldp d4, d5, [x0, #0x130]
@@ -676,13 +683,14 @@ DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto)
ldp d28,d29, [x0, #0x1F0]
ldr d30, [x0, #0x200]
ldr d31, [x0, #0x208]
+#endif
// Finally, restore sp. This must be done after the last read from the
// context struct, because it is allocated on the stack, and an exception
// could clobber the de-allocated portion of the stack after sp has been
// restored.
ldr x16, [x0, #0x0F8]
- ldp x0, x1, [x0, #0x000] // restore x0,x1
+ LDP(x0, x1, x0, #0x000, #0x008) // restore x0,x1
mov sp,x16 // restore sp
#if defined(__ARM_FEATURE_GCS_DEFAULT)
// If GCS is enabled we need to push the address we're returning to onto the
diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S
index fab234fcd6f318..922469bc11aa22 100644
--- a/libunwind/src/UnwindRegistersSave.S
+++ b/libunwind/src/UnwindRegistersSave.S
@@ -718,6 +718,12 @@ LnoR2Fix:
#elif defined(__aarch64__)
+#if defined(__ARM_FP) && __ARM_FP != 0
+#define STP(a,b,r,o,p) stp a, b, [r, o]
+#else
+#define STP(a,b,r,o,p) str a, [r, o] ; str b, [r, p]
+#endif
+
//
// extern int __unw_getcontext(unw_context_t* thread_state)
//
@@ -726,21 +732,21 @@ LnoR2Fix:
//
.p2align 2
DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
- stp x0, x1, [x0, #0x000]
- stp x2, x3, [x0, #0x010]
- stp x4, x5, [x0, #0x020]
- stp x6, x7, [x0, #0x030]
- stp x8, x9, [x0, #0x040]
- stp x10,x11, [x0, #0x050]
- stp x12,x13, [x0, #0x060]
- stp x14,x15, [x0, #0x070]
- stp x16,x17, [x0, #0x080]
- stp x18,x19, [x0, #0x090]
- stp x20,x21, [x0, #0x0A0]
- stp x22,x23, [x0, #0x0B0]
- stp x24,x25, [x0, #0x0C0]
- stp x26,x27, [x0, #0x0D0]
- stp x28,x29, [x0, #0x0E0]
+ STP(x0, x1, x0, #0x000, #0x008)
+ STP(x2, x3, x0, #0x010, #0x018)
+ STP(x4, x5, x0, #0x020, #0x028)
+ STP(x6, x7, x0, #0x030, #0x038)
+ STP(x8, x9, x0, #0x040, #0x048)
+ STP(x10, x11, x0, #0x050, #0x058)
+ STP(x12, x13, x0, #0x060, #0x068)
+ STP(x14, x15, x0, #0x070, #0x078)
+ STP(x16, x17, x0, #0x080, #0x088)
+ STP(x18, x19, x0, #0x090, #0x098)
+ STP(x20, x21, x0, #0x0A0, #0x0A8)
+ STP(x22, x23, x0, #0x0B0, #0x0B8)
+ STP(x24, x25, x0, #0x0C0, #0x0C8)
+ STP(x26, x27, x0, #0x0D0, #0x0D8)
+ STP(x28, x29, x0, #0x0E0, #0x0E8)
str x30, [x0, #0x0F0]
mov x1,sp
str x1, [x0, #0x0F8]
@@ -763,6 +769,7 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
stp d28,d29, [x0, #0x1F0]
str d30, [x0, #0x200]
str d31, [x0, #0x208]
+#endif
mov x0, #0 // return UNW_ESUCCESS
ret
|
8ba3fed
to
4177bf1
Compare
4177bf1
to
43aa8d2
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
libunwind changes LGTM, not quite sure about the compiler-rt part.
43aa8d2
to
afdaba6
Compare
afdaba6
to
474ae35
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, just one comment.
@@ -238,7 +236,8 @@ END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy) | |||
|
|||
DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy) | |||
|
|||
|
|||
// This version uses FP registers. Use this only on targets with them | |||
#if defined(__aarch64__) && __ARM_FP != 0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the defined(__aarch64__)
needed here? The file is in the aarch64 subdirectory so I would assume that is always true?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#if defined(__aarch64__) && __ARM_FP != 0 | |
#if __ARM_FP != 0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure; the previous version protected this with #ifdef __aarch64__
, so I just kept that in place. Seemed odd to me as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@keith-packard I noticed that your patch has already been approved. Could you please proceed with submitting it upstream at your earliest convenience?
Thank you for your contribution and for helping to improve the project.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've rebased today; what else would I need to do to "submit it upstream"?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nothing else needed - I will click the merge button once CI completes. This will squash the commits, but I think that should be fine since they are both quite small. Is that okay or would you prefer to have them separate in the commit history?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
squashed is just fine with me; I like to keep things split apart for review, but after that, I have no strong opinions.
Skip save/restore of FPU registers on targets without them. Signed-off-by: Keith Packard <keithp@keithp.com>
Fall back to the old C implementation of __arm_sc_memset when the target doesn't have an FPU. Signed-off-by: Keith Packard <keithp@keithp.com>
474ae35
to
90d4f6a
Compare
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/55/builds/3403 Here is the relevant piece of the build log for the reference
|
These two libraries don't build for `-march=armv8-a+nofp -mabi=aapcs-soft` as a couple of uses of floating point instructions and registers have crept in. In libunwind, skip save/restore of FPU registers on targets without them. In compiler-rt, fall back to the old C implementation of __arm_sc_memset when the target doesn't have an FPU. --------- Signed-off-by: Keith Packard <keithp@keithp.com>
These two libraries don't build for
-march=armv8-a+nofp -mabi=aapcs-soft
as a couple of uses of floating point instructions and registers have crept in.