Skip to content

[AArch64] Miscompilation of struct containing complex double and BitInt #89230

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
ostannard opened this issue Apr 18, 2024 · 4 comments
Closed

Comments

@ostannard
Copy link
Collaborator

This code is miscompiled when targeting AArch64 at all optimisation levels:

extern void __aeabi_assert(const char *, const char *, int);
#define assert(e) ((e) ? (void)0 : __aeabi_assert(#e, __FILE__, __LINE__))
#include <complex.h>

union S131 {
  double _Complex M0 __attribute__((aligned(16)));
  signed _BitInt(124) M1;
};

void F94(union S131 P3) {
  assert(P3.M0 == 1.0 + 2.0 * _Complex_I);
}

int main() {
  union S131 P3 = {1.0 + 2.0 * _Complex_I};
  F94(P3);

  return 0;
}

I think that the IR generated by clang is wrong, before any optimisations:

$ /work/llvm/build/bin/clang --target=aarch64--none-elf -march=armv8-a -c test.c -o - -S -O0 -emit-llvm
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-none-elf"

%union.S131 = type { i124 }

@.str = private unnamed_addr constant [21 x i8] c"P3.M0 == 1.0 + 2.0if\00", align 1
@.str.1 = private unnamed_addr constant [7 x i8] c"test.c\00", align 1
@__const.main.P3 = private unnamed_addr constant { { double, double } } { { double, double } { double 1.000000e+00, double 2.000000e+00 } }, align 16

; Function Attrs: noinline nounwind optnone
define dso_local void @F94(i128 %P3.coerce) #0 {
entry:
  %P3 = alloca %union.S131, align 16
  %coerce.dive = getelementptr inbounds %union.S131, ptr %P3, i32 0, i32 0
  %coerce.val.ii = trunc i128 %P3.coerce to i124
  store i124 %coerce.val.ii, ptr %coerce.dive, align 16
  %P3.realp = getelementptr inbounds { double, double }, ptr %P3, i32 0, i32 0
  %P3.real = load double, ptr %P3.realp, align 16
  %P3.imagp = getelementptr inbounds { double, double }, ptr %P3, i32 0, i32 1
  %P3.imag = load double, ptr %P3.imagp, align 8
  %cmp.r = fcmp oeq double %P3.real, 1.000000e+00
  %cmp.i = fcmp oeq double %P3.imag, 2.000000e+00
  %and.ri = and i1 %cmp.r, %cmp.i
  br i1 %and.ri, label %cond.true, label %cond.false

cond.true:                                        ; preds = %entry
  br label %cond.end

cond.false:                                       ; preds = %entry
  call void @__aeabi_assert(ptr noundef @.str, ptr noundef @.str.1, i32 noundef 10)
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  ret void
}

declare dso_local void @__aeabi_assert(ptr noundef, ptr noundef, i32 noundef) #1

; Function Attrs: noinline nounwind optnone
define dso_local i32 @main() #0 {
entry:
  %retval = alloca i32, align 4
  %P3 = alloca %union.S131, align 16
  store i32 0, ptr %retval, align 4
  call void @llvm.memcpy.p0.p0.i64(ptr align 16 %P3, ptr align 16 @__const.main.P3, i64 16, i1 false)
  %coerce.dive = getelementptr inbounds %union.S131, ptr %P3, i32 0, i32 0
  %0 = load i124, ptr %coerce.dive, align 16
  %coerce.val.ii = zext i124 %0 to i128
  call void @F94(i128 %coerce.val.ii)
  ret i32 0
}

; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2

attributes #0 = { noinline nounwind optnone "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a" }
attributes #1 = { "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a" }
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"frame-pointer", i32 1}
!2 = !{!"clang version 19.0.0git (git@github.com:llvm/llvm-project.git 743d090b96e09fe7c2cea60a8962f579684c37ce)"}

This is using %union.S131, which is a 124-bit type, to store the value of P3, but it needs 128 bits of storage to hold the double _Complex value. The memcpy in main copies 16 bytes into an allocation which is only 15 bytes.

@llvmbot
Copy link
Member

llvmbot commented Apr 18, 2024

@llvm/issue-subscribers-backend-aarch64

Author: None (ostannard)

This code is miscompiled when targeting AArch64 at all optimisation levels:
extern void __aeabi_assert(const char *, const char *, int);
#define assert(e) ((e) ? (void)0 : __aeabi_assert(#e, __FILE__, __LINE__))
#include &lt;complex.h&gt;

union S131 {
  double _Complex M0 __attribute__((aligned(16)));
  signed _BitInt(124) M1;
};

void F94(union S131 P3) {
  assert(P3.M0 == 1.0 + 2.0 * _Complex_I);
}

int main() {
  union S131 P3 = {1.0 + 2.0 * _Complex_I};
  F94(P3);

  return 0;
}

I think that the IR generated by clang is wrong, before any optimisations:

$ /work/llvm/build/bin/clang --target=aarch64--none-elf -march=armv8-a -c test.c -o - -S -O0 -emit-llvm
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-none-elf"

%union.S131 = type { i124 }

@.str = private unnamed_addr constant [21 x i8] c"P3.M0 == 1.0 + 2.0if\00", align 1
@.str.1 = private unnamed_addr constant [7 x i8] c"test.c\00", align 1
@<!-- -->__const.main.P3 = private unnamed_addr constant { { double, double } } { { double, double } { double 1.000000e+00, double 2.000000e+00 } }, align 16

; Function Attrs: noinline nounwind optnone
define dso_local void @<!-- -->F94(i128 %P3.coerce) #<!-- -->0 {
entry:
  %P3 = alloca %union.S131, align 16
  %coerce.dive = getelementptr inbounds %union.S131, ptr %P3, i32 0, i32 0
  %coerce.val.ii = trunc i128 %P3.coerce to i124
  store i124 %coerce.val.ii, ptr %coerce.dive, align 16
  %P3.realp = getelementptr inbounds { double, double }, ptr %P3, i32 0, i32 0
  %P3.real = load double, ptr %P3.realp, align 16
  %P3.imagp = getelementptr inbounds { double, double }, ptr %P3, i32 0, i32 1
  %P3.imag = load double, ptr %P3.imagp, align 8
  %cmp.r = fcmp oeq double %P3.real, 1.000000e+00
  %cmp.i = fcmp oeq double %P3.imag, 2.000000e+00
  %and.ri = and i1 %cmp.r, %cmp.i
  br i1 %and.ri, label %cond.true, label %cond.false

cond.true:                                        ; preds = %entry
  br label %cond.end

cond.false:                                       ; preds = %entry
  call void @<!-- -->__aeabi_assert(ptr noundef @.str, ptr noundef @.str.1, i32 noundef 10)
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  ret void
}

declare dso_local void @<!-- -->__aeabi_assert(ptr noundef, ptr noundef, i32 noundef) #<!-- -->1

; Function Attrs: noinline nounwind optnone
define dso_local i32 @<!-- -->main() #<!-- -->0 {
entry:
  %retval = alloca i32, align 4
  %P3 = alloca %union.S131, align 16
  store i32 0, ptr %retval, align 4
  call void @<!-- -->llvm.memcpy.p0.p0.i64(ptr align 16 %P3, ptr align 16 @<!-- -->__const.main.P3, i64 16, i1 false)
  %coerce.dive = getelementptr inbounds %union.S131, ptr %P3, i32 0, i32 0
  %0 = load i124, ptr %coerce.dive, align 16
  %coerce.val.ii = zext i124 %0 to i128
  call void @<!-- -->F94(i128 %coerce.val.ii)
  ret i32 0
}

; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
declare void @<!-- -->llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #<!-- -->2

attributes #<!-- -->0 = { noinline nounwind optnone "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a" }
attributes #<!-- -->1 = { "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a" }
attributes #<!-- -->2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"frame-pointer", i32 1}
!2 = !{!"clang version 19.0.0git (git@<!-- -->github.com:llvm/llvm-project.git 743d090b96e09fe7c2cea60a8962f579684c37ce)"}

This is using %union.S131, which is a 124-bit type, to store the value of P3, but it needs 128 bits of storage to hold the double _Complex value. The memcpy in main copies 16 bytes into an allocation which is only 15 bytes.

@dc03-work
Copy link
Contributor

This seems reproducible on x86 clang as well, so I don't think its a backend issue: https://godbolt.org/z/8c8deKPzq.

@efriedma-quic
Copy link
Collaborator

There's no such thing as an allocation of 124 bits. An alloca's size is always a number in bytes, that size is always a multiple of the type's alignment, and it's legal to read and write the padding.

So the allocation is 128 bits, i.e. 16 bytes, and the memcpy is fine.

(It's possible there's some optimization in LLVM that doesn't handle this construct correctly, but that would be a separate issue. See also #64081.)

@Fznamznon
Copy link
Contributor

Note, after #91364 it is allocating i128 https://godbolt.org/z/zfxvPEf3o .

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

6 participants