Commit db8f6c0

hstk30 authored and davemgreen committed
[AArch64] Fix arm neon vstx lane memVT size
The StN lane memory size was set too big, which caused alias analysis to go wrong. Fixes llvm#64696. Differential Revision: https://reviews.llvm.org/D158611
1 parent: 003cf29
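
Why this matters: the stN-lane intrinsics store only one lane from each source register, but getTgtMemIntrinsic previously set memVT as if the entire registers were written. When the claimed access size exceeds the underlying object (a 2-byte alloca in the regression test), BasicAA's assumption that accesses stay within their object is violated, and it can report no-alias for locations that do overlap, letting loads be scheduled above the store. A rough reconstruction of the affected pattern as C++ NEON intrinsics (a sketch modeled on the new regression test below, not the exact reproducer from llvm#64696):

    #include <arm_neon.h>
    #include <cstdint>

    int test_vst2_lane_u8(uint8x8_t a, uint8x8_t b) {
      uint8_t temp[2];
      uint8x8x2_t v = {{a, b}};
      // st2 lane 6 writes exactly 2 bytes to temp; the old memVT claimed a
      // 16-byte store, larger than the 2-byte buffer, which is what misled
      // alias analysis into hoisting the loads below above the store.
      vst2_lane_u8(temp, v, 6);
      return (temp[0] != vget_lane_u8(a, 6)) | (temp[1] != vget_lane_u8(b, 6));
    }

With the fix, the store's MachineMemOperand is 2 bytes and the loads stay ordered after it.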

4 files changed: +185 -17 lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+42 -8
@@ -13900,17 +13900,31 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::aarch64_neon_ld4:
   case Intrinsic::aarch64_neon_ld1x2:
   case Intrinsic::aarch64_neon_ld1x3:
-  case Intrinsic::aarch64_neon_ld1x4:
+  case Intrinsic::aarch64_neon_ld1x4: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
+    Info.offset = 0;
+    Info.align.reset();
+    // volatile loads with NEON intrinsics not supported
+    Info.flags = MachineMemOperand::MOLoad;
+    return true;
+  }
   case Intrinsic::aarch64_neon_ld2lane:
   case Intrinsic::aarch64_neon_ld3lane:
   case Intrinsic::aarch64_neon_ld4lane:
   case Intrinsic::aarch64_neon_ld2r:
   case Intrinsic::aarch64_neon_ld3r:
   case Intrinsic::aarch64_neon_ld4r: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
-    // Conservatively set memVT to the entire set of vectors loaded.
-    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
-    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    // ldN/ldNlane/ldNr return a struct of N vectors of the same type.
+    Type *RetTy = I.getType();
+    auto *StructTy = cast<StructType>(RetTy);
+    unsigned NumElts = StructTy->getNumElements();
+    Type *VecTy = StructTy->getElementType(0);
+    MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
     Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
     Info.offset = 0;
     Info.align.reset();
@@ -13923,20 +13937,40 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::aarch64_neon_st4:
   case Intrinsic::aarch64_neon_st1x2:
   case Intrinsic::aarch64_neon_st1x3:
-  case Intrinsic::aarch64_neon_st1x4:
+  case Intrinsic::aarch64_neon_st1x4: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    unsigned NumElts = 0;
+    for (const Value *Arg : I.args()) {
+      Type *ArgTy = Arg->getType();
+      if (!ArgTy->isVectorTy())
+        break;
+      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+    }
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
+    Info.offset = 0;
+    Info.align.reset();
+    // volatile stores with NEON intrinsics not supported
+    Info.flags = MachineMemOperand::MOStore;
+    return true;
+  }
   case Intrinsic::aarch64_neon_st2lane:
   case Intrinsic::aarch64_neon_st3lane:
   case Intrinsic::aarch64_neon_st4lane: {
     Info.opc = ISD::INTRINSIC_VOID;
-    // Conservatively set memVT to the entire set of vectors stored.
     unsigned NumElts = 0;
+    // All the vector operands have the same type.
+    Type *VecTy = I.getArgOperand(0)->getType();
+    MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
+
     for (const Value *Arg : I.args()) {
       Type *ArgTy = Arg->getType();
       if (!ArgTy->isVectorTy())
         break;
-      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
+      NumElts += 1;
     }
-    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
     Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
     Info.offset = 0;
     Info.align.reset();
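
To make the sizing change concrete, here is a minimal standalone sketch (plain C++ that mirrors the arithmetic of the hunk above rather than calling LLVM APIs; the constants describe st4lane with <4 x float> operands):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // st4lane.v4f32: four <4 x float> registers, one 32-bit lane stored
      // from each register.
      const uint64_t NumVecs = 4, VecBits = 128, EltBits = 32;
      uint64_t OldBits = NumVecs * VecBits; // old memVT: whole registers -> s512
      uint64_t NewBits = NumVecs * EltBits; // new memVT: one lane each   -> s128
      std::printf("old = s%llu, new = s%llu\n",
                  (unsigned long long)OldBits, (unsigned long long)NewBits);
      return 0;
    }

These values match the CHECK updates in multi-vector-store-size.ll below, where ST4i32 goes from (store (s512)) to (store (s128)).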
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=arm64-none-linux-gnu -mattr=+neon -O2 | FileCheck %s
+
+; The st2 must come before the two ldrb loads.
+; Previously one ldrb was hoisted above the st2 because the conservative
+; memVT set for st2lane misled basic-aa.
+
+define dso_local i32 @test_vst2_lane_u8([2 x <8 x i8>] %vectors.coerce) local_unnamed_addr {
+; CHECK-LABEL: test_vst2_lane_u8:
+; CHECK: st2 { v[[V1:[0-9]+]].b, v[[V2:[0-9]+]].b }[6], [x8]
+; CHECK-NEXT: umov w[[W1:[0-9]+]], v[[V12:[0-9]+]].b[6]
+; CHECK-NEXT: ldrb w[[W2:[0-9]+]], [sp, #12]
+; CHECK-NEXT: ldrb w[[W2:[0-9]+]], [sp, #13]
+entry:
+  %temp = alloca [2 x i8], align 4
+  %vectors.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %vectors.coerce, 0
+  %vectors.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %vectors.coerce, 1
+  call void @llvm.lifetime.start.p0(i64 2, ptr nonnull %temp) #4
+  call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> %vectors.coerce.fca.0.extract, <8 x i8> %vectors.coerce.fca.1.extract, i64 6, ptr nonnull %temp)
+  %0 = load i8, ptr %temp, align 4
+  %vget_lane = extractelement <8 x i8> %vectors.coerce.fca.0.extract, i64 6
+  %cmp8.not = icmp ne i8 %0, %vget_lane
+  %arrayidx3.1 = getelementptr inbounds [2 x i8], ptr %temp, i64 0, i64 1
+  %1 = load i8, ptr %arrayidx3.1, align 1
+  %vget_lane.1 = extractelement <8 x i8> %vectors.coerce.fca.1.extract, i64 6
+  %cmp8.not.1 = icmp ne i8 %1, %vget_lane.1
+  %or.cond = select i1 %cmp8.not, i1 true, i1 %cmp8.not.1
+  %cmp.lcssa = zext i1 %or.cond to i32
+  call void @llvm.lifetime.end.p0(i64 2, ptr nonnull %temp) #4
+  ret i32 %cmp.lcssa
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2
+declare void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8>, <8 x i8>, i64, ptr nocapture) #2
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2
@@ -0,0 +1,106 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -stop-after=instruction-select < %s | FileCheck %s
+
+%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }
+%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }
+%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
+
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4.v2f32.p0f32(float*)
+
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float*)
+
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2r.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3r.v2f32.p0f32(float*)
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4r.v2f32.p0f32(float*)
+
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*)
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*)
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*)
+
+
+define %struct.__neon_float32x2x2_t @test_ld2(float* %addr) {
+; CHECK-LABEL: name: test_ld2
+; CHECK: LD2Twov2s {{.*}} :: (load (s128) {{.*}})
+  %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x2_t %val
+}
+
+define %struct.__neon_float32x2x3_t @test_ld3(float* %addr) {
+; CHECK-LABEL: name: test_ld3
+; CHECK: LD3Threev2s {{.*}} :: (load (s192) {{.*}})
+  %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x3_t %val
+}
+
+define %struct.__neon_float32x2x4_t @test_ld4(float* %addr) {
+; CHECK-LABEL: name: test_ld4
+; CHECK: LD4Fourv2s {{.*}} :: (load (s256) {{.*}})
+  %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x4_t %val
+}
+
+define %struct.__neon_float32x2x2_t @test_ld1x2(float* %addr) {
+; CHECK-LABEL: name: test_ld1x2
+; CHECK: LD1Twov2s {{.*}} :: (load (s128) {{.*}})
+  %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x2_t %val
+}
+
+define %struct.__neon_float32x2x3_t @test_ld1x3(float* %addr) {
+; CHECK-LABEL: name: test_ld1x3
+; CHECK: LD1Threev2s {{.*}} :: (load (s192) {{.*}})
+  %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x3_t %val
+}
+
+define %struct.__neon_float32x2x4_t @test_ld1x4(float* %addr) {
+; CHECK-LABEL: name: test_ld1x4
+; CHECK: LD1Fourv2s {{.*}} :: (load (s256) {{.*}})
+  %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x4_t %val
+}
+
+define %struct.__neon_float32x2x2_t @test_ld2r(float* %addr) {
+; CHECK-LABEL: name: test_ld2r
+; CHECK: LD2Rv2s {{.*}} :: (load (s64) {{.*}})
+  %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x2_t %val
+}
+
+define %struct.__neon_float32x2x3_t @test_ld3r(float* %addr) {
+; CHECK-LABEL: name: test_ld3r
+; CHECK: LD3Rv2s {{.*}} :: (load (s96) {{.*}})
+  %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x3_t %val
+}
+
+define %struct.__neon_float32x2x4_t @test_ld4r(float* %addr) {
+; CHECK-LABEL: name: test_ld4r
+; CHECK: LD4Rv2s {{.*}} :: (load (s128) {{.*}})
+  %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* %addr)
+  ret %struct.__neon_float32x2x4_t %val
+}
+
+define %struct.__neon_float32x2x2_t @test_ld2lane(<2 x float> %a, <2 x float> %b, float* %addr) {
+; CHECK-LABEL: name: test_ld2lane
+; CHECK: {{.*}} LD2i32 {{.*}}
+  %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, i64 1, float* %addr)
+  ret %struct.__neon_float32x2x2_t %val
+}
+
+define %struct.__neon_float32x2x3_t @test_ld3lane(<2 x float> %a, <2 x float> %b, <2 x float> %c, float* %addr) {
+; CHECK-LABEL: name: test_ld3lane
+; CHECK: {{.*}} LD3i32 {{.*}}
+  %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, i64 1, float* %addr)
+  ret %struct.__neon_float32x2x3_t %val
+}
+
+define %struct.__neon_float32x2x4_t @test_ld4lane(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, float* %addr) {
+; CHECK-LABEL: name: test_ld4lane
+; CHECK: {{.*}} LD4i32 {{.*}}
+  %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d, i64 1, float* %addr)
+  ret %struct.__neon_float32x2x4_t %val
+}
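
The sizes this test expects follow two rules: ldN and ld1xN read N whole registers, while ldNr and ldNlane touch only one element per register. A small cross-check sketch (hypothetical helpers, assuming N vectors of <2 x float> as in the struct types above):

    #include <cstdint>
    #include <cstdio>

    // ldN / ld1xN read N whole <2 x float> registers (64 bits each);
    // ldNr / ldNlane read one 32-bit element per register.
    uint64_t wholeVectorBits(uint64_t N) { return N * 64; }
    uint64_t oneEltPerVectorBits(uint64_t N) { return N * 32; }

    int main() {
      std::printf("ld3  -> s%llu\n", (unsigned long long)wholeVectorBits(3));     // s192
      std::printf("ld3r -> s%llu\n", (unsigned long long)oneEltPerVectorBits(3)); // s96
      std::printf("ld4r -> s%llu\n", (unsigned long long)oneEltPerVectorBits(4)); // s128
      return 0;
    }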

llvm/test/CodeGen/AArch64/multi-vector-store-size.ll
+3 -9
@@ -23,8 +23,6 @@ define void @addstx(ptr %res, ptr %a, ptr %b, ptr %c, ptr %d) {
   %cr = fadd <4 x float> %cl, %dl
   %dr = fadd <4 x float> %dl, %al
 
-; The sizes below are conservative. AArch64TargetLowering
-; conservatively assumes the entire vector is stored.
   tail call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> %ar, <4 x float> %br, ptr %res)
 ; CHECK: ST2Twov4s {{.*}} :: (store (s256) {{.*}})
   tail call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, ptr %res)
@@ -46,8 +44,6 @@ define void @addst1x(ptr %res, ptr %a, ptr %b, ptr %c, ptr %d) {
   %cr = fadd <4 x float> %cl, %dl
   %dr = fadd <4 x float> %dl, %al
 
-; The sizes below are conservative. AArch64TargetLowering
-; conservatively assumes the entire vector is stored.
   tail call void @llvm.aarch64.neon.st1x2.v4f32.p0(<4 x float> %ar, <4 x float> %br, ptr %res)
 ; CHECK: ST1Twov4s {{.*}} :: (store (s256) {{.*}})
   tail call void @llvm.aarch64.neon.st1x3.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, ptr %res)
@@ -69,14 +65,12 @@ define void @addstxlane(ptr %res, ptr %a, ptr %b, ptr %c, ptr %d) {
   %cr = fadd <4 x float> %cl, %dl
   %dr = fadd <4 x float> %dl, %al
 
-; The sizes below are conservative. AArch64TargetLowering
-; conservatively assumes the entire vector is stored.
   tail call void @llvm.aarch64.neon.st2lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, i64 1, ptr %res)
-; CHECK: ST2i32 {{.*}} :: (store (s256) {{.*}})
+; CHECK: ST2i32 {{.*}} :: (store (s64) {{.*}})
   tail call void @llvm.aarch64.neon.st3lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, i64 1, ptr %res)
-; CHECK: ST3i32 {{.*}} :: (store (s384) {{.*}})
+; CHECK: ST3i32 {{.*}} :: (store (s96) {{.*}})
   tail call void @llvm.aarch64.neon.st4lane.v4f32.p0(<4 x float> %ar, <4 x float> %br, <4 x float> %cr, <4 x float> %dr, i64 1, ptr %res)
-; CHECK: ST4i32 {{.*}} :: (store (s512) {{.*}})
+; CHECK: ST4i32 {{.*}} :: (store (s128) {{.*}})
 
   ret void
 }
