Commit c80080f
[AArch64][SVE] Pair SVE fill/spill into LDP/STP with -msve-vector-bits=128. (llvm#134068)

When compiling with -msve-vector-bits=128 or vscale_range(1, 1) and when the offsets allow it, we can pair SVE LDR/STR instructions into Neon LDP/STP. For example, given:

```cpp
#include <arm_sve.h>

void foo(double const *ldp, double *stp) {
  svbool_t pg = svptrue_b64();
  svfloat64_t ld1 = svld1_f64(pg, ldp);
  svfloat64_t ld2 = svld1_f64(pg, ldp+svcntd());
  svst1_f64(pg, stp, ld1);
  svst1_f64(pg, stp+svcntd(), ld2);
}
```

When compiled with `-msve-vector-bits=128`, we currently generate:

```gas
foo:
        ldr     z0, [x0]
        ldr     z1, [x0, #1, mul vl]
        str     z0, [x1]
        str     z1, [x1, #1, mul vl]
        ret
```

With this patch, we instead generate:

```gas
foo:
        ldp     q0, q1, [x0]
        stp     q0, q1, [x1]
        ret
```

This is an alternative, more targeted approach to llvm#127500.
1 parent 2f6b06b commit c80080f
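
The codegen difference above can be reproduced with a plain clang invocation along these lines (a sketch; `foo.c` is a placeholder standing in for the snippet from the commit message):

```sh
# Sketch: foo.c is assumed to contain the svld1/svst1 example above.
# -msve-vector-bits=128 pins the SVE vector length, enabling the pairing.
clang -O2 --target=aarch64-linux-gnu -march=armv8-a+sve \
      -msve-vector-bits=128 -S -o - foo.c
```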

File tree

3 files changed: +321 -1 lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp (+15)

```diff
@@ -2760,6 +2760,9 @@ bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
   case AArch64::LDRXpre:
   case AArch64::LDURSWi:
   case AArch64::LDRSWpre:
+  // SVE instructions.
+  case AArch64::LDR_ZXI:
+  case AArch64::STR_ZXI:
     return true;
   }
 }
@@ -2912,6 +2915,18 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
     return false;
   }
 
+  // Pairing SVE fills/spills is only valid for little-endian targets that
+  // implement VLS 128.
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case AArch64::LDR_ZXI:
+  case AArch64::STR_ZXI:
+    if (!Subtarget.isLittleEndian() ||
+        Subtarget.getSVEVectorSizeInBits() != 128)
+      return false;
+  }
+
   // Check if this load/store has a hint to avoid pair formation.
   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
   if (isLdStPairSuppressed(MI))
```
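
The new `getSVEVectorSizeInBits() != 128` check corresponds to the fixed-length situation described in the commit message. As a minimal IR-level sketch (the function name `@copy2vl` is illustrative, not from the patch, and it would be compiled with `llc -mattr=+sve`), the `vscale_range(1,1)` attribute pins vscale to 1, making each `<vscale x 16 x i8>` exactly one 128-bit register:

```llvm
; Sketch: with vscale_range(1,1), vscale is known to be 1, so each
; <vscale x 16 x i8> is exactly 128 bits and the back-to-back fill/spill
; pair below is a candidate for LDP/STP (on little-endian targets).
define void @copy2vl(ptr %src, ptr %dst) vscale_range(1,1) {
  %vscale = call i64 @llvm.vscale()
  %vl = shl i64 %vscale, 4            ; 16 * vscale bytes = one vector
  %src2 = getelementptr i8, ptr %src, i64 %vl
  %dst2 = getelementptr i8, ptr %dst, i64 %vl
  %a = load <vscale x 16 x i8>, ptr %src, align 1
  %b = load <vscale x 16 x i8>, ptr %src2, align 1
  store <vscale x 16 x i8> %a, ptr %dst, align 1
  store <vscale x 16 x i8> %b, ptr %dst2, align 1
  ret void
}
```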

llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp (+23 -1)

```diff
@@ -298,6 +298,7 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
   case AArch64::STRXui:
   case AArch64::STRXpre:
   case AArch64::STURXi:
+  case AArch64::STR_ZXI:
   case AArch64::LDRDui:
   case AArch64::LDURDi:
   case AArch64::LDRDpre:
@@ -316,6 +317,7 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
   case AArch64::LDRSui:
   case AArch64::LDURSi:
   case AArch64::LDRSpre:
+  case AArch64::LDR_ZXI:
     return Opc;
   case AArch64::LDRSWui:
     return AArch64::LDRWui;
@@ -361,6 +363,7 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
     return AArch64::STPDpre;
   case AArch64::STRQui:
   case AArch64::STURQi:
+  case AArch64::STR_ZXI:
     return AArch64::STPQi;
   case AArch64::STRQpre:
     return AArch64::STPQpre;
@@ -386,6 +389,7 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
     return AArch64::LDPDpre;
   case AArch64::LDRQui:
   case AArch64::LDURQi:
+  case AArch64::LDR_ZXI:
     return AArch64::LDPQi;
   case AArch64::LDRQpre:
     return AArch64::LDPQpre;
@@ -1225,6 +1229,16 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
     (void)MIBSXTW;
     LLVM_DEBUG(dbgs() << "  Extend operand:\n    ");
     LLVM_DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs()));
+  } else if (Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI) {
+    // We are combining SVE fill/spill to LDP/STP, so we need to use the Q
+    // variant of the registers.
+    MachineOperand &MOp0 = MIB->getOperand(0);
+    MachineOperand &MOp1 = MIB->getOperand(1);
+    assert(AArch64::ZPRRegClass.contains(MOp0.getReg()) &&
+           AArch64::ZPRRegClass.contains(MOp1.getReg()) && "Invalid register.");
+    MOp0.setReg(AArch64::Q0 + (MOp0.getReg() - AArch64::Z0));
+    MOp1.setReg(AArch64::Q0 + (MOp1.getReg() - AArch64::Z0));
+    LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
   } else {
     LLVM_DEBUG(((MachineInstr *)MIB)->print(dbgs()));
   }
@@ -2659,7 +2673,8 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
   // Get the needed alignments to check them if
   // ldp-aligned-only/stp-aligned-only features are opted.
   uint64_t MemAlignment = MemOp->getAlign().value();
-  uint64_t TypeAlignment = Align(MemOp->getSize().getValue()).value();
+  uint64_t TypeAlignment =
+      Align(MemOp->getSize().getValue().getKnownMinValue()).value();
 
   if (MemAlignment < 2 * TypeAlignment) {
     NumFailedAlignmentCheck++;
@@ -2820,11 +2835,18 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
   }
   // 3) Find loads and stores that can be merged into a single load or store
   //    pair instruction.
+  //    When compiling for SVE 128, also try to combine SVE fill/spill
+  //    instructions into LDP/STP.
   //      e.g.,
   //        ldr x0, [x2]
   //        ldr x1, [x2, #8]
   //        ; becomes
   //        ldp x0, x1, [x2]
+  //      e.g.,
+  //        ldr z0, [x2]
+  //        ldr z1, [x2, #1, mul vl]
+  //        ; becomes
+  //        ldp q0, q1, [x2]
 
   if (MBB.getParent()->getRegInfo().tracksLiveness()) {
     DefinedInBB.clear();
```
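
For anyone tracing the optimization, the pass's debug output can be enabled on an assertions-enabled build; a sketch, where `test.ll` is a placeholder input and `aarch64-ldst-opt` is assumed to be this file's DEBUG_TYPE:

```sh
# Sketch: observe AArch64LoadStoreOpt pairing decisions (assertions build).
llc -mtriple=aarch64-linux-gnu -mattr=+sve \
    -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 \
    -debug-only=aarch64-ldst-opt -o - test.ll
```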
New test file (+283)

```llvm
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mtriple=aarch64_be-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefixes=CHECK-BE
; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve,ldp-aligned-only -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefixes=CHECK-LDPALIGNEDONLY
; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve,stp-aligned-only -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefixes=CHECK-STPALIGNEDONLY
; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK-OFF
; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-min=256 -aarch64-sve-vector-bits-max=256 < %s | FileCheck %s --check-prefixes=CHECK-OFF

define void @nxv16i8(ptr %ldptr, ptr %stptr) {
; CHECK-LABEL: nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp q0, q1, [x0]
; CHECK-NEXT:    stp q0, q1, [x1]
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: nxv16i8:
; CHECK-BE:       // %bb.0:
; CHECK-BE-NEXT:    ptrue p0.b
; CHECK-BE-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-BE-NEXT:    ld1b { z1.b }, p0/z, [x0, #1, mul vl]
; CHECK-BE-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-BE-NEXT:    st1b { z1.b }, p0, [x1, #1, mul vl]
; CHECK-BE-NEXT:    ret
;
; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8:
; CHECK-LDPALIGNEDONLY:       // %bb.0:
; CHECK-LDPALIGNEDONLY-NEXT:    ldr z0, [x0]
; CHECK-LDPALIGNEDONLY-NEXT:    ldr z1, [x0, #1, mul vl]
; CHECK-LDPALIGNEDONLY-NEXT:    stp q0, q1, [x1]
; CHECK-LDPALIGNEDONLY-NEXT:    ret
;
; CHECK-STPALIGNEDONLY-LABEL: nxv16i8:
; CHECK-STPALIGNEDONLY:       // %bb.0:
; CHECK-STPALIGNEDONLY-NEXT:    ldp q0, q1, [x0]
; CHECK-STPALIGNEDONLY-NEXT:    str z0, [x1]
; CHECK-STPALIGNEDONLY-NEXT:    str z1, [x1, #1, mul vl]
; CHECK-STPALIGNEDONLY-NEXT:    ret
;
; CHECK-OFF-LABEL: nxv16i8:
; CHECK-OFF:       // %bb.0:
; CHECK-OFF-NEXT:    ldr z0, [x0]
; CHECK-OFF-NEXT:    ldr z1, [x0, #1, mul vl]
; CHECK-OFF-NEXT:    str z0, [x1]
; CHECK-OFF-NEXT:    str z1, [x1, #1, mul vl]
; CHECK-OFF-NEXT:    ret
  %vscale = tail call i64 @llvm.vscale()
  %vl = shl nuw nsw i64 %vscale, 4
  %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %vl
  %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %vl
  %ld1 = load <vscale x 16 x i8>, ptr %ldptr, align 1
  %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
  store <vscale x 16 x i8> %ld1, ptr %stptr, align 1
  store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
  ret void
}

define void @nxv16i8_max_range(ptr %ldptr, ptr %stptr) {
; CHECK-LABEL: nxv16i8_max_range:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp q0, q1, [x0, #-1024]
; CHECK-NEXT:    stp q0, q1, [x1, #1008]
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: nxv16i8_max_range:
; CHECK-BE:       // %bb.0:
; CHECK-BE-NEXT:    rdvl x8, #1
; CHECK-BE-NEXT:    mov x9, #-1008 // =0xfffffffffffffc10
; CHECK-BE-NEXT:    mov x10, #-1024 // =0xfffffffffffffc00
; CHECK-BE-NEXT:    lsr x8, x8, #4
; CHECK-BE-NEXT:    mov w11, #1008 // =0x3f0
; CHECK-BE-NEXT:    mov w12, #1024 // =0x400
; CHECK-BE-NEXT:    ptrue p0.b
; CHECK-BE-NEXT:    mul x9, x8, x9
; CHECK-BE-NEXT:    mul x10, x8, x10
; CHECK-BE-NEXT:    mul x11, x8, x11
; CHECK-BE-NEXT:    ld1b { z1.b }, p0/z, [x0, x9]
; CHECK-BE-NEXT:    mul x8, x8, x12
; CHECK-BE-NEXT:    ld1b { z0.b }, p0/z, [x0, x10]
; CHECK-BE-NEXT:    st1b { z0.b }, p0, [x1, x11]
; CHECK-BE-NEXT:    st1b { z1.b }, p0, [x1, x8]
; CHECK-BE-NEXT:    ret
;
; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8_max_range:
; CHECK-LDPALIGNEDONLY:       // %bb.0:
; CHECK-LDPALIGNEDONLY-NEXT:    ldr z0, [x0, #-64, mul vl]
; CHECK-LDPALIGNEDONLY-NEXT:    ldr z1, [x0, #-63, mul vl]
; CHECK-LDPALIGNEDONLY-NEXT:    stp q0, q1, [x1, #1008]
; CHECK-LDPALIGNEDONLY-NEXT:    ret
;
; CHECK-STPALIGNEDONLY-LABEL: nxv16i8_max_range:
; CHECK-STPALIGNEDONLY:       // %bb.0:
; CHECK-STPALIGNEDONLY-NEXT:    ldp q0, q1, [x0, #-1024]
; CHECK-STPALIGNEDONLY-NEXT:    str z0, [x1, #63, mul vl]
; CHECK-STPALIGNEDONLY-NEXT:    str z1, [x1, #64, mul vl]
; CHECK-STPALIGNEDONLY-NEXT:    ret
;
; CHECK-OFF-LABEL: nxv16i8_max_range:
; CHECK-OFF:       // %bb.0:
; CHECK-OFF-NEXT:    ldr z0, [x0, #-64, mul vl]
; CHECK-OFF-NEXT:    ldr z1, [x0, #-63, mul vl]
; CHECK-OFF-NEXT:    str z0, [x1, #63, mul vl]
; CHECK-OFF-NEXT:    str z1, [x1, #64, mul vl]
; CHECK-OFF-NEXT:    ret
  %vscale = tail call i64 @llvm.vscale()
  %ldoff1 = mul i64 %vscale, -1024
  %ldoff2 = mul i64 %vscale, -1008
  %stoff1 = mul i64 %vscale, 1008
  %stoff2 = mul i64 %vscale, 1024
  %ldptr1 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff1
  %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff2
  %stptr1 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff1
  %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff2
  %ld1 = load <vscale x 16 x i8>, ptr %ldptr1, align 1
  %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
  store <vscale x 16 x i8> %ld1, ptr %stptr1, align 1
  store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
  ret void
}

define void @nxv16i8_outside_range(ptr %ldptr, ptr %stptr) {
; CHECK-LABEL: nxv16i8_outside_range:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr z0, [x0, #-65, mul vl]
; CHECK-NEXT:    ldr z1, [x0, #-64, mul vl]
; CHECK-NEXT:    str z0, [x1, #64, mul vl]
; CHECK-NEXT:    str z1, [x1, #65, mul vl]
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: nxv16i8_outside_range:
; CHECK-BE:       // %bb.0:
; CHECK-BE-NEXT:    rdvl x8, #1
; CHECK-BE-NEXT:    mov x9, #-1040 // =0xfffffffffffffbf0
; CHECK-BE-NEXT:    mov x10, #-1024 // =0xfffffffffffffc00
; CHECK-BE-NEXT:    lsr x8, x8, #4
; CHECK-BE-NEXT:    mov w11, #1024 // =0x400
; CHECK-BE-NEXT:    mov w12, #1040 // =0x410
; CHECK-BE-NEXT:    ptrue p0.b
; CHECK-BE-NEXT:    mul x9, x8, x9
; CHECK-BE-NEXT:    mul x10, x8, x10
; CHECK-BE-NEXT:    mul x11, x8, x11
; CHECK-BE-NEXT:    ld1b { z0.b }, p0/z, [x0, x9]
; CHECK-BE-NEXT:    mul x8, x8, x12
; CHECK-BE-NEXT:    ld1b { z1.b }, p0/z, [x0, x10]
; CHECK-BE-NEXT:    st1b { z0.b }, p0, [x1, x11]
; CHECK-BE-NEXT:    st1b { z1.b }, p0, [x1, x8]
; CHECK-BE-NEXT:    ret
;
; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8_outside_range:
; CHECK-LDPALIGNEDONLY:       // %bb.0:
; CHECK-LDPALIGNEDONLY-NEXT:    ldr z0, [x0, #-65, mul vl]
; CHECK-LDPALIGNEDONLY-NEXT:    ldr z1, [x0, #-64, mul vl]
; CHECK-LDPALIGNEDONLY-NEXT:    str z0, [x1, #64, mul vl]
; CHECK-LDPALIGNEDONLY-NEXT:    str z1, [x1, #65, mul vl]
; CHECK-LDPALIGNEDONLY-NEXT:    ret
;
; CHECK-STPALIGNEDONLY-LABEL: nxv16i8_outside_range:
; CHECK-STPALIGNEDONLY:       // %bb.0:
; CHECK-STPALIGNEDONLY-NEXT:    ldr z0, [x0, #-65, mul vl]
; CHECK-STPALIGNEDONLY-NEXT:    ldr z1, [x0, #-64, mul vl]
; CHECK-STPALIGNEDONLY-NEXT:    str z0, [x1, #64, mul vl]
; CHECK-STPALIGNEDONLY-NEXT:    str z1, [x1, #65, mul vl]
; CHECK-STPALIGNEDONLY-NEXT:    ret
;
; CHECK-OFF-LABEL: nxv16i8_outside_range:
; CHECK-OFF:       // %bb.0:
; CHECK-OFF-NEXT:    ldr z0, [x0, #-65, mul vl]
; CHECK-OFF-NEXT:    ldr z1, [x0, #-64, mul vl]
; CHECK-OFF-NEXT:    str z0, [x1, #64, mul vl]
; CHECK-OFF-NEXT:    str z1, [x1, #65, mul vl]
; CHECK-OFF-NEXT:    ret
  %vscale = tail call i64 @llvm.vscale()
  %ldoff1 = mul i64 %vscale, -1040
  %ldoff2 = mul i64 %vscale, -1024
  %stoff1 = mul i64 %vscale, 1024
  %stoff2 = mul i64 %vscale, 1040
  %ldptr1 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff1
  %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %ldoff2
  %stptr1 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff1
  %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %stoff2
  %ld1 = load <vscale x 16 x i8>, ptr %ldptr1, align 1
  %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
  store <vscale x 16 x i8> %ld1, ptr %stptr1, align 1
  store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
  ret void
}

define void @nxv16i8_2vl_stride(ptr %ldptr, ptr %stptr) {
; CHECK-LABEL: nxv16i8_2vl_stride:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr z0, [x0]
; CHECK-NEXT:    ldr z1, [x0, #2, mul vl]
; CHECK-NEXT:    str z0, [x1]
; CHECK-NEXT:    str z1, [x1, #2, mul vl]
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: nxv16i8_2vl_stride:
; CHECK-BE:       // %bb.0:
; CHECK-BE-NEXT:    ptrue p0.b
; CHECK-BE-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-BE-NEXT:    ld1b { z1.b }, p0/z, [x0, #2, mul vl]
; CHECK-BE-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-BE-NEXT:    st1b { z1.b }, p0, [x1, #2, mul vl]
; CHECK-BE-NEXT:    ret
;
; CHECK-LDPALIGNEDONLY-LABEL: nxv16i8_2vl_stride:
; CHECK-LDPALIGNEDONLY:       // %bb.0:
; CHECK-LDPALIGNEDONLY-NEXT:    ldr z0, [x0]
; CHECK-LDPALIGNEDONLY-NEXT:    ldr z1, [x0, #2, mul vl]
; CHECK-LDPALIGNEDONLY-NEXT:    str z0, [x1]
; CHECK-LDPALIGNEDONLY-NEXT:    str z1, [x1, #2, mul vl]
; CHECK-LDPALIGNEDONLY-NEXT:    ret
;
; CHECK-STPALIGNEDONLY-LABEL: nxv16i8_2vl_stride:
; CHECK-STPALIGNEDONLY:       // %bb.0:
; CHECK-STPALIGNEDONLY-NEXT:    ldr z0, [x0]
; CHECK-STPALIGNEDONLY-NEXT:    ldr z1, [x0, #2, mul vl]
; CHECK-STPALIGNEDONLY-NEXT:    str z0, [x1]
; CHECK-STPALIGNEDONLY-NEXT:    str z1, [x1, #2, mul vl]
; CHECK-STPALIGNEDONLY-NEXT:    ret
;
; CHECK-OFF-LABEL: nxv16i8_2vl_stride:
; CHECK-OFF:       // %bb.0:
; CHECK-OFF-NEXT:    ldr z0, [x0]
; CHECK-OFF-NEXT:    ldr z1, [x0, #2, mul vl]
; CHECK-OFF-NEXT:    str z0, [x1]
; CHECK-OFF-NEXT:    str z1, [x1, #2, mul vl]
; CHECK-OFF-NEXT:    ret
  %vscale = tail call i64 @llvm.vscale()
  %vl = shl nuw nsw i64 %vscale, 5
  %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %vl
  %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %vl
  %ld1 = load <vscale x 16 x i8>, ptr %ldptr, align 1
  %ld2 = load <vscale x 16 x i8>, ptr %ldptr2, align 1
  store <vscale x 16 x i8> %ld1, ptr %stptr, align 1
  store <vscale x 16 x i8> %ld2, ptr %stptr2, align 1
  ret void
}

define void @nxv2f64_32b_aligned(ptr %ldptr, ptr %stptr) {
; CHECK-LABEL: nxv2f64_32b_aligned:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp q0, q1, [x0]
; CHECK-NEXT:    stp q0, q1, [x1]
; CHECK-NEXT:    ret
;
; CHECK-BE-LABEL: nxv2f64_32b_aligned:
; CHECK-BE:       // %bb.0:
; CHECK-BE-NEXT:    ptrue p0.d
; CHECK-BE-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-BE-NEXT:    ld1d { z1.d }, p0/z, [x0, #1, mul vl]
; CHECK-BE-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-BE-NEXT:    st1d { z1.d }, p0, [x1, #1, mul vl]
; CHECK-BE-NEXT:    ret
;
; CHECK-LDPALIGNEDONLY-LABEL: nxv2f64_32b_aligned:
; CHECK-LDPALIGNEDONLY:       // %bb.0:
; CHECK-LDPALIGNEDONLY-NEXT:    ldp q0, q1, [x0]
; CHECK-LDPALIGNEDONLY-NEXT:    stp q0, q1, [x1]
; CHECK-LDPALIGNEDONLY-NEXT:    ret
;
; CHECK-STPALIGNEDONLY-LABEL: nxv2f64_32b_aligned:
; CHECK-STPALIGNEDONLY:       // %bb.0:
; CHECK-STPALIGNEDONLY-NEXT:    ldp q0, q1, [x0]
; CHECK-STPALIGNEDONLY-NEXT:    stp q0, q1, [x1]
; CHECK-STPALIGNEDONLY-NEXT:    ret
;
; CHECK-OFF-LABEL: nxv2f64_32b_aligned:
; CHECK-OFF:       // %bb.0:
; CHECK-OFF-NEXT:    ldr z0, [x0]
; CHECK-OFF-NEXT:    ldr z1, [x0, #1, mul vl]
; CHECK-OFF-NEXT:    str z0, [x1]
; CHECK-OFF-NEXT:    str z1, [x1, #1, mul vl]
; CHECK-OFF-NEXT:    ret
  %vscale = tail call i64 @llvm.vscale()
  %vl = shl nuw nsw i64 %vscale, 4
  %ldptr2 = getelementptr inbounds nuw i8, ptr %ldptr, i64 %vl
  %stptr2 = getelementptr inbounds nuw i8, ptr %stptr, i64 %vl
  %ld1 = load <vscale x 2 x double>, ptr %ldptr, align 32
  %ld2 = load <vscale x 2 x double>, ptr %ldptr2, align 32
  store <vscale x 2 x double> %ld1, ptr %stptr, align 32
  store <vscale x 2 x double> %ld2, ptr %stptr2, align 32
  ret void
}
```
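
As the NOTE line says, the assertions above are autogenerated; they can be refreshed after codegen changes with `update_llc_test_checks.py`. A sketch, with the test path left as a placeholder since this view does not show the new file's name:

```sh
# Sketch: regenerate the CHECK lines; both paths are placeholders.
llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
    llvm/test/CodeGen/AArch64/<this-test>.ll
```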

0 commit comments

Comments
 (0)