Skip to content

Commit d1e162e

Browse files
authored
[AArch64] Add custom lowering for load <3 x i8>. (#78632)
Add custom combine to lower load <3 x i8> as the more efficient sequence below:

    ldrb wX, [x0, #2]
    ldrh wY, [x0]
    orr wX, wY, wX, lsl #16
    fmov s0, wX

At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: llvm/llvm-project#77790
1 parent 748c295 commit d1e162e

File tree

2 files changed

+124
-114
lines changed

2 files changed

+124
-114
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+63-2
Original file line numberDiff line numberDiff line change
@@ -21248,6 +21248,61 @@ static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
2124821248
return SDValue();
2124921249
}
2125021250

21251+
// A custom combine to lower load <3 x i8> as the more efficient sequence
21252+
// below:
21253+
// ldrb wX, [x0, #2]
21254+
// ldrh wY, [x0]
21255+
// orr wX, wY, wX, lsl #16
21256+
// fmov s0, wX
21257+
//
21258+
// Note that an alternative sequence with even fewer (although usually more
21259+
// complex/expensive) instructions would be:
21260+
// ld1r.4h { v0 }, [x0], #2
21261+
// ld1.b { v0 }[2], [x0]
21262+
//
21263+
// Generating this sequence unfortunately results in noticeably worse codegen
21264+
// for code that extends the loaded v3i8, due to legalization breaking vector
21265+
// shuffle detection in a way that is very difficult to work around.
21266+
// TODO: Revisit once v3i8 legalization has been improved in general.
21267+
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
21268+
EVT MemVT = LD->getMemoryVT();
21269+
if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
21270+
LD->getOriginalAlign() >= 4)
21271+
return SDValue();
21272+
21273+
SDLoc DL(LD);
21274+
MachineFunction &MF = DAG.getMachineFunction();
21275+
SDValue Chain = LD->getChain();
21276+
SDValue BasePtr = LD->getBasePtr();
21277+
MachineMemOperand *MMO = LD->getMemOperand();
21278+
assert(LD->getOffset().isUndef() && "undef offset expected");
21279+
21280+
// Load 2 x i8, then 1 x i8.
21281+
SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
21282+
TypeSize Offset2 = TypeSize::getFixed(2);
21283+
SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
21284+
DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
21285+
MF.getMachineMemOperand(MMO, 2, 1));
21286+
21287+
// Extend to i32.
21288+
SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
21289+
SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
21290+
21291+
// Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
21292+
SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
21293+
DAG.getConstant(16, DL, MVT::i32));
21294+
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
21295+
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
21296+
21297+
// Extract v3i8 again.
21298+
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
21299+
DAG.getConstant(0, DL, MVT::i64));
21300+
SDValue TokenFactor = DAG.getNode(
21301+
ISD::TokenFactor, DL, MVT::Other,
21302+
{SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
21303+
return DAG.getMergeValues({Extract, TokenFactor}, DL);
21304+
}
21305+
2125121306
// Perform TBI simplification if supported by the target and try to break up
2125221307
// nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit
2125321308
// load instructions can be selected.
@@ -21259,10 +21314,16 @@ static SDValue performLOADCombine(SDNode *N,
2125921314
performTBISimplification(N->getOperand(1), DCI, DAG);
2126021315

2126121316
LoadSDNode *LD = cast<LoadSDNode>(N);
21262-
EVT MemVT = LD->getMemoryVT();
21263-
if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
21317+
if (LD->isVolatile() || !Subtarget->isLittleEndian())
2126421318
return SDValue(N, 0);
2126521319

21320+
if (SDValue Res = combineV3I8LoadExt(LD, DAG))
21321+
return Res;
21322+
21323+
if (!LD->isNonTemporal())
21324+
return SDValue(N, 0);
21325+
21326+
EVT MemVT = LD->getMemoryVT();
2126621327
if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
2126721328
MemVT.getSizeInBits() % 256 == 0 ||
2126821329
256 % MemVT.getScalarSizeInBits() != 0)

llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll

+61-112
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,10 @@
55
define <16 x i8> @load_v3i8(ptr %src) {
66
; CHECK-LABEL: load_v3i8:
77
; CHECK: ; %bb.0:
8-
; CHECK-NEXT: sub sp, sp, #16
9-
; CHECK-NEXT: .cfi_def_cfa_offset 16
10-
; CHECK-NEXT: ldrh w8, [x0]
11-
; CHECK-NEXT: strh w8, [sp, #12]
12-
; CHECK-NEXT: ldr s0, [sp, #12]
13-
; CHECK-NEXT: ushll.8h v0, v0, #0
14-
; CHECK-NEXT: umov.h w8, v0[0]
15-
; CHECK-NEXT: umov.h w9, v0[1]
8+
; CHECK-NEXT: ldrb w8, [x0, #2]
9+
; CHECK-NEXT: ldrh w9, [x0]
10+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
1611
; CHECK-NEXT: fmov s0, w8
17-
; CHECK-NEXT: add x8, x0, #2
18-
; CHECK-NEXT: mov.b v0[1], w9
19-
; CHECK-NEXT: ld1.b { v0 }[2], [x8]
20-
; CHECK-NEXT: add sp, sp, #16
2112
; CHECK-NEXT: ret
2213
;
2314
; BE-LABEL: load_v3i8:
@@ -47,19 +38,14 @@ define <16 x i8> @load_v3i8(ptr %src) {
4738
define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
4839
; CHECK-LABEL: load_v3i8_to_4xi32:
4940
; CHECK: ; %bb.0:
50-
; CHECK-NEXT: sub sp, sp, #16
51-
; CHECK-NEXT: .cfi_def_cfa_offset 16
52-
; CHECK-NEXT: ldrh w8, [x0]
41+
; CHECK-NEXT: ldrb w8, [x0, #2]
42+
; CHECK-NEXT: ldrh w9, [x0]
5343
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
54-
; CHECK-NEXT: strh w8, [sp, #12]
55-
; CHECK-NEXT: ldr s0, [sp, #12]
56-
; CHECK-NEXT: ldrsb w8, [x0, #2]
57-
; CHECK-NEXT: ushll.8h v0, v0, #0
58-
; CHECK-NEXT: mov.h v0[1], v0[1]
59-
; CHECK-NEXT: mov.h v0[2], w8
44+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
45+
; CHECK-NEXT: fmov s0, w8
46+
; CHECK-NEXT: zip1.8b v0, v0, v0
6047
; CHECK-NEXT: ushll.4s v0, v0, #0
6148
; CHECK-NEXT: and.16b v0, v0, v1
62-
; CHECK-NEXT: add sp, sp, #16
6349
; CHECK-NEXT: ret
6450
;
6551
; BE-LABEL: load_v3i8_to_4xi32:
@@ -90,19 +76,14 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
9076
define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) {
9177
; CHECK-LABEL: load_v3i8_to_4xi32_align_2:
9278
; CHECK: ; %bb.0:
93-
; CHECK-NEXT: sub sp, sp, #16
94-
; CHECK-NEXT: .cfi_def_cfa_offset 16
95-
; CHECK-NEXT: ldrh w8, [x0]
79+
; CHECK-NEXT: ldrb w8, [x0, #2]
80+
; CHECK-NEXT: ldrh w9, [x0]
9681
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
97-
; CHECK-NEXT: strh w8, [sp, #12]
98-
; CHECK-NEXT: ldr s0, [sp, #12]
99-
; CHECK-NEXT: ldrsb w8, [x0, #2]
100-
; CHECK-NEXT: ushll.8h v0, v0, #0
101-
; CHECK-NEXT: mov.h v0[1], v0[1]
102-
; CHECK-NEXT: mov.h v0[2], w8
82+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
83+
; CHECK-NEXT: fmov s0, w8
84+
; CHECK-NEXT: zip1.8b v0, v0, v0
10385
; CHECK-NEXT: ushll.4s v0, v0, #0
10486
; CHECK-NEXT: and.16b v0, v0, v1
105-
; CHECK-NEXT: add sp, sp, #16
10687
; CHECK-NEXT: ret
10788
;
10889
; BE-LABEL: load_v3i8_to_4xi32_align_2:
@@ -160,19 +141,14 @@ define <4 x i32> @load_v3i8_to_4xi32_align_4(ptr %src) {
160141
define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
161142
; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1:
162143
; CHECK: ; %bb.0:
163-
; CHECK-NEXT: sub sp, sp, #16
164-
; CHECK-NEXT: .cfi_def_cfa_offset 16
165-
; CHECK-NEXT: ldurh w8, [x0, #1]
144+
; CHECK-NEXT: ldrb w8, [x0, #3]
145+
; CHECK-NEXT: ldurh w9, [x0, #1]
166146
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
167-
; CHECK-NEXT: strh w8, [sp, #12]
168-
; CHECK-NEXT: ldr s0, [sp, #12]
169-
; CHECK-NEXT: ldrsb w8, [x0, #3]
170-
; CHECK-NEXT: ushll.8h v0, v0, #0
171-
; CHECK-NEXT: mov.h v0[1], v0[1]
172-
; CHECK-NEXT: mov.h v0[2], w8
147+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
148+
; CHECK-NEXT: fmov s0, w8
149+
; CHECK-NEXT: zip1.8b v0, v0, v0
173150
; CHECK-NEXT: ushll.4s v0, v0, #0
174151
; CHECK-NEXT: and.16b v0, v0, v1
175-
; CHECK-NEXT: add sp, sp, #16
176152
; CHECK-NEXT: ret
177153
;
178154
; BE-LABEL: load_v3i8_to_4xi32_const_offset_1:
@@ -204,19 +180,14 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
204180
define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) {
205181
; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_3:
206182
; CHECK: ; %bb.0:
207-
; CHECK-NEXT: sub sp, sp, #16
208-
; CHECK-NEXT: .cfi_def_cfa_offset 16
209-
; CHECK-NEXT: ldurh w8, [x0, #3]
183+
; CHECK-NEXT: ldrb w8, [x0, #5]
184+
; CHECK-NEXT: ldurh w9, [x0, #3]
210185
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
211-
; CHECK-NEXT: strh w8, [sp, #12]
212-
; CHECK-NEXT: ldr s0, [sp, #12]
213-
; CHECK-NEXT: ldrsb w8, [x0, #5]
214-
; CHECK-NEXT: ushll.8h v0, v0, #0
215-
; CHECK-NEXT: mov.h v0[1], v0[1]
216-
; CHECK-NEXT: mov.h v0[2], w8
186+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
187+
; CHECK-NEXT: fmov s0, w8
188+
; CHECK-NEXT: zip1.8b v0, v0, v0
217189
; CHECK-NEXT: ushll.4s v0, v0, #0
218190
; CHECK-NEXT: and.16b v0, v0, v1
219-
; CHECK-NEXT: add sp, sp, #16
220191
; CHECK-NEXT: ret
221192
;
222193
; BE-LABEL: load_v3i8_to_4xi32_const_offset_3:
@@ -348,18 +319,14 @@ define <3 x i32> @load_v3i32(ptr %src) {
348319
define <3 x i32> @load_v3i8_zext_to_3xi32(ptr %src) {
349320
; CHECK-LABEL: load_v3i8_zext_to_3xi32:
350321
; CHECK: ; %bb.0:
351-
; CHECK-NEXT: sub sp, sp, #16
352-
; CHECK-NEXT: .cfi_def_cfa_offset 16
353-
; CHECK-NEXT: ldrh w8, [x0]
322+
; CHECK-NEXT: ldrb w8, [x0, #2]
323+
; CHECK-NEXT: ldrh w9, [x0]
354324
; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
355-
; CHECK-NEXT: strh w8, [sp, #12]
356-
; CHECK-NEXT: add x8, x0, #2
357-
; CHECK-NEXT: ldr s0, [sp, #12]
358-
; CHECK-NEXT: ushll.8h v0, v0, #0
359-
; CHECK-NEXT: ld1.b { v0 }[4], [x8]
325+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
326+
; CHECK-NEXT: fmov s0, w8
327+
; CHECK-NEXT: zip1.8b v0, v0, v0
360328
; CHECK-NEXT: ushll.4s v0, v0, #0
361329
; CHECK-NEXT: and.16b v0, v0, v1
362-
; CHECK-NEXT: add sp, sp, #16
363330
; CHECK-NEXT: ret
364331
;
365332
; BE-LABEL: load_v3i8_zext_to_3xi32:
@@ -388,18 +355,14 @@ define <3 x i32> @load_v3i8_zext_to_3xi32(ptr %src) {
388355
define <3 x i32> @load_v3i8_sext_to_3xi32(ptr %src) {
389356
; CHECK-LABEL: load_v3i8_sext_to_3xi32:
390357
; CHECK: ; %bb.0:
391-
; CHECK-NEXT: sub sp, sp, #16
392-
; CHECK-NEXT: .cfi_def_cfa_offset 16
393-
; CHECK-NEXT: ldrh w8, [x0]
394-
; CHECK-NEXT: strh w8, [sp, #12]
395-
; CHECK-NEXT: add x8, x0, #2
396-
; CHECK-NEXT: ldr s0, [sp, #12]
397-
; CHECK-NEXT: ushll.8h v0, v0, #0
398-
; CHECK-NEXT: ld1.b { v0 }[4], [x8]
358+
; CHECK-NEXT: ldrb w8, [x0, #2]
359+
; CHECK-NEXT: ldrh w9, [x0]
360+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
361+
; CHECK-NEXT: fmov s0, w8
362+
; CHECK-NEXT: zip1.8b v0, v0, v0
399363
; CHECK-NEXT: ushll.4s v0, v0, #0
400364
; CHECK-NEXT: shl.4s v0, v0, #24
401365
; CHECK-NEXT: sshr.4s v0, v0, #24
402-
; CHECK-NEXT: add sp, sp, #16
403366
; CHECK-NEXT: ret
404367
;
405368
; BE-LABEL: load_v3i8_sext_to_3xi32:
@@ -513,19 +476,15 @@ entry:
513476
define void @load_ext_to_64bits(ptr %src, ptr %dst) {
514477
; CHECK-LABEL: load_ext_to_64bits:
515478
; CHECK: ; %bb.0: ; %entry
516-
; CHECK-NEXT: sub sp, sp, #16
517-
; CHECK-NEXT: .cfi_def_cfa_offset 16
518-
; CHECK-NEXT: ldrh w8, [x0]
519-
; CHECK-NEXT: strh w8, [sp, #12]
520-
; CHECK-NEXT: add x8, x0, #2
521-
; CHECK-NEXT: ldr s0, [sp, #12]
522-
; CHECK-NEXT: ushll.8h v0, v0, #0
523-
; CHECK-NEXT: ld1.b { v0 }[4], [x8]
479+
; CHECK-NEXT: ldrb w8, [x0, #2]
480+
; CHECK-NEXT: ldrh w9, [x0]
481+
; CHECK-NEXT: orr w8, w9, w8, lsl #16
482+
; CHECK-NEXT: fmov s0, w8
524483
; CHECK-NEXT: add x8, x1, #4
484+
; CHECK-NEXT: zip1.8b v0, v0, v0
525485
; CHECK-NEXT: bic.4h v0, #255, lsl #8
526486
; CHECK-NEXT: st1.h { v0 }[2], [x8]
527487
; CHECK-NEXT: str s0, [x1]
528-
; CHECK-NEXT: add sp, sp, #16
529488
; CHECK-NEXT: ret
530489
;
531490
; BE-LABEL: load_ext_to_64bits:
@@ -614,24 +573,20 @@ entry:
614573
define void @load_ext_add_to_64bits(ptr %src, ptr %dst) {
615574
; CHECK-LABEL: load_ext_add_to_64bits:
616575
; CHECK: ; %bb.0: ; %entry
617-
; CHECK-NEXT: sub sp, sp, #16
618-
; CHECK-NEXT: .cfi_def_cfa_offset 16
619-
; CHECK-NEXT: ldrh w9, [x0]
576+
; CHECK-NEXT: ldrb w9, [x0, #2]
577+
; CHECK-NEXT: ldrh w10, [x0]
620578
; CHECK-NEXT: Lloh2:
621579
; CHECK-NEXT: adrp x8, lCPI15_0@PAGE
622580
; CHECK-NEXT: Lloh3:
623581
; CHECK-NEXT: ldr d1, [x8, lCPI15_0@PAGEOFF]
624582
; CHECK-NEXT: add x8, x1, #4
625-
; CHECK-NEXT: strh w9, [sp, #12]
626-
; CHECK-NEXT: add x9, x0, #2
627-
; CHECK-NEXT: ldr s0, [sp, #12]
628-
; CHECK-NEXT: ushll.8h v0, v0, #0
629-
; CHECK-NEXT: ld1.b { v0 }[4], [x9]
583+
; CHECK-NEXT: orr w9, w10, w9, lsl #16
584+
; CHECK-NEXT: fmov s0, w9
585+
; CHECK-NEXT: zip1.8b v0, v0, v0
630586
; CHECK-NEXT: bic.4h v0, #255, lsl #8
631587
; CHECK-NEXT: add.4h v0, v0, v1
632588
; CHECK-NEXT: st1.h { v0 }[2], [x8]
633589
; CHECK-NEXT: str s0, [x1]
634-
; CHECK-NEXT: add sp, sp, #16
635590
; CHECK-NEXT: ret
636591
; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3
637592
;
@@ -880,24 +835,21 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
880835
define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) {
881836
; CHECK-LABEL: load_v3i8_zext_to_3xi32_add_trunc_store:
882837
; CHECK: ; %bb.0:
883-
; CHECK-NEXT: sub sp, sp, #16
884-
; CHECK-NEXT: .cfi_def_cfa_offset 16
885-
; CHECK-NEXT: ldrh w9, [x0]
838+
; CHECK-NEXT: ldrb w9, [x0, #2]
839+
; CHECK-NEXT: ldrh w10, [x0]
886840
; CHECK-NEXT: Lloh4:
887841
; CHECK-NEXT: adrp x8, lCPI22_0@PAGE
888842
; CHECK-NEXT: Lloh5:
889843
; CHECK-NEXT: ldr q1, [x8, lCPI22_0@PAGEOFF]
890-
; CHECK-NEXT: add x8, x0, #1
891-
; CHECK-NEXT: strh w9, [sp, #12]
892-
; CHECK-NEXT: add x9, x0, #2
893-
; CHECK-NEXT: ldr s0, [sp, #12]
894-
; CHECK-NEXT: ushll.8h v0, v0, #0
895-
; CHECK-NEXT: ld1.b { v0 }[4], [x9]
844+
; CHECK-NEXT: add x8, x0, #2
845+
; CHECK-NEXT: orr w9, w10, w9, lsl #16
846+
; CHECK-NEXT: fmov s0, w9
847+
; CHECK-NEXT: zip1.8b v0, v0, v0
896848
; CHECK-NEXT: uaddw.4s v0, v1, v0
897-
; CHECK-NEXT: st1.b { v0 }[4], [x8]
898-
; CHECK-NEXT: st1.b { v0 }[8], [x9]
849+
; CHECK-NEXT: st1.b { v0 }[8], [x8]
850+
; CHECK-NEXT: add x8, x0, #1
899851
; CHECK-NEXT: st1.b { v0 }[0], [x0]
900-
; CHECK-NEXT: add sp, sp, #16
852+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
901853
; CHECK-NEXT: ret
902854
; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5
903855
;
@@ -936,24 +888,21 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) {
936888
define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) {
937889
; CHECK-LABEL: load_v3i8_sext_to_3xi32_add_trunc_store:
938890
; CHECK: ; %bb.0:
939-
; CHECK-NEXT: sub sp, sp, #16
940-
; CHECK-NEXT: .cfi_def_cfa_offset 16
941-
; CHECK-NEXT: ldrh w9, [x0]
891+
; CHECK-NEXT: ldrb w9, [x0, #2]
892+
; CHECK-NEXT: ldrh w10, [x0]
942893
; CHECK-NEXT: Lloh6:
943894
; CHECK-NEXT: adrp x8, lCPI23_0@PAGE
944895
; CHECK-NEXT: Lloh7:
945896
; CHECK-NEXT: ldr q1, [x8, lCPI23_0@PAGEOFF]
946-
; CHECK-NEXT: add x8, x0, #1
947-
; CHECK-NEXT: strh w9, [sp, #12]
948-
; CHECK-NEXT: add x9, x0, #2
949-
; CHECK-NEXT: ldr s0, [sp, #12]
950-
; CHECK-NEXT: ushll.8h v0, v0, #0
951-
; CHECK-NEXT: ld1.b { v0 }[4], [x9]
897+
; CHECK-NEXT: add x8, x0, #2
898+
; CHECK-NEXT: orr w9, w10, w9, lsl #16
899+
; CHECK-NEXT: fmov s0, w9
900+
; CHECK-NEXT: zip1.8b v0, v0, v0
952901
; CHECK-NEXT: uaddw.4s v0, v1, v0
953-
; CHECK-NEXT: st1.b { v0 }[4], [x8]
954-
; CHECK-NEXT: st1.b { v0 }[8], [x9]
902+
; CHECK-NEXT: st1.b { v0 }[8], [x8]
903+
; CHECK-NEXT: add x8, x0, #1
955904
; CHECK-NEXT: st1.b { v0 }[0], [x0]
956-
; CHECK-NEXT: add sp, sp, #16
905+
; CHECK-NEXT: st1.b { v0 }[4], [x8]
957906
; CHECK-NEXT: ret
958907
; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7
959908
;

0 commit comments

Comments
 (0)