diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 028bffd1bf5a7..65cfa722dbd72 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -4072,6 +4072,19 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) { if (MemTy != DstTy) return UnableToLegalize; + Align Alignment = LoadMI.getAlign(); + if (Alignment.value() * 8 > MemSizeInBits && + isPowerOf2_64(DstTy.getScalarSizeInBits())) { + LLT MoreTy = LLT::fixed_vector(NextPowerOf2(DstTy.getNumElements()), + DstTy.getElementType()); + MachineMemOperand *NewMMO = MF.getMachineMemOperand(&MMO, 0, MoreTy); + auto NewLoad = MIRBuilder.buildLoad(MoreTy, PtrReg, *NewMMO); + MIRBuilder.buildDeleteTrailingVectorElements(LoadMI.getReg(0), + NewLoad.getReg(0)); + LoadMI.eraseFromParent(); + return Legalized; + } + // TODO: We can do better than scalarizing the vector and at least split it // in half. return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType()); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-range.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-range.mir new file mode 100644 index 0000000000000..5611642a13649 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-range.mir @@ -0,0 +1,46 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel -o - %s | FileCheck %s + +--- | + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" + target triple = "aarch64" + + define <3 x i16> @range_v3i16(ptr %a_ptr, ptr %b_ptr) { + %a = load <3 x i16>, ptr %a_ptr, align 8, !range !0, !noundef !1 + %b = load <3 x i16>, ptr %b_ptr, align 8, !range !2, !noundef !1 + %result = add <3 x i16> %a, %b + ret <3 x i16> %result + } + + !0 = !{i16 16, i16 17} + !1 = !{} + !2 = !{i16 32, i16 33} +... +--- +name: range_v3i16 +body: | + bb.1 (%ir-block.0): + liveins: $x0, $x1 + ; Make sure we drop the range metadata when widening an aligned load. + + ; CHECK-LABEL: name: range_v3i16 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>) from %ir.a_ptr) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY1]](p0) :: (load (<4 x s16>) from %ir.b_ptr) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<4 x s16>) = G_ADD [[LOAD]], [[LOAD1]] + ; CHECK-NEXT: $d0 = COPY [[ADD]](<4 x s16>) + ; CHECK-NEXT: RET_ReallyLR implicit $d0 + %0:_(p0) = COPY $x0 + %1:_(p0) = COPY $x1 + %2:_(<3 x s16>) = G_LOAD %0(p0) :: (load (<3 x s16>) from %ir.a_ptr, align 8, !range !0) + %3:_(<3 x s16>) = G_LOAD %1(p0) :: (load (<3 x s16>) from %ir.b_ptr, align 8, !range !2) + %4:_(<3 x s16>) = G_ADD %2, %3 + %5:_(s16), %6:_(s16), %7:_(s16) = G_UNMERGE_VALUES %4(<3 x s16>) + %8:_(s16) = G_IMPLICIT_DEF + %9:_(<4 x s16>) = G_BUILD_VECTOR %5(s16), %6(s16), %7(s16), %8(s16) + $d0 = COPY %9(<4 x s16>) + RET_ReallyLR implicit $d0 +... diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index d5bd1b712a2a6..96168cb80196f 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -110,16 +110,20 @@ define void @v3i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: v3i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x1] -; CHECK-GI-NEXT: ldrb w10, [x0, #1] -; CHECK-GI-NEXT: ldrb w11, [x1, #1] +; CHECK-GI-NEXT: ldr w8, [x0] +; CHECK-GI-NEXT: ldr w9, [x1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: ldrb w8, [x0, #2] -; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], w10 -; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v1.b[2] +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s5 ; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: mov v1.h[2], w9 ; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h @@ -270,16 +274,10 @@ define void @v3i16(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: ldr d0, [x0] +; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: add x9, x1, #2 -; CHECK-GI-NEXT: add x10, x1, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] -; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] ; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index f7df1092287bd..a7875dbebd0e6 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -302,16 +302,20 @@ define void @and_v3i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: and_v3i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x1] -; CHECK-GI-NEXT: ldrb w10, [x0, #1] -; CHECK-GI-NEXT: ldrb w11, [x1, #1] +; CHECK-GI-NEXT: ldr w8, [x0] +; CHECK-GI-NEXT: ldr w9, [x1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: ldrb w8, [x0, #2] -; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], w10 -; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v1.b[2] +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s5 ; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: mov v1.h[2], w9 ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b @@ -350,16 +354,20 @@ define void @or_v3i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: or_v3i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x1] -; CHECK-GI-NEXT: ldrb w10, [x0, #1] -; CHECK-GI-NEXT: ldrb w11, [x1, #1] +; CHECK-GI-NEXT: ldr w8, [x0] +; CHECK-GI-NEXT: ldr w9, [x1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: ldrb w8, [x0, #2] -; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], w10 -; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v1.b[2] +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s5 ; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: mov v1.h[2], w9 ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b @@ -398,16 +406,20 @@ define void @xor_v3i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: xor_v3i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x1] -; CHECK-GI-NEXT: ldrb w10, [x0, #1] -; CHECK-GI-NEXT: ldrb w11, [x1, #1] +; CHECK-GI-NEXT: ldr w8, [x0] +; CHECK-GI-NEXT: ldr w9, [x1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: ldrb w8, [x0, #2] -; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], w10 -; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v1.b[2] +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s5 ; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: mov v1.h[2], w9 ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b @@ -805,16 +817,10 @@ define void @and_v3i16(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: and_v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: ldr d0, [x0] +; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: add x9, x1, #2 -; CHECK-GI-NEXT: add x10, x1, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] -; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] ; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] @@ -842,16 +848,10 @@ define void @or_v3i16(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: or_v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: ldr d0, [x0] +; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: add x9, x1, #2 -; CHECK-GI-NEXT: add x10, x1, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] -; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] ; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] @@ -879,16 +879,10 @@ define void @xor_v3i16(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: xor_v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: ldr d0, [x0] +; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: add x9, x1, #2 -; CHECK-GI-NEXT: add x10, x1, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] -; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] ; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll index b1c6e24c30a7d..04124609eec74 100644 --- a/llvm/test/CodeGen/AArch64/ctlz.ll +++ b/llvm/test/CodeGen/AArch64/ctlz.ll @@ -56,12 +56,16 @@ define void @v3i8(ptr %p1) { ; ; CHECK-GI-LABEL: v3i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr b0, [x0] -; CHECK-GI-NEXT: add x8, x0, #1 +; CHECK-GI-NEXT: ldr w8, [x0] ; CHECK-GI-NEXT: add x9, x0, #2 -; CHECK-GI-NEXT: ld1 { v0.b }[1], [x8] -; CHECK-GI-NEXT: ld1 { v0.b }[2], [x9] -; CHECK-GI-NEXT: clz v0.8b, v0.8b +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: add x8, x0, #1 +; CHECK-GI-NEXT: mov b1, v0.b[1] +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b0, v0.b[2] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v0.b[0] +; CHECK-GI-NEXT: clz v0.8b, v2.8b ; CHECK-GI-NEXT: st1 { v0.b }[0], [x0] ; CHECK-GI-NEXT: st1 { v0.b }[1], [x8] ; CHECK-GI-NEXT: st1 { v0.b }[2], [x9] @@ -181,11 +185,9 @@ define void @v3i16(ptr %p1) { ; ; CHECK-GI-LABEL: v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr d0, [x0] ; CHECK-GI-NEXT: add x8, x0, #2 ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] ; CHECK-GI-NEXT: clz v0.4h, v0.4h ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll index 55f75b6bc3f27..c739be95cd243 100644 --- a/llvm/test/CodeGen/AArch64/ctpop.ll +++ b/llvm/test/CodeGen/AArch64/ctpop.ll @@ -55,12 +55,16 @@ define void @v3i8(ptr %p1) { ; ; CHECK-GI-LABEL: v3i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr b0, [x0] -; CHECK-GI-NEXT: add x8, x0, #1 +; CHECK-GI-NEXT: ldr w8, [x0] ; CHECK-GI-NEXT: add x9, x0, #2 -; CHECK-GI-NEXT: ld1 { v0.b }[1], [x8] -; CHECK-GI-NEXT: ld1 { v0.b }[2], [x9] -; CHECK-GI-NEXT: cnt v0.8b, v0.8b +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: add x8, x0, #1 +; CHECK-GI-NEXT: mov b1, v0.b[1] +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b0, v0.b[2] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v0.b[0] +; CHECK-GI-NEXT: cnt v0.8b, v2.8b ; CHECK-GI-NEXT: st1 { v0.b }[0], [x0] ; CHECK-GI-NEXT: st1 { v0.b }[1], [x8] ; CHECK-GI-NEXT: st1 { v0.b }[2], [x9] @@ -181,11 +185,9 @@ define void @v3i16(ptr %p1) { ; ; CHECK-GI-LABEL: v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr d0, [x0] ; CHECK-GI-NEXT: add x8, x0, #2 ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] ; CHECK-GI-NEXT: cnt v0.8b, v0.8b ; CHECK-GI-NEXT: uaddlp v0.4h, v0.8b ; CHECK-GI-NEXT: str h0, [x0] diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll index 93ac97e20dabd..fc9bf2c0aca65 100644 --- a/llvm/test/CodeGen/AArch64/cttz.ll +++ b/llvm/test/CodeGen/AArch64/cttz.ll @@ -68,21 +68,23 @@ define void @v3i8(ptr %p1) { ; ; CHECK-GI-LABEL: v3i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldrb w9, [x0] +; CHECK-GI-NEXT: ldr w9, [x0] ; CHECK-GI-NEXT: mov w8, #65535 // =0xffff -; CHECK-GI-NEXT: ldrb w10, [x0, #1] -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: ldrb w9, [x0, #2] -; CHECK-GI-NEXT: mov v0.h[1], w8 -; CHECK-GI-NEXT: mov v1.h[1], w10 -; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: fmov s0, w9 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov b1, v0.b[1] +; CHECK-GI-NEXT: mov v2.h[2], w8 ; CHECK-GI-NEXT: add x8, x0, #1 -; CHECK-GI-NEXT: mov v1.h[2], w9 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov b1, v0.b[2] +; CHECK-GI-NEXT: mov v0.h[1], w9 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov v0.h[2], w9 ; CHECK-GI-NEXT: add x9, x0, #2 -; CHECK-GI-NEXT: eor v2.8b, v1.8b, v0.8b -; CHECK-GI-NEXT: add v0.4h, v1.4h, v0.4h -; CHECK-GI-NEXT: and v0.8b, v2.8b, v0.8b +; CHECK-GI-NEXT: eor v1.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: add v0.4h, v0.4h, v2.4h +; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b ; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: cnt v0.8b, v0.8b ; CHECK-GI-NEXT: st1 { v0.b }[0], [x0] @@ -275,22 +277,20 @@ define void @v3i16(ptr %p1) { ; CHECK-GI-LABEL: v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #65535 // =0xffff -; CHECK-GI-NEXT: ldr h1, [x0] -; CHECK-GI-NEXT: add x9, x0, #2 +; CHECK-GI-NEXT: ldr d1, [x0] +; CHECK-GI-NEXT: add x9, x0, #4 ; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: add x10, x0, #4 -; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: mov v0.h[1], w8 -; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] ; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: add x8, x0, #2 ; CHECK-GI-NEXT: eor v2.8b, v1.8b, v0.8b ; CHECK-GI-NEXT: add v0.4h, v1.4h, v0.4h ; CHECK-GI-NEXT: and v0.8b, v2.8b, v0.8b ; CHECK-GI-NEXT: cnt v0.8b, v0.8b ; CHECK-GI-NEXT: uaddlp v0.4h, v0.8b ; CHECK-GI-NEXT: str h0, [x0] -; CHECK-GI-NEXT: st1 { v0.h }[1], [x9] -; CHECK-GI-NEXT: st1 { v0.h }[2], [x10] +; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] ; CHECK-GI-NEXT: ret entry: %d = load <3 x i16>, ptr %p1 diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll index 6b26ae98a4ed8..c4bb6e37d6eaf 100644 --- a/llvm/test/CodeGen/AArch64/load.ll +++ b/llvm/test/CodeGen/AArch64/load.ll @@ -335,102 +335,50 @@ define <3 x i8> @load_v3i8(ptr %ptr) { ; ; CHECK-GI-LABEL: load_v3i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w1, [x0, #1] -; CHECK-GI-NEXT: ldrb w2, [x0, #2] -; CHECK-GI-NEXT: mov w0, w8 +; CHECK-GI-NEXT: ldr w8, [x0] +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov b1, v0.b[1] +; CHECK-GI-NEXT: mov b2, v0.b[2] +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: fmov w1, s1 +; CHECK-GI-NEXT: fmov w2, s2 ; CHECK-GI-NEXT: ret %a = load <3 x i8>, ptr %ptr ret <3 x i8> %a } define <7 x i8> @load_v7i8(ptr %ptr) { -; CHECK-SD-LABEL: load_v7i8: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldr d0, [x0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: load_v7i8: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr b0, [x0] -; CHECK-GI-NEXT: add x8, x0, #1 -; CHECK-GI-NEXT: mov v0.b[0], v0.b[0] -; CHECK-GI-NEXT: ld1 { v0.b }[1], [x8] -; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ld1 { v0.b }[2], [x8] -; CHECK-GI-NEXT: add x8, x0, #3 -; CHECK-GI-NEXT: ld1 { v0.b }[3], [x8] -; CHECK-GI-NEXT: add x8, x0, #4 -; CHECK-GI-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-GI-NEXT: add x8, x0, #5 -; CHECK-GI-NEXT: ld1 { v0.b }[5], [x8] -; CHECK-GI-NEXT: add x8, x0, #6 -; CHECK-GI-NEXT: ld1 { v0.b }[6], [x8] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: load_v7i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret %a = load <7 x i8>, ptr %ptr ret <7 x i8> %a } define <3 x i16> @load_v3i16(ptr %ptr) { -; CHECK-SD-LABEL: load_v3i16: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldr d0, [x0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: load_v3i16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: add x8, x0, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: load_v3i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret %a = load <3 x i16>, ptr %ptr ret <3 x i16> %a } define <7 x i16> @load_v7i16(ptr %ptr) { -; CHECK-SD-LABEL: load_v7i16: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldr q0, [x0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: load_v7i16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: add x8, x0, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-GI-NEXT: add x8, x0, #6 -; CHECK-GI-NEXT: ld1 { v0.h }[3], [x8] -; CHECK-GI-NEXT: add x8, x0, #8 -; CHECK-GI-NEXT: ld1 { v0.h }[4], [x8] -; CHECK-GI-NEXT: add x8, x0, #10 -; CHECK-GI-NEXT: ld1 { v0.h }[5], [x8] -; CHECK-GI-NEXT: add x8, x0, #12 -; CHECK-GI-NEXT: ld1 { v0.h }[6], [x8] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: load_v7i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret %a = load <7 x i16>, ptr %ptr ret <7 x i16> %a } define <3 x i32> @load_v3i32(ptr %ptr) { -; CHECK-SD-LABEL: load_v3i32: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldr q0, [x0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: load_v3i32: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr s0, [x0] -; CHECK-GI-NEXT: add x8, x0, #4 -; CHECK-GI-NEXT: ld1 { v0.s }[1], [x8] -; CHECK-GI-NEXT: add x8, x0, #8 -; CHECK-GI-NEXT: ld1 { v0.s }[2], [x8] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: load_v3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret %a = load <3 x i32>, ptr %ptr ret <3 x i32> %a } diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 1558043f7f40a..9c69a6f03b858 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -122,16 +122,20 @@ define void @v3i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: v3i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x1] -; CHECK-GI-NEXT: ldrb w10, [x0, #1] -; CHECK-GI-NEXT: ldrb w11, [x1, #1] +; CHECK-GI-NEXT: ldr w8, [x0] +; CHECK-GI-NEXT: ldr w9, [x1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: ldrb w8, [x0, #2] -; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], w10 -; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v1.b[2] +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s5 ; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: mov v1.h[2], w9 ; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h @@ -282,16 +286,10 @@ define void @v3i16(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: ldr d0, [x0] +; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: add x9, x1, #2 -; CHECK-GI-NEXT: add x10, x1, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] -; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] ; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index a534112b7c559..4f0c4080aa0ce 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -412,31 +412,33 @@ define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, ; ; CHECK-GI-LABEL: test_udot_v5i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldrb w8, [x0, #4] -; CHECK-GI-NEXT: ldrb w9, [x1, #4] -; CHECK-GI-NEXT: ldrb w10, [x1] -; CHECK-GI-NEXT: ldrb w11, [x0, #1] -; CHECK-GI-NEXT: ldrb w12, [x1, #1] -; CHECK-GI-NEXT: mul w8, w9, w8 -; CHECK-GI-NEXT: ldrb w9, [x0] -; CHECK-GI-NEXT: fmov s0, w10 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: mov v0.s[1], w12 -; CHECK-GI-NEXT: ldrb w8, [x0, #2] -; CHECK-GI-NEXT: mov v1.s[1], w11 -; CHECK-GI-NEXT: mov v2.s[1], wzr -; CHECK-GI-NEXT: mov v0.s[2], w9 -; CHECK-GI-NEXT: ldrb w9, [x1, #3] -; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: ldrb w8, [x0, #3] -; CHECK-GI-NEXT: mov v2.s[2], wzr -; CHECK-GI-NEXT: mov v0.s[3], w9 -; CHECK-GI-NEXT: mov v1.s[3], w8 -; CHECK-GI-NEXT: mov v2.s[3], wzr -; CHECK-GI-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: addv s0, v2.4s +; CHECK-GI-NEXT: ldr d0, [x0] +; CHECK-GI-NEXT: ldr d1, [x1] +; CHECK-GI-NEXT: umov w8, v1.b[4] +; CHECK-GI-NEXT: umov w9, v0.b[4] +; CHECK-GI-NEXT: umov w10, v1.b[0] +; CHECK-GI-NEXT: umov w12, v0.b[0] +; CHECK-GI-NEXT: umov w11, v1.b[1] +; CHECK-GI-NEXT: umov w13, v0.b[1] +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: fmov s2, w10 +; CHECK-GI-NEXT: umov w9, v1.b[2] +; CHECK-GI-NEXT: fmov s3, w12 +; CHECK-GI-NEXT: umov w10, v1.b[3] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v2.s[1], w11 +; CHECK-GI-NEXT: umov w8, v0.b[2] +; CHECK-GI-NEXT: mov v3.s[1], w13 +; CHECK-GI-NEXT: umov w11, v0.b[3] +; CHECK-GI-NEXT: mov v4.s[1], wzr +; CHECK-GI-NEXT: mov v2.s[2], w9 +; CHECK-GI-NEXT: mov v3.s[2], w8 +; CHECK-GI-NEXT: mov v4.s[2], wzr +; CHECK-GI-NEXT: mov v2.s[3], w10 +; CHECK-GI-NEXT: mov v3.s[3], w11 +; CHECK-GI-NEXT: mov v4.s[3], wzr +; CHECK-GI-NEXT: mla v4.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: addv s0, v4.4s ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: add w0, w8, w2 ; CHECK-GI-NEXT: ret @@ -466,20 +468,21 @@ define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) { ; ; CHECK-GI-LABEL: test_udot_v5i8_nomla: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x0, #4] -; CHECK-GI-NEXT: ldrb w10, [x0, #1] -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: ldrb w8, [x0, #2] -; CHECK-GI-NEXT: mov v0.s[1], w10 -; CHECK-GI-NEXT: mov v1.s[1], wzr -; CHECK-GI-NEXT: mov v0.s[2], w8 -; CHECK-GI-NEXT: mov v1.s[2], wzr -; CHECK-GI-NEXT: ldrb w8, [x0, #3] -; CHECK-GI-NEXT: mov v0.s[3], w8 -; CHECK-GI-NEXT: mov v1.s[3], wzr -; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: ldr d0, [x0] +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[4] +; CHECK-GI-NEXT: umov w10, v0.b[1] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: umov w8, v0.b[2] +; CHECK-GI-NEXT: umov w9, v0.b[3] +; CHECK-GI-NEXT: mov v1.s[1], w10 +; CHECK-GI-NEXT: mov v2.s[1], wzr +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v2.s[2], wzr +; CHECK-GI-NEXT: mov v1.s[3], w9 +; CHECK-GI-NEXT: mov v2.s[3], wzr +; CHECK-GI-NEXT: add v0.4s, v1.4s, v2.4s ; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: fmov w0, s0 ; CHECK-GI-NEXT: ret @@ -506,31 +509,33 @@ define i32 @test_sdot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, ; ; CHECK-GI-LABEL: test_sdot_v5i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldrsb w8, [x0, #4] -; CHECK-GI-NEXT: ldrsb w9, [x1, #4] -; CHECK-GI-NEXT: ldrsb w10, [x1] -; CHECK-GI-NEXT: ldrsb w11, [x0, #1] -; CHECK-GI-NEXT: ldrsb w12, [x1, #1] -; CHECK-GI-NEXT: mul w8, w9, w8 -; CHECK-GI-NEXT: ldrsb w9, [x0] -; CHECK-GI-NEXT: fmov s0, w10 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: ldrsb w9, [x1, #2] -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: mov v0.s[1], w12 -; CHECK-GI-NEXT: ldrsb w8, [x0, #2] -; CHECK-GI-NEXT: mov v1.s[1], w11 -; CHECK-GI-NEXT: mov v2.s[1], wzr -; CHECK-GI-NEXT: mov v0.s[2], w9 -; CHECK-GI-NEXT: ldrsb w9, [x1, #3] -; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: ldrsb w8, [x0, #3] -; CHECK-GI-NEXT: mov v2.s[2], wzr -; CHECK-GI-NEXT: mov v0.s[3], w9 -; CHECK-GI-NEXT: mov v1.s[3], w8 -; CHECK-GI-NEXT: mov v2.s[3], wzr -; CHECK-GI-NEXT: mla v2.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: addv s0, v2.4s +; CHECK-GI-NEXT: ldr d0, [x0] +; CHECK-GI-NEXT: ldr d1, [x1] +; CHECK-GI-NEXT: smov w8, v1.b[4] +; CHECK-GI-NEXT: smov w9, v0.b[4] +; CHECK-GI-NEXT: smov w10, v1.b[0] +; CHECK-GI-NEXT: smov w12, v0.b[0] +; CHECK-GI-NEXT: smov w11, v1.b[1] +; CHECK-GI-NEXT: smov w13, v0.b[1] +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: fmov s2, w10 +; CHECK-GI-NEXT: smov w9, v1.b[2] +; CHECK-GI-NEXT: fmov s3, w12 +; CHECK-GI-NEXT: smov w10, v1.b[3] +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v2.s[1], w11 +; CHECK-GI-NEXT: smov w8, v0.b[2] +; CHECK-GI-NEXT: mov v3.s[1], w13 +; CHECK-GI-NEXT: smov w11, v0.b[3] +; CHECK-GI-NEXT: mov v4.s[1], wzr +; CHECK-GI-NEXT: mov v2.s[2], w9 +; CHECK-GI-NEXT: mov v3.s[2], w8 +; CHECK-GI-NEXT: mov v4.s[2], wzr +; CHECK-GI-NEXT: mov v2.s[3], w10 +; CHECK-GI-NEXT: mov v3.s[3], w11 +; CHECK-GI-NEXT: mov v4.s[3], wzr +; CHECK-GI-NEXT: mla v4.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: addv s0, v4.4s ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: add w0, w8, w2 ; CHECK-GI-NEXT: ret @@ -2298,128 +2303,145 @@ define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; ; CHECK-GI-LABEL: test_udot_v25i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr q2, [x1] -; CHECK-GI-NEXT: ldrb w9, [x1, #16]! -; CHECK-GI-NEXT: ldrb w11, [x1, #4] -; CHECK-GI-NEXT: ldrb w12, [x1, #5] +; CHECK-GI-NEXT: stp x26, x25, [sp, #-64]! // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w24, -48 +; CHECK-GI-NEXT: .cfi_offset w25, -56 +; CHECK-GI-NEXT: .cfi_offset w26, -64 +; CHECK-GI-NEXT: ldp q1, q7, [x1] ; CHECK-GI-NEXT: fmov s0, wzr -; CHECK-GI-NEXT: umov w13, v2.b[4] -; CHECK-GI-NEXT: umov w14, v2.b[5] -; CHECK-GI-NEXT: umov w10, v2.b[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: umov w9, v2.b[8] -; CHECK-GI-NEXT: fmov s5, w11 -; CHECK-GI-NEXT: umov w11, v2.b[12] -; CHECK-GI-NEXT: ldr q1, [x0] -; CHECK-GI-NEXT: ldrb w8, [x1, #1] +; CHECK-GI-NEXT: ldp q16, q3, [x0] +; CHECK-GI-NEXT: umov w9, v1.b[4] +; CHECK-GI-NEXT: umov w11, v1.b[5] +; CHECK-GI-NEXT: umov w18, v1.b[0] +; CHECK-GI-NEXT: umov w0, v1.b[12] +; CHECK-GI-NEXT: umov w3, v7.b[4] +; CHECK-GI-NEXT: umov w12, v1.b[1] +; CHECK-GI-NEXT: umov w13, v1.b[6] +; CHECK-GI-NEXT: umov w1, v1.b[13] +; CHECK-GI-NEXT: umov w4, v7.b[5] +; CHECK-GI-NEXT: umov w15, v1.b[2] +; CHECK-GI-NEXT: umov w8, v1.b[3] +; CHECK-GI-NEXT: umov w16, v1.b[7] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: umov w14, v1.b[8] +; CHECK-GI-NEXT: umov w17, v1.b[9] +; CHECK-GI-NEXT: umov w10, v1.b[10] +; CHECK-GI-NEXT: umov w9, v1.b[11] +; CHECK-GI-NEXT: umov w5, v1.b[14] +; CHECK-GI-NEXT: umov w6, v7.b[0] +; CHECK-GI-NEXT: fmov s4, w0 +; CHECK-GI-NEXT: fmov s5, w3 +; CHECK-GI-NEXT: mov v2.s[1], w11 +; CHECK-GI-NEXT: umov w11, v1.b[15] +; CHECK-GI-NEXT: fmov s1, w18 +; CHECK-GI-NEXT: umov w7, v7.b[1] +; CHECK-GI-NEXT: umov w18, v7.b[6] +; CHECK-GI-NEXT: umov w21, v16.b[4] +; CHECK-GI-NEXT: mov v4.s[1], w1 +; CHECK-GI-NEXT: mov v5.s[1], w4 +; CHECK-GI-NEXT: fmov s6, w14 +; CHECK-GI-NEXT: mov v1.s[1], w12 +; CHECK-GI-NEXT: umov w12, v7.b[3] +; CHECK-GI-NEXT: umov w14, v7.b[7] +; CHECK-GI-NEXT: mov v2.s[2], w13 +; CHECK-GI-NEXT: umov w13, v7.b[2] +; CHECK-GI-NEXT: umov w0, v7.b[8] +; CHECK-GI-NEXT: fmov s7, w6 +; CHECK-GI-NEXT: umov w23, v16.b[12] +; CHECK-GI-NEXT: umov w25, v3.b[4] +; CHECK-GI-NEXT: mov v6.s[1], w17 +; CHECK-GI-NEXT: mov v4.s[2], w5 +; CHECK-GI-NEXT: mov v5.s[2], w18 +; CHECK-GI-NEXT: mov v1.s[2], w15 +; CHECK-GI-NEXT: umov w6, v16.b[0] +; CHECK-GI-NEXT: umov w3, v16.b[1] +; CHECK-GI-NEXT: mov v2.s[3], w16 +; CHECK-GI-NEXT: mov v7.s[1], w7 +; CHECK-GI-NEXT: umov w16, v16.b[2] +; CHECK-GI-NEXT: umov w15, v16.b[3] +; CHECK-GI-NEXT: umov w22, v16.b[5] +; CHECK-GI-NEXT: umov w5, v16.b[6] +; CHECK-GI-NEXT: umov w18, v16.b[7] +; CHECK-GI-NEXT: umov w19, v16.b[8] +; CHECK-GI-NEXT: umov w7, v16.b[9] +; CHECK-GI-NEXT: umov w24, v16.b[13] +; CHECK-GI-NEXT: umov w1, v16.b[10] +; CHECK-GI-NEXT: umov w17, v16.b[11] +; CHECK-GI-NEXT: umov w20, v16.b[14] +; CHECK-GI-NEXT: umov w4, v16.b[15] +; CHECK-GI-NEXT: fmov s16, w21 +; CHECK-GI-NEXT: umov w21, v3.b[8] +; CHECK-GI-NEXT: umov w26, v3.b[5] +; CHECK-GI-NEXT: fmov s17, w23 +; CHECK-GI-NEXT: umov w23, v3.b[0] +; CHECK-GI-NEXT: fmov s18, w25 +; CHECK-GI-NEXT: umov w25, v3.b[3] +; CHECK-GI-NEXT: mov v16.s[1], w22 +; CHECK-GI-NEXT: umov w22, v3.b[1] +; CHECK-GI-NEXT: fmov s19, w6 +; CHECK-GI-NEXT: mov v17.s[1], w24 +; CHECK-GI-NEXT: umov w24, v3.b[2] +; CHECK-GI-NEXT: umov w6, v3.b[7] +; CHECK-GI-NEXT: mul w0, w0, w21 +; CHECK-GI-NEXT: mov v18.s[1], w26 +; CHECK-GI-NEXT: umov w26, v3.b[6] +; CHECK-GI-NEXT: fmov s3, w19 +; CHECK-GI-NEXT: fmov s20, w23 +; CHECK-GI-NEXT: mov v19.s[1], w3 +; CHECK-GI-NEXT: mov v16.s[2], w5 ; CHECK-GI-NEXT: mov v0.s[1], wzr -; CHECK-GI-NEXT: fmov s7, w13 -; CHECK-GI-NEXT: fmov s4, w10 -; CHECK-GI-NEXT: umov w10, v2.b[13] -; CHECK-GI-NEXT: mov v5.s[1], w12 -; CHECK-GI-NEXT: umov w13, v2.b[9] -; CHECK-GI-NEXT: fmov s6, w9 -; CHECK-GI-NEXT: fmov s16, w11 -; CHECK-GI-NEXT: umov w9, v1.b[0] -; CHECK-GI-NEXT: mov v3.s[1], w8 -; CHECK-GI-NEXT: mov v7.s[1], w14 -; CHECK-GI-NEXT: umov w14, v2.b[6] -; CHECK-GI-NEXT: ldrb w12, [x1, #6] -; CHECK-GI-NEXT: umov w8, v2.b[1] -; CHECK-GI-NEXT: umov w11, v2.b[2] -; CHECK-GI-NEXT: mov v0.s[2], wzr -; CHECK-GI-NEXT: mov v16.s[1], w10 -; CHECK-GI-NEXT: umov w10, v2.b[14] -; CHECK-GI-NEXT: mov v5.s[2], w12 -; CHECK-GI-NEXT: umov w12, v1.b[5] -; CHECK-GI-NEXT: mov v6.s[1], w13 -; CHECK-GI-NEXT: fmov s17, w9 -; CHECK-GI-NEXT: mov v7.s[2], w14 -; CHECK-GI-NEXT: umov w14, v1.b[4] -; CHECK-GI-NEXT: umov w9, v2.b[10] -; CHECK-GI-NEXT: mov v4.s[1], w8 -; CHECK-GI-NEXT: umov w8, v1.b[1] -; CHECK-GI-NEXT: umov w13, v2.b[7] -; CHECK-GI-NEXT: mov v16.s[2], w10 -; CHECK-GI-NEXT: umov w10, v2.b[15] -; CHECK-GI-NEXT: mov v0.s[3], wzr -; CHECK-GI-NEXT: fmov s18, w14 -; CHECK-GI-NEXT: mov v6.s[2], w9 -; CHECK-GI-NEXT: umov w9, v1.b[12] -; CHECK-GI-NEXT: mov v4.s[2], w11 -; CHECK-GI-NEXT: ldrb w11, [x1, #7] -; CHECK-GI-NEXT: mov v17.s[1], w8 -; CHECK-GI-NEXT: ldrb w8, [x1, #2] -; CHECK-GI-NEXT: mov v16.s[3], w10 -; CHECK-GI-NEXT: umov w10, v1.b[13] -; CHECK-GI-NEXT: mov v18.s[1], w12 -; CHECK-GI-NEXT: umov w12, v1.b[6] -; CHECK-GI-NEXT: mov v5.s[3], w11 -; CHECK-GI-NEXT: ldrb w11, [x0, #16]! -; CHECK-GI-NEXT: mov v7.s[3], w13 -; CHECK-GI-NEXT: umov w13, v1.b[2] -; CHECK-GI-NEXT: fmov s20, w9 -; CHECK-GI-NEXT: ldrb w9, [x0, #5] -; CHECK-GI-NEXT: mov v3.s[2], w8 -; CHECK-GI-NEXT: umov w8, v1.b[8] -; CHECK-GI-NEXT: fmov s22, w11 -; CHECK-GI-NEXT: mov v18.s[2], w12 -; CHECK-GI-NEXT: ldrb w12, [x0, #4] -; CHECK-GI-NEXT: umov w11, v2.b[3] -; CHECK-GI-NEXT: mov v20.s[1], w10 -; CHECK-GI-NEXT: ldrb w10, [x0, #8] -; CHECK-GI-NEXT: fmov s21, w12 -; CHECK-GI-NEXT: ldrb w12, [x1, #8] -; CHECK-GI-NEXT: mov v17.s[2], w13 -; CHECK-GI-NEXT: umov w13, v1.b[9] -; CHECK-GI-NEXT: fmov s19, w8 -; CHECK-GI-NEXT: umov w8, v1.b[14] -; CHECK-GI-NEXT: mul w10, w12, w10 -; CHECK-GI-NEXT: umov w12, v1.b[7] +; CHECK-GI-NEXT: mov v6.s[2], w10 +; CHECK-GI-NEXT: fmov s21, w0 +; CHECK-GI-NEXT: mov v17.s[2], w20 ; CHECK-GI-NEXT: mov v4.s[3], w11 -; CHECK-GI-NEXT: mov v21.s[1], w9 -; CHECK-GI-NEXT: ldrb w9, [x0, #6] -; CHECK-GI-NEXT: mov v19.s[1], w13 -; CHECK-GI-NEXT: ldrb w13, [x0, #1] -; CHECK-GI-NEXT: mov v20.s[2], w8 -; CHECK-GI-NEXT: umov w8, v1.b[10] -; CHECK-GI-NEXT: mov v18.s[3], w12 -; CHECK-GI-NEXT: ldrb w12, [x0, #7] -; CHECK-GI-NEXT: mov v21.s[2], w9 -; CHECK-GI-NEXT: umov w9, v2.b[11] -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: ldrb w10, [x0, #2] -; CHECK-GI-NEXT: mov v22.s[1], w13 -; CHECK-GI-NEXT: umov w13, v1.b[15] -; CHECK-GI-NEXT: mov v2.s[1], wzr -; CHECK-GI-NEXT: mov v19.s[2], w8 -; CHECK-GI-NEXT: umov w8, v1.b[3] -; CHECK-GI-NEXT: mov v21.s[3], w12 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.s[1], w7 +; CHECK-GI-NEXT: mov v20.s[1], w22 +; CHECK-GI-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v18.s[2], w26 +; CHECK-GI-NEXT: mov v21.s[1], wzr +; CHECK-GI-NEXT: mov v16.s[3], w18 +; CHECK-GI-NEXT: mov v17.s[3], w4 +; CHECK-GI-NEXT: mov v7.s[2], w13 +; CHECK-GI-NEXT: mov v5.s[3], w14 +; CHECK-GI-NEXT: mov v19.s[2], w16 +; CHECK-GI-NEXT: mov v3.s[2], w1 +; CHECK-GI-NEXT: mov v0.s[2], wzr +; CHECK-GI-NEXT: mov v20.s[2], w24 +; CHECK-GI-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v18.s[3], w6 +; CHECK-GI-NEXT: mov v21.s[2], wzr +; CHECK-GI-NEXT: mul v2.4s, v2.4s, v16.4s +; CHECK-GI-NEXT: mul v4.4s, v4.4s, v17.4s +; CHECK-GI-NEXT: mov v1.s[3], w8 ; CHECK-GI-NEXT: mov v6.s[3], w9 -; CHECK-GI-NEXT: ldrb w9, [x0, #3] -; CHECK-GI-NEXT: mov v20.s[3], w13 -; CHECK-GI-NEXT: umov w13, v1.b[11] -; CHECK-GI-NEXT: mov v22.s[2], w10 -; CHECK-GI-NEXT: ldrb w10, [x1, #3] -; CHECK-GI-NEXT: mul v1.4s, v7.4s, v18.4s -; CHECK-GI-NEXT: mov v2.s[2], wzr -; CHECK-GI-NEXT: mov v17.s[3], w8 -; CHECK-GI-NEXT: mov v3.s[3], w10 -; CHECK-GI-NEXT: mul v5.4s, v5.4s, v21.4s -; CHECK-GI-NEXT: mov v19.s[3], w13 -; CHECK-GI-NEXT: mul v7.4s, v16.4s, v20.4s -; CHECK-GI-NEXT: mov v22.s[3], w9 -; CHECK-GI-NEXT: mov v2.s[3], wzr -; CHECK-GI-NEXT: mla v1.4s, v4.4s, v17.4s -; CHECK-GI-NEXT: mla v7.4s, v6.4s, v19.4s -; CHECK-GI-NEXT: mla v5.4s, v3.4s, v22.4s -; CHECK-GI-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-GI-NEXT: add v1.4s, v1.4s, v7.4s +; CHECK-GI-NEXT: mov v7.s[3], w12 +; CHECK-GI-NEXT: mov v19.s[3], w15 +; CHECK-GI-NEXT: mov v3.s[3], w17 +; CHECK-GI-NEXT: mov v20.s[3], w25 +; CHECK-GI-NEXT: mov v0.s[3], wzr +; CHECK-GI-NEXT: mul v5.4s, v5.4s, v18.4s +; CHECK-GI-NEXT: mov v21.s[3], wzr +; CHECK-GI-NEXT: mla v2.4s, v1.4s, v19.4s +; CHECK-GI-NEXT: mla v4.4s, v6.4s, v3.4s +; CHECK-GI-NEXT: mla v5.4s, v7.4s, v20.4s +; CHECK-GI-NEXT: add v0.4s, v21.4s, v0.4s +; CHECK-GI-NEXT: add v1.4s, v2.4s, v4.4s ; CHECK-GI-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: add w0, w8, w2 +; CHECK-GI-NEXT: ldp x26, x25, [sp], #64 // 16-byte Folded Reload ; CHECK-GI-NEXT: ret entry: %0 = load <25 x i8>, ptr %a @@ -2455,73 +2477,77 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) { ; ; CHECK-GI-LABEL: test_udot_v25i8_nomla: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr q1, [x0] -; CHECK-GI-NEXT: ldrb w17, [x0, #16]! -; CHECK-GI-NEXT: ldrb w16, [x0, #4] -; CHECK-GI-NEXT: ldrb w14, [x0, #8] +; CHECK-GI-NEXT: str x19, [sp, #-16]! // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 +; CHECK-GI-NEXT: .cfi_offset w19, -16 +; CHECK-GI-NEXT: ldp q2, q1, [x0] ; CHECK-GI-NEXT: fmov s0, wzr -; CHECK-GI-NEXT: umov w15, v1.b[0] -; CHECK-GI-NEXT: umov w2, v1.b[4] -; CHECK-GI-NEXT: umov w4, v1.b[8] -; CHECK-GI-NEXT: umov w5, v1.b[12] -; CHECK-GI-NEXT: umov w1, v1.b[1] -; CHECK-GI-NEXT: umov w3, v1.b[5] -; CHECK-GI-NEXT: umov w6, v1.b[9] -; CHECK-GI-NEXT: umov w7, v1.b[13] -; CHECK-GI-NEXT: fmov s6, w17 -; CHECK-GI-NEXT: fmov s7, w16 -; CHECK-GI-NEXT: fmov s16, w14 -; CHECK-GI-NEXT: ldrb w18, [x0, #1] -; CHECK-GI-NEXT: fmov s2, w15 -; CHECK-GI-NEXT: fmov s3, w2 -; CHECK-GI-NEXT: ldrb w11, [x0, #5] -; CHECK-GI-NEXT: fmov s4, w4 -; CHECK-GI-NEXT: fmov s5, w5 -; CHECK-GI-NEXT: ldrb w16, [x0, #2] -; CHECK-GI-NEXT: umov w9, v1.b[2] -; CHECK-GI-NEXT: umov w12, v1.b[6] -; CHECK-GI-NEXT: ldrb w17, [x0, #6] -; CHECK-GI-NEXT: umov w13, v1.b[10] -; CHECK-GI-NEXT: umov w15, v1.b[14] -; CHECK-GI-NEXT: mov v2.s[1], w1 -; CHECK-GI-NEXT: mov v3.s[1], w3 -; CHECK-GI-NEXT: mov v4.s[1], w6 -; CHECK-GI-NEXT: mov v5.s[1], w7 -; CHECK-GI-NEXT: mov v6.s[1], w18 -; CHECK-GI-NEXT: mov v7.s[1], w11 -; CHECK-GI-NEXT: mov v16.s[1], wzr +; CHECK-GI-NEXT: umov w15, v2.b[0] +; CHECK-GI-NEXT: umov w17, v2.b[4] +; CHECK-GI-NEXT: umov w0, v2.b[8] +; CHECK-GI-NEXT: umov w2, v2.b[12] +; CHECK-GI-NEXT: umov w4, v1.b[0] +; CHECK-GI-NEXT: umov w6, v1.b[4] +; CHECK-GI-NEXT: umov w19, v1.b[8] +; CHECK-GI-NEXT: umov w16, v2.b[1] +; CHECK-GI-NEXT: umov w18, v2.b[5] +; CHECK-GI-NEXT: umov w1, v2.b[9] +; CHECK-GI-NEXT: umov w3, v2.b[13] +; CHECK-GI-NEXT: umov w5, v1.b[1] +; CHECK-GI-NEXT: umov w7, v1.b[5] +; CHECK-GI-NEXT: fmov s3, w15 +; CHECK-GI-NEXT: fmov s4, w17 +; CHECK-GI-NEXT: fmov s5, w0 +; CHECK-GI-NEXT: fmov s6, w2 +; CHECK-GI-NEXT: fmov s7, w4 +; CHECK-GI-NEXT: fmov s16, w6 +; CHECK-GI-NEXT: fmov s17, w19 +; CHECK-GI-NEXT: umov w10, v2.b[2] +; CHECK-GI-NEXT: umov w11, v2.b[6] +; CHECK-GI-NEXT: umov w12, v2.b[10] +; CHECK-GI-NEXT: umov w13, v2.b[14] +; CHECK-GI-NEXT: umov w14, v1.b[2] +; CHECK-GI-NEXT: umov w15, v1.b[6] +; CHECK-GI-NEXT: mov v3.s[1], w16 +; CHECK-GI-NEXT: mov v4.s[1], w18 +; CHECK-GI-NEXT: mov v5.s[1], w1 +; CHECK-GI-NEXT: mov v6.s[1], w3 +; CHECK-GI-NEXT: mov v7.s[1], w5 +; CHECK-GI-NEXT: mov v16.s[1], w7 +; CHECK-GI-NEXT: mov v17.s[1], wzr ; CHECK-GI-NEXT: mov v0.s[1], wzr -; CHECK-GI-NEXT: umov w8, v1.b[3] -; CHECK-GI-NEXT: umov w10, v1.b[7] -; CHECK-GI-NEXT: umov w11, v1.b[11] -; CHECK-GI-NEXT: umov w14, v1.b[15] -; CHECK-GI-NEXT: mov v2.s[2], w9 -; CHECK-GI-NEXT: ldrb w9, [x0, #3] -; CHECK-GI-NEXT: mov v3.s[2], w12 -; CHECK-GI-NEXT: ldrb w12, [x0, #7] -; CHECK-GI-NEXT: mov v4.s[2], w13 -; CHECK-GI-NEXT: mov v5.s[2], w15 -; CHECK-GI-NEXT: mov v6.s[2], w16 -; CHECK-GI-NEXT: mov v7.s[2], w17 -; CHECK-GI-NEXT: mov v16.s[2], wzr +; CHECK-GI-NEXT: umov w8, v2.b[3] +; CHECK-GI-NEXT: umov w9, v2.b[7] +; CHECK-GI-NEXT: umov w16, v2.b[11] +; CHECK-GI-NEXT: umov w17, v2.b[15] +; CHECK-GI-NEXT: umov w18, v1.b[3] +; CHECK-GI-NEXT: umov w0, v1.b[7] +; CHECK-GI-NEXT: mov v3.s[2], w10 +; CHECK-GI-NEXT: mov v4.s[2], w11 +; CHECK-GI-NEXT: mov v5.s[2], w12 +; CHECK-GI-NEXT: mov v6.s[2], w13 +; CHECK-GI-NEXT: mov v7.s[2], w14 +; CHECK-GI-NEXT: mov v16.s[2], w15 +; CHECK-GI-NEXT: mov v17.s[2], wzr ; CHECK-GI-NEXT: mov v0.s[2], wzr -; CHECK-GI-NEXT: mov v2.s[3], w8 -; CHECK-GI-NEXT: mov v3.s[3], w10 -; CHECK-GI-NEXT: mov v4.s[3], w11 -; CHECK-GI-NEXT: mov v5.s[3], w14 -; CHECK-GI-NEXT: mov v6.s[3], w9 -; CHECK-GI-NEXT: mov v7.s[3], w12 -; CHECK-GI-NEXT: mov v16.s[3], wzr +; CHECK-GI-NEXT: mov v3.s[3], w8 +; CHECK-GI-NEXT: mov v4.s[3], w9 +; CHECK-GI-NEXT: mov v5.s[3], w16 +; CHECK-GI-NEXT: mov v6.s[3], w17 +; CHECK-GI-NEXT: mov v7.s[3], w18 +; CHECK-GI-NEXT: mov v16.s[3], w0 +; CHECK-GI-NEXT: mov v17.s[3], wzr ; CHECK-GI-NEXT: mov v0.s[3], wzr -; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s -; CHECK-GI-NEXT: add v2.4s, v4.4s, v5.4s -; CHECK-GI-NEXT: add v3.4s, v6.4s, v7.4s -; CHECK-GI-NEXT: add v0.4s, v16.4s, v0.4s +; CHECK-GI-NEXT: add v1.4s, v3.4s, v4.4s +; CHECK-GI-NEXT: add v2.4s, v5.4s, v6.4s +; CHECK-GI-NEXT: add v3.4s, v7.4s, v16.4s +; CHECK-GI-NEXT: add v0.4s, v17.4s, v0.4s ; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s ; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload ; CHECK-GI-NEXT: ret entry: %0 = load <25 x i8>, ptr %a1 @@ -2554,128 +2580,145 @@ define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b ; ; CHECK-GI-LABEL: test_sdot_v25i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr q2, [x1] -; CHECK-GI-NEXT: ldrsb w9, [x1, #16]! -; CHECK-GI-NEXT: ldrsb w11, [x1, #4] -; CHECK-GI-NEXT: ldrsb w12, [x1, #5] +; CHECK-GI-NEXT: stp x26, x25, [sp, #-64]! // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w21, -24 +; CHECK-GI-NEXT: .cfi_offset w22, -32 +; CHECK-GI-NEXT: .cfi_offset w23, -40 +; CHECK-GI-NEXT: .cfi_offset w24, -48 +; CHECK-GI-NEXT: .cfi_offset w25, -56 +; CHECK-GI-NEXT: .cfi_offset w26, -64 +; CHECK-GI-NEXT: ldp q1, q7, [x1] ; CHECK-GI-NEXT: fmov s0, wzr -; CHECK-GI-NEXT: smov w13, v2.b[4] -; CHECK-GI-NEXT: smov w14, v2.b[5] -; CHECK-GI-NEXT: smov w10, v2.b[0] -; CHECK-GI-NEXT: fmov s3, w9 -; CHECK-GI-NEXT: smov w9, v2.b[8] -; CHECK-GI-NEXT: fmov s5, w11 -; CHECK-GI-NEXT: smov w11, v2.b[12] -; CHECK-GI-NEXT: ldr q1, [x0] -; CHECK-GI-NEXT: ldrsb w8, [x1, #1] +; CHECK-GI-NEXT: ldp q16, q3, [x0] +; CHECK-GI-NEXT: smov w9, v1.b[4] +; CHECK-GI-NEXT: smov w11, v1.b[5] +; CHECK-GI-NEXT: smov w18, v1.b[0] +; CHECK-GI-NEXT: smov w0, v1.b[12] +; CHECK-GI-NEXT: smov w3, v7.b[4] +; CHECK-GI-NEXT: smov w12, v1.b[1] +; CHECK-GI-NEXT: smov w13, v1.b[6] +; CHECK-GI-NEXT: smov w1, v1.b[13] +; CHECK-GI-NEXT: smov w4, v7.b[5] +; CHECK-GI-NEXT: smov w15, v1.b[2] +; CHECK-GI-NEXT: smov w8, v1.b[3] +; CHECK-GI-NEXT: smov w16, v1.b[7] +; CHECK-GI-NEXT: fmov s2, w9 +; CHECK-GI-NEXT: smov w14, v1.b[8] +; CHECK-GI-NEXT: smov w17, v1.b[9] +; CHECK-GI-NEXT: smov w10, v1.b[10] +; CHECK-GI-NEXT: smov w9, v1.b[11] +; CHECK-GI-NEXT: smov w5, v1.b[14] +; CHECK-GI-NEXT: smov w6, v7.b[0] +; CHECK-GI-NEXT: fmov s4, w0 +; CHECK-GI-NEXT: fmov s5, w3 +; CHECK-GI-NEXT: mov v2.s[1], w11 +; CHECK-GI-NEXT: smov w11, v1.b[15] +; CHECK-GI-NEXT: fmov s1, w18 +; CHECK-GI-NEXT: smov w7, v7.b[1] +; CHECK-GI-NEXT: smov w18, v7.b[6] +; CHECK-GI-NEXT: smov w21, v16.b[4] +; CHECK-GI-NEXT: mov v4.s[1], w1 +; CHECK-GI-NEXT: mov v5.s[1], w4 +; CHECK-GI-NEXT: fmov s6, w14 +; CHECK-GI-NEXT: mov v1.s[1], w12 +; CHECK-GI-NEXT: smov w12, v7.b[3] +; CHECK-GI-NEXT: smov w14, v7.b[7] +; CHECK-GI-NEXT: mov v2.s[2], w13 +; CHECK-GI-NEXT: smov w13, v7.b[2] +; CHECK-GI-NEXT: smov w0, v7.b[8] +; CHECK-GI-NEXT: fmov s7, w6 +; CHECK-GI-NEXT: smov w23, v16.b[12] +; CHECK-GI-NEXT: smov w25, v3.b[4] +; CHECK-GI-NEXT: mov v6.s[1], w17 +; CHECK-GI-NEXT: mov v4.s[2], w5 +; CHECK-GI-NEXT: mov v5.s[2], w18 +; CHECK-GI-NEXT: mov v1.s[2], w15 +; CHECK-GI-NEXT: smov w6, v16.b[0] +; CHECK-GI-NEXT: smov w3, v16.b[1] +; CHECK-GI-NEXT: mov v2.s[3], w16 +; CHECK-GI-NEXT: mov v7.s[1], w7 +; CHECK-GI-NEXT: smov w16, v16.b[2] +; CHECK-GI-NEXT: smov w15, v16.b[3] +; CHECK-GI-NEXT: smov w22, v16.b[5] +; CHECK-GI-NEXT: smov w5, v16.b[6] +; CHECK-GI-NEXT: smov w18, v16.b[7] +; CHECK-GI-NEXT: smov w19, v16.b[8] +; CHECK-GI-NEXT: smov w7, v16.b[9] +; CHECK-GI-NEXT: smov w24, v16.b[13] +; CHECK-GI-NEXT: smov w1, v16.b[10] +; CHECK-GI-NEXT: smov w17, v16.b[11] +; CHECK-GI-NEXT: smov w20, v16.b[14] +; CHECK-GI-NEXT: smov w4, v16.b[15] +; CHECK-GI-NEXT: fmov s16, w21 +; CHECK-GI-NEXT: smov w21, v3.b[8] +; CHECK-GI-NEXT: smov w26, v3.b[5] +; CHECK-GI-NEXT: fmov s17, w23 +; CHECK-GI-NEXT: smov w23, v3.b[0] +; CHECK-GI-NEXT: fmov s18, w25 +; CHECK-GI-NEXT: smov w25, v3.b[3] +; CHECK-GI-NEXT: mov v16.s[1], w22 +; CHECK-GI-NEXT: smov w22, v3.b[1] +; CHECK-GI-NEXT: fmov s19, w6 +; CHECK-GI-NEXT: mov v17.s[1], w24 +; CHECK-GI-NEXT: smov w24, v3.b[2] +; CHECK-GI-NEXT: smov w6, v3.b[7] +; CHECK-GI-NEXT: mul w0, w0, w21 +; CHECK-GI-NEXT: mov v18.s[1], w26 +; CHECK-GI-NEXT: smov w26, v3.b[6] +; CHECK-GI-NEXT: fmov s3, w19 +; CHECK-GI-NEXT: fmov s20, w23 +; CHECK-GI-NEXT: mov v19.s[1], w3 +; CHECK-GI-NEXT: mov v16.s[2], w5 ; CHECK-GI-NEXT: mov v0.s[1], wzr -; CHECK-GI-NEXT: fmov s7, w13 -; CHECK-GI-NEXT: fmov s4, w10 -; CHECK-GI-NEXT: smov w10, v2.b[13] -; CHECK-GI-NEXT: mov v5.s[1], w12 -; CHECK-GI-NEXT: smov w13, v2.b[9] -; CHECK-GI-NEXT: fmov s6, w9 -; CHECK-GI-NEXT: fmov s16, w11 -; CHECK-GI-NEXT: smov w9, v1.b[0] -; CHECK-GI-NEXT: mov v3.s[1], w8 -; CHECK-GI-NEXT: mov v7.s[1], w14 -; CHECK-GI-NEXT: smov w14, v2.b[6] -; CHECK-GI-NEXT: ldrsb w12, [x1, #6] -; CHECK-GI-NEXT: smov w8, v2.b[1] -; CHECK-GI-NEXT: smov w11, v2.b[2] -; CHECK-GI-NEXT: mov v0.s[2], wzr -; CHECK-GI-NEXT: mov v16.s[1], w10 -; CHECK-GI-NEXT: smov w10, v2.b[14] -; CHECK-GI-NEXT: mov v5.s[2], w12 -; CHECK-GI-NEXT: smov w12, v1.b[5] -; CHECK-GI-NEXT: mov v6.s[1], w13 -; CHECK-GI-NEXT: fmov s17, w9 -; CHECK-GI-NEXT: mov v7.s[2], w14 -; CHECK-GI-NEXT: smov w14, v1.b[4] -; CHECK-GI-NEXT: smov w9, v2.b[10] -; CHECK-GI-NEXT: mov v4.s[1], w8 -; CHECK-GI-NEXT: smov w8, v1.b[1] -; CHECK-GI-NEXT: smov w13, v2.b[7] -; CHECK-GI-NEXT: mov v16.s[2], w10 -; CHECK-GI-NEXT: smov w10, v2.b[15] -; CHECK-GI-NEXT: mov v0.s[3], wzr -; CHECK-GI-NEXT: fmov s18, w14 -; CHECK-GI-NEXT: mov v6.s[2], w9 -; CHECK-GI-NEXT: smov w9, v1.b[12] -; CHECK-GI-NEXT: mov v4.s[2], w11 -; CHECK-GI-NEXT: ldrsb w11, [x1, #7] -; CHECK-GI-NEXT: mov v17.s[1], w8 -; CHECK-GI-NEXT: ldrsb w8, [x1, #2] -; CHECK-GI-NEXT: mov v16.s[3], w10 -; CHECK-GI-NEXT: smov w10, v1.b[13] -; CHECK-GI-NEXT: mov v18.s[1], w12 -; CHECK-GI-NEXT: smov w12, v1.b[6] -; CHECK-GI-NEXT: mov v5.s[3], w11 -; CHECK-GI-NEXT: ldrsb w11, [x0, #16]! -; CHECK-GI-NEXT: mov v7.s[3], w13 -; CHECK-GI-NEXT: smov w13, v1.b[2] -; CHECK-GI-NEXT: fmov s20, w9 -; CHECK-GI-NEXT: ldrsb w9, [x0, #5] -; CHECK-GI-NEXT: mov v3.s[2], w8 -; CHECK-GI-NEXT: smov w8, v1.b[8] -; CHECK-GI-NEXT: fmov s22, w11 -; CHECK-GI-NEXT: mov v18.s[2], w12 -; CHECK-GI-NEXT: ldrsb w12, [x0, #4] -; CHECK-GI-NEXT: smov w11, v2.b[3] -; CHECK-GI-NEXT: mov v20.s[1], w10 -; CHECK-GI-NEXT: ldrsb w10, [x0, #8] -; CHECK-GI-NEXT: fmov s21, w12 -; CHECK-GI-NEXT: ldrsb w12, [x1, #8] -; CHECK-GI-NEXT: mov v17.s[2], w13 -; CHECK-GI-NEXT: smov w13, v1.b[9] -; CHECK-GI-NEXT: fmov s19, w8 -; CHECK-GI-NEXT: smov w8, v1.b[14] -; CHECK-GI-NEXT: mul w10, w12, w10 -; CHECK-GI-NEXT: smov w12, v1.b[7] +; CHECK-GI-NEXT: mov v6.s[2], w10 +; CHECK-GI-NEXT: fmov s21, w0 +; CHECK-GI-NEXT: mov v17.s[2], w20 ; CHECK-GI-NEXT: mov v4.s[3], w11 -; CHECK-GI-NEXT: mov v21.s[1], w9 -; CHECK-GI-NEXT: ldrsb w9, [x0, #6] -; CHECK-GI-NEXT: mov v19.s[1], w13 -; CHECK-GI-NEXT: ldrsb w13, [x0, #1] -; CHECK-GI-NEXT: mov v20.s[2], w8 -; CHECK-GI-NEXT: smov w8, v1.b[10] -; CHECK-GI-NEXT: mov v18.s[3], w12 -; CHECK-GI-NEXT: ldrsb w12, [x0, #7] -; CHECK-GI-NEXT: mov v21.s[2], w9 -; CHECK-GI-NEXT: smov w9, v2.b[11] -; CHECK-GI-NEXT: fmov s2, w10 -; CHECK-GI-NEXT: ldrsb w10, [x0, #2] -; CHECK-GI-NEXT: mov v22.s[1], w13 -; CHECK-GI-NEXT: smov w13, v1.b[15] -; CHECK-GI-NEXT: mov v2.s[1], wzr -; CHECK-GI-NEXT: mov v19.s[2], w8 -; CHECK-GI-NEXT: smov w8, v1.b[3] -; CHECK-GI-NEXT: mov v21.s[3], w12 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v3.s[1], w7 +; CHECK-GI-NEXT: mov v20.s[1], w22 +; CHECK-GI-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v18.s[2], w26 +; CHECK-GI-NEXT: mov v21.s[1], wzr +; CHECK-GI-NEXT: mov v16.s[3], w18 +; CHECK-GI-NEXT: mov v17.s[3], w4 +; CHECK-GI-NEXT: mov v7.s[2], w13 +; CHECK-GI-NEXT: mov v5.s[3], w14 +; CHECK-GI-NEXT: mov v19.s[2], w16 +; CHECK-GI-NEXT: mov v3.s[2], w1 +; CHECK-GI-NEXT: mov v0.s[2], wzr +; CHECK-GI-NEXT: mov v20.s[2], w24 +; CHECK-GI-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v18.s[3], w6 +; CHECK-GI-NEXT: mov v21.s[2], wzr +; CHECK-GI-NEXT: mul v2.4s, v2.4s, v16.4s +; CHECK-GI-NEXT: mul v4.4s, v4.4s, v17.4s +; CHECK-GI-NEXT: mov v1.s[3], w8 ; CHECK-GI-NEXT: mov v6.s[3], w9 -; CHECK-GI-NEXT: ldrsb w9, [x0, #3] -; CHECK-GI-NEXT: mov v20.s[3], w13 -; CHECK-GI-NEXT: smov w13, v1.b[11] -; CHECK-GI-NEXT: mov v22.s[2], w10 -; CHECK-GI-NEXT: ldrsb w10, [x1, #3] -; CHECK-GI-NEXT: mul v1.4s, v7.4s, v18.4s -; CHECK-GI-NEXT: mov v2.s[2], wzr -; CHECK-GI-NEXT: mov v17.s[3], w8 -; CHECK-GI-NEXT: mov v3.s[3], w10 -; CHECK-GI-NEXT: mul v5.4s, v5.4s, v21.4s -; CHECK-GI-NEXT: mov v19.s[3], w13 -; CHECK-GI-NEXT: mul v7.4s, v16.4s, v20.4s -; CHECK-GI-NEXT: mov v22.s[3], w9 -; CHECK-GI-NEXT: mov v2.s[3], wzr -; CHECK-GI-NEXT: mla v1.4s, v4.4s, v17.4s -; CHECK-GI-NEXT: mla v7.4s, v6.4s, v19.4s -; CHECK-GI-NEXT: mla v5.4s, v3.4s, v22.4s -; CHECK-GI-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-GI-NEXT: add v1.4s, v1.4s, v7.4s +; CHECK-GI-NEXT: mov v7.s[3], w12 +; CHECK-GI-NEXT: mov v19.s[3], w15 +; CHECK-GI-NEXT: mov v3.s[3], w17 +; CHECK-GI-NEXT: mov v20.s[3], w25 +; CHECK-GI-NEXT: mov v0.s[3], wzr +; CHECK-GI-NEXT: mul v5.4s, v5.4s, v18.4s +; CHECK-GI-NEXT: mov v21.s[3], wzr +; CHECK-GI-NEXT: mla v2.4s, v1.4s, v19.4s +; CHECK-GI-NEXT: mla v4.4s, v6.4s, v3.4s +; CHECK-GI-NEXT: mla v5.4s, v7.4s, v20.4s +; CHECK-GI-NEXT: add v0.4s, v21.4s, v0.4s +; CHECK-GI-NEXT: add v1.4s, v2.4s, v4.4s ; CHECK-GI-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-GI-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: add w0, w8, w2 +; CHECK-GI-NEXT: ldp x26, x25, [sp], #64 // 16-byte Folded Reload ; CHECK-GI-NEXT: ret entry: %0 = load <25 x i8>, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index 7a436eddb23a6..5e278d59b6591 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -110,16 +110,20 @@ define void @v3i8(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: v3i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldrb w8, [x0] -; CHECK-GI-NEXT: ldrb w9, [x1] -; CHECK-GI-NEXT: ldrb w10, [x0, #1] -; CHECK-GI-NEXT: ldrb w11, [x1, #1] +; CHECK-GI-NEXT: ldr w8, [x0] +; CHECK-GI-NEXT: ldr w9, [x1] ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: ldrb w8, [x0, #2] -; CHECK-GI-NEXT: ldrb w9, [x1, #2] -; CHECK-GI-NEXT: mov v0.h[1], w10 -; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v1.b[1] +; CHECK-GI-NEXT: mov b4, v0.b[2] +; CHECK-GI-NEXT: mov b5, v1.b[2] +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: fmov w8, s4 +; CHECK-GI-NEXT: fmov w9, s5 ; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: mov v1.h[2], w9 ; CHECK-GI-NEXT: sub v0.4h, v0.4h, v1.4h @@ -270,16 +274,10 @@ define void @v3i16(ptr %p1, ptr %p2) { ; ; CHECK-GI-LABEL: v3i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr h0, [x0] -; CHECK-GI-NEXT: ldr h1, [x1] +; CHECK-GI-NEXT: ldr d0, [x0] +; CHECK-GI-NEXT: ldr d1, [x1] ; CHECK-GI-NEXT: add x8, x0, #2 -; CHECK-GI-NEXT: add x9, x1, #2 -; CHECK-GI-NEXT: add x10, x1, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8] -; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-GI-NEXT: add x9, x0, #4 -; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9] -; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10] ; CHECK-GI-NEXT: sub v0.4h, v0.4h, v1.4h ; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]