Skip to content

[GlobalISel] Widen vector loads from aligned ptrs #144309

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4072,6 +4072,19 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
if (MemTy != DstTy)
return UnableToLegalize;

Align Alignment = LoadMI.getAlign();
if (Alignment.value() * 8 > MemSizeInBits &&
isPowerOf2_64(DstTy.getScalarSizeInBits())) {
LLT MoreTy = LLT::fixed_vector(NextPowerOf2(DstTy.getNumElements()),
DstTy.getElementType());
MachineMemOperand *NewMMO = MF.getMachineMemOperand(&MMO, 0, MoreTy);
auto NewLoad = MIRBuilder.buildLoad(MoreTy, PtrReg, *NewMMO);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you need to clear the range metadata? It no longer applies to the widened elements.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have added a test. It looks like creating an MMO from another one does not copy the range metadata:

/// Build a new MachineMemOperand derived from \p MMO, substituting the
/// pointer info \p PtrInfo and the memory type \p Ty.
///
/// The flags, base alignment, sync scope ID and the success/failure atomic
/// orderings are taken from \p MMO. The remaining two constructor arguments
/// are a default-constructed AAMDNodes() and a nullptr, so they are NOT
/// copied from \p MMO.
/// NOTE(review): presumably these two slots are the AA metadata and the
/// !range metadata node, which would mean the result carries neither --
/// confirm against the MachineMemOperand constructor signature.
///
/// \returns a pointer to the new operand, constructed with placement new on
/// Allocator.
MachineMemOperand *MachineFunction::getMachineMemOperand(
    const MachineMemOperand *MMO, const MachinePointerInfo &PtrInfo, LLT Ty) {
  // Only PtrInfo and Ty come from the caller; everything else is either read
  // from MMO or reset to an empty value (AAMDNodes(), nullptr).
  return new (Allocator)
      MachineMemOperand(PtrInfo, MMO->getFlags(), Ty, MMO->getBaseAlign(),
                        AAMDNodes(), nullptr, MMO->getSyncScopeID(),
                        MMO->getSuccessOrdering(), MMO->getFailureOrdering());
}

Comment on lines +4075 to +4081
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should have a short comment to explain the code for future readers.

MIRBuilder.buildDeleteTrailingVectorElements(LoadMI.getReg(0),
NewLoad.getReg(0));
LoadMI.eraseFromParent();
return Legalized;
}

// TODO: We can do better than scalarizing the vector and at least split it
// in half.
return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
Expand Down
46 changes: 46 additions & 0 deletions llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-range.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel -o - %s | FileCheck %s

--- |
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64"

define <3 x i16> @range_v3i16(ptr %a_ptr, ptr %b_ptr) {
%a = load <3 x i16>, ptr %a_ptr, align 8, !range !0, !noundef !1
%b = load <3 x i16>, ptr %b_ptr, align 8, !range !2, !noundef !1
%result = add <3 x i16> %a, %b
ret <3 x i16> %result
}

!0 = !{i16 16, i16 17}
!1 = !{}
!2 = !{i16 32, i16 33}
...
---
name: range_v3i16
body: |
bb.1 (%ir-block.0):
liveins: $x0, $x1
; Make sure we drop the range metadata when widening an aligned load.

; CHECK-LABEL: name: range_v3i16
; CHECK: liveins: $x0, $x1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1
; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>) from %ir.a_ptr)
; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY1]](p0) :: (load (<4 x s16>) from %ir.b_ptr)
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<4 x s16>) = G_ADD [[LOAD]], [[LOAD1]]
; CHECK-NEXT: $d0 = COPY [[ADD]](<4 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $d0
%0:_(p0) = COPY $x0
%1:_(p0) = COPY $x1
%2:_(<3 x s16>) = G_LOAD %0(p0) :: (load (<3 x s16>) from %ir.a_ptr, align 8, !range !0)
%3:_(<3 x s16>) = G_LOAD %1(p0) :: (load (<3 x s16>) from %ir.b_ptr, align 8, !range !2)
%4:_(<3 x s16>) = G_ADD %2, %3
%5:_(s16), %6:_(s16), %7:_(s16) = G_UNMERGE_VALUES %4(<3 x s16>)
%8:_(s16) = G_IMPLICIT_DEF
%9:_(<4 x s16>) = G_BUILD_VECTOR %5(s16), %6(s16), %7(s16), %8(s16)
$d0 = COPY %9(<4 x s16>)
RET_ReallyLR implicit $d0
...
30 changes: 14 additions & 16 deletions llvm/test/CodeGen/AArch64/add.ll
Original file line number Diff line number Diff line change
Expand Up @@ -110,16 +110,20 @@ define void @v3i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: v3i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldrb w8, [x0]
; CHECK-GI-NEXT: ldrb w9, [x1]
; CHECK-GI-NEXT: ldrb w10, [x0, #1]
; CHECK-GI-NEXT: ldrb w11, [x1, #1]
; CHECK-GI-NEXT: ldr w8, [x0]
; CHECK-GI-NEXT: ldr w9, [x1]
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov s1, w9
; CHECK-GI-NEXT: ldrb w8, [x0, #2]
; CHECK-GI-NEXT: ldrb w9, [x1, #2]
; CHECK-GI-NEXT: mov v0.h[1], w10
; CHECK-GI-NEXT: mov v1.h[1], w11
; CHECK-GI-NEXT: mov b2, v0.b[1]
; CHECK-GI-NEXT: mov b3, v1.b[1]
; CHECK-GI-NEXT: mov b4, v0.b[2]
; CHECK-GI-NEXT: mov b5, v1.b[2]
; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: fmov w9, s3
; CHECK-GI-NEXT: mov v0.h[1], w8
; CHECK-GI-NEXT: mov v1.h[1], w9
; CHECK-GI-NEXT: fmov w8, s4
; CHECK-GI-NEXT: fmov w9, s5
; CHECK-GI-NEXT: mov v0.h[2], w8
; CHECK-GI-NEXT: mov v1.h[2], w9
; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h
Expand Down Expand Up @@ -270,16 +274,10 @@ define void @v3i16(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
; CHECK-GI-NEXT: ldr h1, [x1]
; CHECK-GI-NEXT: ldr d0, [x0]
; CHECK-GI-NEXT: ldr d1, [x1]
; CHECK-GI-NEXT: add x8, x0, #2
; CHECK-GI-NEXT: add x9, x1, #2
; CHECK-GI-NEXT: add x10, x1, #4
; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
; CHECK-GI-NEXT: add x9, x0, #4
; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT: str h0, [x0]
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
Expand Down
90 changes: 42 additions & 48 deletions llvm/test/CodeGen/AArch64/andorxor.ll
Original file line number Diff line number Diff line change
Expand Up @@ -302,16 +302,20 @@ define void @and_v3i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: and_v3i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldrb w8, [x0]
; CHECK-GI-NEXT: ldrb w9, [x1]
; CHECK-GI-NEXT: ldrb w10, [x0, #1]
; CHECK-GI-NEXT: ldrb w11, [x1, #1]
; CHECK-GI-NEXT: ldr w8, [x0]
; CHECK-GI-NEXT: ldr w9, [x1]
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov s1, w9
; CHECK-GI-NEXT: ldrb w8, [x0, #2]
; CHECK-GI-NEXT: ldrb w9, [x1, #2]
; CHECK-GI-NEXT: mov v0.h[1], w10
; CHECK-GI-NEXT: mov v1.h[1], w11
; CHECK-GI-NEXT: mov b2, v0.b[1]
; CHECK-GI-NEXT: mov b3, v1.b[1]
; CHECK-GI-NEXT: mov b4, v0.b[2]
; CHECK-GI-NEXT: mov b5, v1.b[2]
; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: fmov w9, s3
; CHECK-GI-NEXT: mov v0.h[1], w8
; CHECK-GI-NEXT: mov v1.h[1], w9
; CHECK-GI-NEXT: fmov w8, s4
; CHECK-GI-NEXT: fmov w9, s5
; CHECK-GI-NEXT: mov v0.h[2], w8
; CHECK-GI-NEXT: mov v1.h[2], w9
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
Expand Down Expand Up @@ -350,16 +354,20 @@ define void @or_v3i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: or_v3i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldrb w8, [x0]
; CHECK-GI-NEXT: ldrb w9, [x1]
; CHECK-GI-NEXT: ldrb w10, [x0, #1]
; CHECK-GI-NEXT: ldrb w11, [x1, #1]
; CHECK-GI-NEXT: ldr w8, [x0]
; CHECK-GI-NEXT: ldr w9, [x1]
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov s1, w9
; CHECK-GI-NEXT: ldrb w8, [x0, #2]
; CHECK-GI-NEXT: ldrb w9, [x1, #2]
; CHECK-GI-NEXT: mov v0.h[1], w10
; CHECK-GI-NEXT: mov v1.h[1], w11
; CHECK-GI-NEXT: mov b2, v0.b[1]
; CHECK-GI-NEXT: mov b3, v1.b[1]
; CHECK-GI-NEXT: mov b4, v0.b[2]
; CHECK-GI-NEXT: mov b5, v1.b[2]
; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: fmov w9, s3
; CHECK-GI-NEXT: mov v0.h[1], w8
; CHECK-GI-NEXT: mov v1.h[1], w9
; CHECK-GI-NEXT: fmov w8, s4
; CHECK-GI-NEXT: fmov w9, s5
; CHECK-GI-NEXT: mov v0.h[2], w8
; CHECK-GI-NEXT: mov v1.h[2], w9
; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
Expand Down Expand Up @@ -398,16 +406,20 @@ define void @xor_v3i8(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: xor_v3i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldrb w8, [x0]
; CHECK-GI-NEXT: ldrb w9, [x1]
; CHECK-GI-NEXT: ldrb w10, [x0, #1]
; CHECK-GI-NEXT: ldrb w11, [x1, #1]
; CHECK-GI-NEXT: ldr w8, [x0]
; CHECK-GI-NEXT: ldr w9, [x1]
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov s1, w9
; CHECK-GI-NEXT: ldrb w8, [x0, #2]
; CHECK-GI-NEXT: ldrb w9, [x1, #2]
; CHECK-GI-NEXT: mov v0.h[1], w10
; CHECK-GI-NEXT: mov v1.h[1], w11
; CHECK-GI-NEXT: mov b2, v0.b[1]
; CHECK-GI-NEXT: mov b3, v1.b[1]
; CHECK-GI-NEXT: mov b4, v0.b[2]
; CHECK-GI-NEXT: mov b5, v1.b[2]
; CHECK-GI-NEXT: fmov w8, s2
; CHECK-GI-NEXT: fmov w9, s3
; CHECK-GI-NEXT: mov v0.h[1], w8
; CHECK-GI-NEXT: mov v1.h[1], w9
; CHECK-GI-NEXT: fmov w8, s4
; CHECK-GI-NEXT: fmov w9, s5
; CHECK-GI-NEXT: mov v0.h[2], w8
; CHECK-GI-NEXT: mov v1.h[2], w9
; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b
Expand Down Expand Up @@ -805,16 +817,10 @@ define void @and_v3i16(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: and_v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
; CHECK-GI-NEXT: ldr h1, [x1]
; CHECK-GI-NEXT: ldr d0, [x0]
; CHECK-GI-NEXT: ldr d1, [x1]
; CHECK-GI-NEXT: add x8, x0, #2
; CHECK-GI-NEXT: add x9, x1, #2
; CHECK-GI-NEXT: add x10, x1, #4
; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
; CHECK-GI-NEXT: add x9, x0, #4
; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: str h0, [x0]
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
Expand Down Expand Up @@ -842,16 +848,10 @@ define void @or_v3i16(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: or_v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
; CHECK-GI-NEXT: ldr h1, [x1]
; CHECK-GI-NEXT: ldr d0, [x0]
; CHECK-GI-NEXT: ldr d1, [x1]
; CHECK-GI-NEXT: add x8, x0, #2
; CHECK-GI-NEXT: add x9, x1, #2
; CHECK-GI-NEXT: add x10, x1, #4
; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
; CHECK-GI-NEXT: add x9, x0, #4
; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
; CHECK-GI-NEXT: orr v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: str h0, [x0]
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
Expand Down Expand Up @@ -879,16 +879,10 @@ define void @xor_v3i16(ptr %p1, ptr %p2) {
;
; CHECK-GI-LABEL: xor_v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
; CHECK-GI-NEXT: ldr h1, [x1]
; CHECK-GI-NEXT: ldr d0, [x0]
; CHECK-GI-NEXT: ldr d1, [x1]
; CHECK-GI-NEXT: add x8, x0, #2
; CHECK-GI-NEXT: add x9, x1, #2
; CHECK-GI-NEXT: add x10, x1, #4
; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
; CHECK-GI-NEXT: add x9, x0, #4
; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
; CHECK-GI-NEXT: eor v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: str h0, [x0]
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
Expand Down
18 changes: 10 additions & 8 deletions llvm/test/CodeGen/AArch64/ctlz.ll
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,16 @@ define void @v3i8(ptr %p1) {
;
; CHECK-GI-LABEL: v3i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr b0, [x0]
; CHECK-GI-NEXT: add x8, x0, #1
; CHECK-GI-NEXT: ldr w8, [x0]
; CHECK-GI-NEXT: add x9, x0, #2
; CHECK-GI-NEXT: ld1 { v0.b }[1], [x8]
; CHECK-GI-NEXT: ld1 { v0.b }[2], [x9]
; CHECK-GI-NEXT: clz v0.8b, v0.8b
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: add x8, x0, #1
; CHECK-GI-NEXT: mov b1, v0.b[1]
; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
; CHECK-GI-NEXT: mov b0, v0.b[2]
; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
; CHECK-GI-NEXT: mov v2.b[2], v0.b[0]
; CHECK-GI-NEXT: clz v0.8b, v2.8b
; CHECK-GI-NEXT: st1 { v0.b }[0], [x0]
; CHECK-GI-NEXT: st1 { v0.b }[1], [x8]
; CHECK-GI-NEXT: st1 { v0.b }[2], [x9]
Expand Down Expand Up @@ -181,11 +185,9 @@ define void @v3i16(ptr %p1) {
;
; CHECK-GI-LABEL: v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
; CHECK-GI-NEXT: ldr d0, [x0]
; CHECK-GI-NEXT: add x8, x0, #2
; CHECK-GI-NEXT: add x9, x0, #4
; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
; CHECK-GI-NEXT: clz v0.4h, v0.4h
; CHECK-GI-NEXT: str h0, [x0]
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
Expand Down
18 changes: 10 additions & 8 deletions llvm/test/CodeGen/AArch64/ctpop.ll
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,16 @@ define void @v3i8(ptr %p1) {
;
; CHECK-GI-LABEL: v3i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr b0, [x0]
; CHECK-GI-NEXT: add x8, x0, #1
; CHECK-GI-NEXT: ldr w8, [x0]
; CHECK-GI-NEXT: add x9, x0, #2
; CHECK-GI-NEXT: ld1 { v0.b }[1], [x8]
; CHECK-GI-NEXT: ld1 { v0.b }[2], [x9]
; CHECK-GI-NEXT: cnt v0.8b, v0.8b
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: add x8, x0, #1
; CHECK-GI-NEXT: mov b1, v0.b[1]
; CHECK-GI-NEXT: mov v2.b[0], v0.b[0]
; CHECK-GI-NEXT: mov b0, v0.b[2]
; CHECK-GI-NEXT: mov v2.b[1], v1.b[0]
; CHECK-GI-NEXT: mov v2.b[2], v0.b[0]
; CHECK-GI-NEXT: cnt v0.8b, v2.8b
; CHECK-GI-NEXT: st1 { v0.b }[0], [x0]
; CHECK-GI-NEXT: st1 { v0.b }[1], [x8]
; CHECK-GI-NEXT: st1 { v0.b }[2], [x9]
Expand Down Expand Up @@ -181,11 +185,9 @@ define void @v3i16(ptr %p1) {
;
; CHECK-GI-LABEL: v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
; CHECK-GI-NEXT: ldr d0, [x0]
; CHECK-GI-NEXT: add x8, x0, #2
; CHECK-GI-NEXT: add x9, x0, #4
; CHECK-GI-NEXT: ld1 { v0.h }[1], [x8]
; CHECK-GI-NEXT: ld1 { v0.h }[2], [x9]
; CHECK-GI-NEXT: cnt v0.8b, v0.8b
; CHECK-GI-NEXT: uaddlp v0.4h, v0.8b
; CHECK-GI-NEXT: str h0, [x0]
Expand Down
38 changes: 19 additions & 19 deletions llvm/test/CodeGen/AArch64/cttz.ll
Original file line number Diff line number Diff line change
Expand Up @@ -68,21 +68,23 @@ define void @v3i8(ptr %p1) {
;
; CHECK-GI-LABEL: v3i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldrb w9, [x0]
; CHECK-GI-NEXT: ldr w9, [x0]
; CHECK-GI-NEXT: mov w8, #65535 // =0xffff
; CHECK-GI-NEXT: ldrb w10, [x0, #1]
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: fmov s1, w9
; CHECK-GI-NEXT: ldrb w9, [x0, #2]
; CHECK-GI-NEXT: mov v0.h[1], w8
; CHECK-GI-NEXT: mov v1.h[1], w10
; CHECK-GI-NEXT: mov v0.h[2], w8
; CHECK-GI-NEXT: fmov s2, w8
; CHECK-GI-NEXT: fmov s0, w9
; CHECK-GI-NEXT: mov v2.h[1], w8
; CHECK-GI-NEXT: mov b1, v0.b[1]
; CHECK-GI-NEXT: mov v2.h[2], w8
; CHECK-GI-NEXT: add x8, x0, #1
; CHECK-GI-NEXT: mov v1.h[2], w9
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: mov b1, v0.b[2]
; CHECK-GI-NEXT: mov v0.h[1], w9
; CHECK-GI-NEXT: fmov w9, s1
; CHECK-GI-NEXT: mov v0.h[2], w9
; CHECK-GI-NEXT: add x9, x0, #2
; CHECK-GI-NEXT: eor v2.8b, v1.8b, v0.8b
; CHECK-GI-NEXT: add v0.4h, v1.4h, v0.4h
; CHECK-GI-NEXT: and v0.8b, v2.8b, v0.8b
; CHECK-GI-NEXT: eor v1.8b, v0.8b, v2.8b
; CHECK-GI-NEXT: add v0.4h, v0.4h, v2.4h
; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b
; CHECK-GI-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-GI-NEXT: cnt v0.8b, v0.8b
; CHECK-GI-NEXT: st1 { v0.b }[0], [x0]
Expand Down Expand Up @@ -275,22 +277,20 @@ define void @v3i16(ptr %p1) {
; CHECK-GI-LABEL: v3i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov w8, #65535 // =0xffff
; CHECK-GI-NEXT: ldr h1, [x0]
; CHECK-GI-NEXT: add x9, x0, #2
; CHECK-GI-NEXT: ldr d1, [x0]
; CHECK-GI-NEXT: add x9, x0, #4
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: add x10, x0, #4
; CHECK-GI-NEXT: ld1 { v1.h }[1], [x9]
; CHECK-GI-NEXT: mov v0.h[1], w8
; CHECK-GI-NEXT: ld1 { v1.h }[2], [x10]
; CHECK-GI-NEXT: mov v0.h[2], w8
; CHECK-GI-NEXT: add x8, x0, #2
; CHECK-GI-NEXT: eor v2.8b, v1.8b, v0.8b
; CHECK-GI-NEXT: add v0.4h, v1.4h, v0.4h
; CHECK-GI-NEXT: and v0.8b, v2.8b, v0.8b
; CHECK-GI-NEXT: cnt v0.8b, v0.8b
; CHECK-GI-NEXT: uaddlp v0.4h, v0.8b
; CHECK-GI-NEXT: str h0, [x0]
; CHECK-GI-NEXT: st1 { v0.h }[1], [x9]
; CHECK-GI-NEXT: st1 { v0.h }[2], [x10]
; CHECK-GI-NEXT: st1 { v0.h }[1], [x8]
; CHECK-GI-NEXT: st1 { v0.h }[2], [x9]
; CHECK-GI-NEXT: ret
entry:
%d = load <3 x i16>, ptr %p1
Expand Down
Loading