diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1735ff5cd6974..53a0e1c053a8c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22182,6 +22182,73 @@ performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   return SDValue();
 }
 
+// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
+// This comes from interleaved vectorization. It is performed late to capture
+// uitofp converts too.
+//
+// N must be a ZERO_EXTEND producing v4i32 or v8i16 whose operand extracts the
+// low or high half of a shuffle with elements half as wide as the result's.
+// The matched factor-4 deinterleave index Idx picks the lowering:
+//   - Idx 0/1 use UZP1 and Idx 2/3 use UZP2, on the operands NVCAST to VT;
+//   - odd Idx additionally shifts right by the narrow element width;
+//   - the result is always ANDed with the narrow element mask.
+// Returns the replacement value, or an empty SDValue when N does not match.
+static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N,
+                                                     SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
+      N->getOpcode() != ISD::ZERO_EXTEND ||
+      N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return SDValue();
+
+  // Only an extract of the low half or the high half is handled.
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1);
+  if (ExtOffset != 0 && ExtOffset != NumElts)
+    return SDValue();
+
+  EVT InVT = N->getOperand(0).getOperand(0).getValueType();
+  auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0).getOperand(0));
+  if (!Shuffle || InVT.getVectorNumElements() != NumElts * 2 ||
+      InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
+    return SDValue();
+
+  ArrayRef<int> Mask = Shuffle->getMask();
+  unsigned Idx = 0;
+  bool IsDeInterleave = ShuffleVectorInst::isDeInterleaveMaskOfFactor(
+      Mask.slice(ExtOffset, NumElts), 4, Idx);
+  // An undef interleave shuffle can come up after other canonicalizations,
+  // where the shuffle has been converted to
+  //   zext(extract(shuffle b, undef, [u,u,0,4]))
+  // The leading lanes must really be undef: the operands are swapped below,
+  // so the UZP takes those lanes from the undef operand and any defined
+  // mask entry there would be miscompiled.
+  bool IsUndefDeInterleave = false;
+  if (!IsDeInterleave)
+    IsUndefDeInterleave =
+        Shuffle->getOperand(1).isUndef() &&
+        all_of(Mask.slice(ExtOffset, NumElts / 2),
+               [](int M) { return M < 0; }) &&
+        ShuffleVectorInst::isDeInterleaveMaskOfFactor(
+            Mask.slice(ExtOffset + NumElts / 2, NumElts / 2), 4, Idx);
+  if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
+                            Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0));
+  SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
+                            Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1));
+  SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
+                            VT, BC1, BC2);
+  if ((Idx & 1) == 1)
+    UZP = DAG.getNode(ISD::SRL, DL, VT, UZP,
+                      DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT));
+  return DAG.getNode(
+      ISD::AND, DL, VT, UZP,
+      DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
+}
+
 static SDValue performExtendCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
@@ -22202,6 +22269,9 @@ static SDValue performExtendCombine(SDNode *N,
     return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
   }
 
+  if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG))
+    return R;
+
   if (N->getValueType(0).isFixedLengthVector() &&
       N->getOpcode() == ISD::SIGN_EXTEND &&
       N->getOperand(0)->getOpcode() == ISD::SETCC)
diff --git a/llvm/test/CodeGen/AArch64/zext-shuffle.ll b/llvm/test/CodeGen/AArch64/zext-shuffle.ll
index 4ef8daf141715..af5a92017bbbc 100644
--- a/llvm/test/CodeGen/AArch64/zext-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/zext-shuffle.ll
@@ -76,12 +76,9 @@ define <2 x i64> @v2i64_37(<4 x i32> %a, <4 x i32> %b) {
 define <4 x i64> @v2i64_i16_04812(<16 x i16> %a) {
 ; CHECK-LABEL: v2i64_i16_04812:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI6_0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_0]
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
-; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b
-; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
 ; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0
 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0
 ; CHECK-NEXT: ret
@@ -93,12 +90,8 @@ define <4 x 
i64> @v2i64_i16_04812(<16 x i16> %a) { define <4 x i64> @v2i64_i16_15913(<16 x i16> %a) { ; CHECK-LABEL: v2i64_i16_15913: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI7_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 ; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret @@ -110,12 +103,9 @@ define <4 x i64> @v2i64_i16_15913(<16 x i16> %a) { define <4 x i64> @v2i64_i16_261014(<16 x i16> %a) { ; CHECK-LABEL: v2i64_i16_261014: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI8_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret @@ -127,12 +117,8 @@ define <4 x i64> @v2i64_i16_261014(<16 x i16> %a) { define <4 x i64> @v2i64_i16_371115(<16 x i16> %a) { ; CHECK-LABEL: v2i64_i16_371115: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI9_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 ; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ret @@ -167,12 +153,9 @@ define 
<4 x i32> @v4i32_1357(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @v4i32_04812(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: v4i32_04812: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI12_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> %d = zext <4 x i16> %c to <4 x i32> @@ -182,12 +165,8 @@ define <4 x i32> @v4i32_04812(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @v4i32_15913(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: v4i32_15913: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI13_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> %d = zext <4 x i16> %c to <4 x i32> @@ -197,12 +176,9 @@ define <4 x i32> @v4i32_15913(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @v4i32_261014(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: v4i32_261014: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI14_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: movi v2.2d, #0x00ffff0000ffff +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: and v0.16b, 
v0.16b, v2.16b ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> %d = zext <4 x i16> %c to <4 x i32> @@ -212,12 +188,8 @@ define <4 x i32> @v4i32_261014(<8 x i16> %a, <8 x i16> %b) { define <4 x i32> @v4i32_371115(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: v4i32_371115: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI15_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v0.4s, #16 ; CHECK-NEXT: ret %c = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> %d = zext <4 x i16> %c to <4 x i32> @@ -249,12 +221,8 @@ define <8 x i16> @v8i16_1357(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @v8i16_04812(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i16_04812: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI18_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: bic v0.8h, #255, lsl #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i16> @@ -264,12 +232,8 @@ define <8 x i16> @v8i16_04812(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @v8i16_15913(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i16_15913: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI19_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI19_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; 
CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i16> @@ -279,12 +243,8 @@ define <8 x i16> @v8i16_15913(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @v8i16_261014(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i16_261014: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI20_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: bic v0.8h, #255, lsl #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i16> @@ -294,12 +254,8 @@ define <8 x i16> @v8i16_261014(<16 x i8> %a, <16 x i8> %b) { define <8 x i16> @v8i16_371115(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: v8i16_371115: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI21_0 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_0] -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ushr v0.8h, v0.8h, #8 ; CHECK-NEXT: ret %c = shufflevector <16 x i8> %a, <16 x i8> %b, <8 x i32> %d = zext <8 x i8> %c to <8 x i16> @@ -310,42 +266,23 @@ define <8 x i16> @v8i16_371115(<16 x i8> %a, <16 x i8> %b) { define <8 x i64> @zext_add(<32 x i16> %l) { ; CHECK-LABEL: zext_add: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI22_0 -; CHECK-NEXT: adrp x9, .LCPI22_3 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_0] -; CHECK-NEXT: adrp x8, 
.LCPI22_1 -; CHECK-NEXT: ldr q7, [x9, :lo12:.LCPI22_3] -; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI22_1] -; CHECK-NEXT: adrp x8, .LCPI22_2 -; CHECK-NEXT: adrp x9, .LCPI22_7 -; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI22_2] -; CHECK-NEXT: adrp x8, .LCPI22_4 -; CHECK-NEXT: ldr q18, [x9, :lo12:.LCPI22_7] -; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI22_4] -; CHECK-NEXT: adrp x8, .LCPI22_5 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v5.16b, { v0.16b, v1.16b }, v5.16b -; CHECK-NEXT: ldr q17, [x8, :lo12:.LCPI22_5] -; CHECK-NEXT: adrp x8, .LCPI22_6 -; CHECK-NEXT: tbl v7.16b, { v0.16b, v1.16b }, v7.16b -; CHECK-NEXT: ldr q19, [x8, :lo12:.LCPI22_6] -; CHECK-NEXT: tbl v17.16b, { v0.16b, v1.16b }, v17.16b -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v18.16b -; CHECK-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v4.16b -; CHECK-NEXT: tbl v4.16b, { v2.16b, v3.16b }, v6.16b -; CHECK-NEXT: tbl v6.16b, { v2.16b, v3.16b }, v16.16b -; CHECK-NEXT: tbl v2.16b, { v2.16b, v3.16b }, v19.16b -; CHECK-NEXT: uaddl v5.4s, v5.4h, v7.4h -; CHECK-NEXT: uaddl v7.4s, v17.4h, v0.4h -; CHECK-NEXT: uaddl2 v4.4s, v1.8h, v4.8h -; CHECK-NEXT: uaddl2 v2.4s, v6.8h, v2.8h -; CHECK-NEXT: uaddl v0.2d, v5.2s, v7.2s -; CHECK-NEXT: uaddl2 v1.2d, v5.4s, v7.4s -; CHECK-NEXT: uaddl2 v3.2d, v4.4s, v2.4s -; CHECK-NEXT: uaddl v2.2d, v4.2s, v2.2s +; CHECK-NEXT: movi v4.2d, #0x00ffff0000ffff +; CHECK-NEXT: uzp1 v5.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: and v3.16b, v5.16b, v4.16b +; CHECK-NEXT: and v6.16b, v0.16b, v4.16b +; CHECK-NEXT: and v7.16b, v1.16b, v4.16b +; CHECK-NEXT: and v4.16b, v2.16b, v4.16b +; CHECK-NEXT: usra v3.4s, v5.4s, #16 +; CHECK-NEXT: usra v6.4s, v0.4s, #16 +; CHECK-NEXT: usra v7.4s, v1.4s, #16 +; CHECK-NEXT: usra v4.4s, v2.4s, #16 +; CHECK-NEXT: uaddl v0.2d, v3.2s, v6.2s +; CHECK-NEXT: uaddl2 v1.2d, 
v3.4s, v6.4s +; CHECK-NEXT: uaddl2 v3.2d, v7.4s, v4.4s +; CHECK-NEXT: uaddl v2.2d, v7.2s, v4.2s ; CHECK-NEXT: ret %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> %z1 = zext <8 x i16> %s1 to <8 x i64> @@ -392,86 +329,77 @@ define <8 x i64> @zext_load_add(ptr %p) { define <8 x double> @uitofp_fadd(<32 x i16> %l) { ; CHECK-LABEL: uitofp_fadd: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI24_0 -; CHECK-NEXT: adrp x9, .LCPI24_1 -; CHECK-NEXT: // kill: def $q3 killed $q3 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: adrp x10, .LCPI24_6 -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_0] -; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI24_1] -; CHECK-NEXT: adrp x8, .LCPI24_2 -; CHECK-NEXT: adrp x9, .LCPI24_3 -; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI24_2] -; CHECK-NEXT: adrp x8, .LCPI24_4 -; CHECK-NEXT: // kill: def $q2 killed $q2 killed $q2_q3 def $q2_q3 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-NEXT: tbl v4.16b, { v0.16b, v1.16b }, v4.16b -; CHECK-NEXT: tbl v5.16b, { v2.16b, v3.16b }, v5.16b -; CHECK-NEXT: ldr q7, [x9, :lo12:.LCPI24_3] -; CHECK-NEXT: adrp x9, .LCPI24_5 -; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI24_4] -; CHECK-NEXT: adrp x8, .LCPI24_7 -; CHECK-NEXT: ldr q17, [x9, :lo12:.LCPI24_5] -; CHECK-NEXT: ldr q18, [x10, :lo12:.LCPI24_6] -; CHECK-NEXT: ldr q19, [x8, :lo12:.LCPI24_7] -; CHECK-NEXT: tbl v6.16b, { v0.16b, v1.16b }, v6.16b -; CHECK-NEXT: tbl v7.16b, { v2.16b, v3.16b }, v7.16b -; CHECK-NEXT: tbl v16.16b, { v0.16b, v1.16b }, v16.16b -; CHECK-NEXT: tbl v17.16b, { v2.16b, v3.16b }, v17.16b -; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v18.16b -; CHECK-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v19.16b -; CHECK-NEXT: ushll2 v5.4s, v5.8h, #0 -; CHECK-NEXT: ushll v4.4s, v4.4h, #0 -; CHECK-NEXT: ushll2 v7.4s, v7.8h, #0 -; CHECK-NEXT: ushll v6.4s, v6.4h, #0 -; CHECK-NEXT: ushll v16.4s, v16.4h, #0 -; CHECK-NEXT: ushll2 v20.2d, v5.4s, #0 -; CHECK-NEXT: ushll2 v21.2d, v4.4s, #0 
-; CHECK-NEXT: ushll2 v17.4s, v17.8h, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 -; CHECK-NEXT: ushll v2.2d, v5.2s, #0 -; CHECK-NEXT: ushll v3.2d, v4.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v7.4s, #0 -; CHECK-NEXT: ushll2 v5.2d, v6.4s, #0 -; CHECK-NEXT: ushll v7.2d, v7.2s, #0 -; CHECK-NEXT: ucvtf v18.2d, v20.2d -; CHECK-NEXT: ucvtf v19.2d, v21.2d +; CHECK-NEXT: uzp1 v5.4s, v0.4s, v3.4s +; CHECK-NEXT: uzp1 v6.4s, v0.4s, v1.4s +; CHECK-NEXT: uzp2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: movi d4, #0x00ffff0000ffff +; CHECK-NEXT: uzp1 v7.4s, v2.4s, v3.4s +; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s +; CHECK-NEXT: ext v16.16b, v6.16b, v6.16b, #8 +; CHECK-NEXT: ext v5.16b, v5.16b, v5.16b, #8 +; CHECK-NEXT: uzp2 v1.4s, v0.4s, v3.4s +; CHECK-NEXT: and v17.8b, v6.8b, v4.8b +; CHECK-NEXT: and v18.8b, v7.8b, v4.8b +; CHECK-NEXT: ushr v6.2s, v6.2s, #16 +; CHECK-NEXT: ushr v7.2s, v7.2s, #16 +; CHECK-NEXT: and v21.8b, v0.8b, v4.8b +; CHECK-NEXT: and v22.8b, v2.8b, v4.8b +; CHECK-NEXT: ushr v2.2s, v2.2s, #16 +; CHECK-NEXT: and v19.8b, v16.8b, v4.8b +; CHECK-NEXT: and v20.8b, v5.8b, v4.8b +; CHECK-NEXT: ushll v3.2d, v17.2s, #0 +; CHECK-NEXT: ushll v17.2d, v18.2s, #0 +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ushr v16.2s, v16.2s, #16 +; CHECK-NEXT: ushr v5.2s, v5.2s, #16 ; CHECK-NEXT: ushll v6.2d, v6.2s, #0 -; CHECK-NEXT: ushll2 v20.2d, v17.4s, #0 -; CHECK-NEXT: ushll2 v21.2d, v16.4s, #0 -; CHECK-NEXT: ushll v17.2d, v17.2s, #0 +; CHECK-NEXT: ushll v7.2d, v7.2s, #0 +; CHECK-NEXT: ushll v18.2d, v19.2s, #0 +; CHECK-NEXT: ushll v19.2d, v20.2s, #0 +; CHECK-NEXT: ext v20.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: ushll v16.2d, v16.2s, #0 -; CHECK-NEXT: ushll v22.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v23.2d, v1.4s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-NEXT: ucvtf v2.2d, v2.2d +; CHECK-NEXT: ushll v21.2d, v21.2s, #0 +; CHECK-NEXT: ushll v5.2d, v5.2s, #0 +; CHECK-NEXT: 
ushll v22.2d, v22.2s, #0 +; CHECK-NEXT: ushll v2.2d, v2.2s, #0 ; CHECK-NEXT: ucvtf v3.2d, v3.2d -; CHECK-NEXT: ucvtf v4.2d, v4.2d -; CHECK-NEXT: ucvtf v5.2d, v5.2d -; CHECK-NEXT: ucvtf v7.2d, v7.2d -; CHECK-NEXT: ucvtf v6.2d, v6.2d -; CHECK-NEXT: ucvtf v20.2d, v20.2d -; CHECK-NEXT: ucvtf v21.2d, v21.2d ; CHECK-NEXT: ucvtf v17.2d, v17.2d +; CHECK-NEXT: ucvtf v6.2d, v6.2d +; CHECK-NEXT: and v23.8b, v20.8b, v4.8b +; CHECK-NEXT: and v4.8b, v1.8b, v4.8b +; CHECK-NEXT: ushr v20.2s, v20.2s, #16 +; CHECK-NEXT: ushr v1.2s, v1.2s, #16 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ucvtf v7.2d, v7.2d +; CHECK-NEXT: ucvtf v18.2d, v18.2d +; CHECK-NEXT: ucvtf v19.2d, v19.2d ; CHECK-NEXT: ucvtf v16.2d, v16.2d +; CHECK-NEXT: ushll v23.2d, v23.2s, #0 +; CHECK-NEXT: ushll v4.2d, v4.2s, #0 +; CHECK-NEXT: ushll v20.2d, v20.2s, #0 +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: ucvtf v5.2d, v5.2d +; CHECK-NEXT: ucvtf v21.2d, v21.2d ; CHECK-NEXT: ucvtf v22.2d, v22.2d -; CHECK-NEXT: ucvtf v23.2d, v23.2d ; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf v2.2d, v2.2d +; CHECK-NEXT: ucvtf v23.2d, v23.2d +; CHECK-NEXT: ucvtf v4.2d, v4.2d +; CHECK-NEXT: ucvtf v20.2d, v20.2d ; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: fadd v4.2d, v18.2d, v4.2d -; CHECK-NEXT: fadd v3.2d, v3.2d, v6.2d -; CHECK-NEXT: fadd v2.2d, v2.2d, v7.2d +; CHECK-NEXT: fadd v16.2d, v18.2d, v16.2d +; CHECK-NEXT: fadd v7.2d, v17.2d, v7.2d ; CHECK-NEXT: fadd v5.2d, v19.2d, v5.2d -; CHECK-NEXT: fadd v6.2d, v16.2d, v22.2d -; CHECK-NEXT: fadd v16.2d, v20.2d, v23.2d -; CHECK-NEXT: fadd v7.2d, v17.2d, v1.2d -; CHECK-NEXT: fadd v1.2d, v21.2d, v0.2d -; CHECK-NEXT: fadd v0.2d, v3.2d, v6.2d -; CHECK-NEXT: fadd v3.2d, v4.2d, v16.2d -; CHECK-NEXT: fadd v1.2d, v5.2d, v1.2d -; CHECK-NEXT: fadd v2.2d, v2.2d, v7.2d +; CHECK-NEXT: fadd v3.2d, v3.2d, v6.2d +; CHECK-NEXT: fadd v0.2d, v21.2d, v0.2d +; CHECK-NEXT: fadd v2.2d, v22.2d, v2.2d +; CHECK-NEXT: fadd v4.2d, v4.2d, v1.2d +; CHECK-NEXT: fadd v1.2d, v23.2d, v20.2d +; 
CHECK-NEXT: fadd v0.2d, v3.2d, v0.2d +; CHECK-NEXT: fadd v2.2d, v7.2d, v2.2d +; CHECK-NEXT: fadd v1.2d, v16.2d, v1.2d +; CHECK-NEXT: fadd v3.2d, v5.2d, v4.2d ; CHECK-NEXT: ret %s1 = shufflevector <32 x i16> %l, <32 x i16> undef, <8 x i32> %z1 = uitofp <8 x i16> %s1 to <8 x double>