diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 74b1e9240d434..0e24d6132b4cb 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -67,6 +67,3 @@ f6d557ee34b6bbdb1dc32f29e34b4a4a8ad35e81 # [libc++] Rename _LIBCPP_INLINE_VISIBILITY to _LIBCPP_HIDE_FROM_ABI 4c198542226223f6a5c5511a1f89b37d15ee10b9 - -# [libc++] Replace uses of _VSTD:: by std:: (#74331) -77a00c0d546cd4aa8311b5b9031ae9ea8cdb050c diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 8afd2b9ce4a1c..b9a28edc9daf4 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -33,42 +33,26 @@ /lldb/ @JDevlieghere -# Linalg in MLIR. -/mlir/include/mlir/Dialect/Linalg @dcaballe @nicolasvasilache -/mlir/lib/Dialect/Linalg @dcaballe @nicolasvasilache +/mlir/include/mlir/Interfaces/TilingInterface.* @MaheshRavishankar -# Vector in MLIR. -/mlir/**/*AMX* @dcaballe -/mlir/**/*Neon* @banach-space @dcaballe @nicolasvasilache -/mlir/**/*SME* @banach-space @dcaballe @nicolasvasilache -/mlir/**/*SVE* @banach-space @dcaballe @nicolasvasilache -/mlir/**/*VectorInterfaces* @dcaballe @nicolasvasilache -/mlir/**/*VectorToSCF* @banach-space @dcaballe @nicolasvasilache -/mlir/**/*VectorToLLVM* @banach-space @dcaballe @nicolasvasilache -/mlir/**/*X86Vector* @dcaballe @nicolasvasilache -/mlir/include/mlir/Dialect/Vector @dcaballe @nicolasvasilache -/mlir/lib/Dialect/Vector @dcaballe @nicolasvasilache - -/mlir/include/mlir/Interfaces/TilingInterface.* @MaheshRavishankar @nicolasvasilache - -/mlir/lib/Dialect/Linalg/Transforms/DecomposeLinalgOps.cpp @MaheshRavishankar @nicolasvasilache -/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @MaheshRavishankar @nicolasvasilache -/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @MaheshRavishankar @nicolasvasilache -/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp @MaheshRavishankar @nicolasvasilache -/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp @MaheshRavishankar @nicolasvasilache -/mlir/lib/Interfaces/TilingInterface.* @MaheshRavishankar @nicolasvasilache +/mlir/lib/Dialect/Linalg/Transforms/DecomposeLinalgOps.cpp @MaheshRavishankar +/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @MaheshRavishankar +/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @MaheshRavishankar +/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp @MaheshRavishankar +/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp @MaheshRavishankar +/mlir/lib/Interfaces/TilingInterface.* @MaheshRavishankar /mlir/**/*EmulateNarrowType* @hanhanW -/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp @hanhanW @nicolasvasilache -/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @hanhanW @nicolasvasilache -/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @hanhanW @nicolasvasilache -/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp @hanhanW @nicolasvasilache -/mlir/lib/Dialect/Tensor/Transforms/FoldIntoPackAndUnpackPatterns.cpp @hanhanW @nicolasvasilache -/mlir/lib/Dialect/Vector/Transforms/* @hanhanW @nicolasvasilache +/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp @hanhanW +/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @hanhanW +/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @hanhanW +/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp @hanhanW +/mlir/lib/Dialect/Tensor/Transforms/FoldIntoPackAndUnpackPatterns.cpp @hanhanW +/mlir/lib/Dialect/Vector/Transforms/* @hanhanW # Transform Dialect in MLIR. 
-/mlir/include/mlir/Dialect/Transform/* @ftynse @nicolasvasilache -/mlir/lib/Dialect/Transform/* @ftynse @nicolasvasilache +/mlir/include/mlir/Dialect/Transform/* @ftynse +/mlir/lib/Dialect/Transform/* @ftynse # SPIR-V in MLIR. /mlir/**/SPIRV/ @antiagainst @kuhar diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 626585c146914..5c04465819713 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -204,9 +204,6 @@ C Language Changes number of elements in the flexible array member. This information can improve the results of the array bound sanitizer and the ``__builtin_dynamic_object_size`` builtin. -- Enums will now be represented in TBAA metadata using their actual underlying - integer type. Previously they were treated as chars, which meant they could - alias with all other types. C23 Feature Support ^^^^^^^^^^^^^^^^^^^ @@ -897,17 +894,11 @@ Arm and AArch64 Support - New AArch64 asm constraints have been added for r8-r11(Uci) and r12-r15(Ucj). -- Support has been added for the following processors (-mcpu identifiers in parenthesis): + Support has been added for the following processors (-mcpu identifiers in parenthesis): - For Arm: - - * Cortex-M52 (cortex-m52). - - For AArch64: - - * Cortex-A520 (cortex-a520). - * Cortex-A720 (cortex-a720). - * Cortex-X4 (cortex-x4). + * Arm Cortex-A520 (cortex-a520). + * Arm Cortex-A720 (cortex-a720). + * Arm Cortex-X4 (cortex-x4). Android Support ^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 85656c00c5b3e..61788cb5cd9bd 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -1296,11 +1296,6 @@ def SVCREATE_3_BF16 : SInst<"svcreate3[_{d}]", "3ddd", "b", MergeNone, "", [IsT def SVCREATE_4_BF16 : SInst<"svcreate4[_{d}]", "4dddd", "b", MergeNone, "", [IsTupleCreate]>; } -let TargetGuard = "sve2p1" in { - def SVCREATE_2_B : SInst<"svcreate2[_{d}]", "2dd", "Pc", MergeNone, "", [IsTupleCreate]>; - def SVCREATE_4_B : SInst<"svcreate4[_{d}]", "4dddd", "Pc", MergeNone, "", [IsTupleCreate]>; -} - //////////////////////////////////////////////////////////////////////////////// // Vector insertion and extraction def SVGET_2 : SInst<"svget2[_{d}]", "d2i", "csilUcUsUiUlhfd", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_1>]>; @@ -1321,13 +1316,6 @@ def SVSET_3_BF16 : SInst<"svset3[_{d}]", "33id", "b", MergeNone, "", [IsTupleSet def SVSET_4_BF16 : SInst<"svset4[_{d}]", "44id", "b", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_3>]>; } -let TargetGuard = "sve2p1" in { - def SVGET_2_B : SInst<"svget2[_{d}]", "d2i", "Pc", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_1>]>; - def SVGET_4_B : SInst<"svget4[_{d}]", "d4i", "Pc", MergeNone, "", [IsTupleGet], [ImmCheck<1, ImmCheck0_3>]>; - - def SVSET_2_B : SInst<"svset2[_{d}]", "22id", "Pc", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_1>]>; - def SVSET_4_B : SInst<"svset4[_{d}]", "44id", "Pc", MergeNone, "", [IsTupleSet], [ImmCheck<1, ImmCheck0_3>]>; -} //////////////////////////////////////////////////////////////////////////////// // SVE2 WhileGE/GT let TargetGuard = "sve2" in { @@ -2161,9 +2149,6 @@ let TargetGuard = "sme2" in { def SVSQRSHRU_X4 : SInst<"svqrshru[_n]_{0}[_{d}_x4]", "b4i", "il", MergeNone, "aarch64_sve_sqrshru_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>; def SVSQRSHRUN_X4 : SInst<"svqrshrun[_n]_{0}[_{d}_x4]", "b4i", "il", MergeNone, "aarch64_sve_sqrshrun_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>; - - 
def REINTERPRET_SVBOOL_TO_SVCOUNT : Inst<"svreinterpret[_c]", "}P", "Pc", MergeNone, "", [IsStreamingCompatible], []>; - def REINTERPRET_SVCOUNT_TO_SVBOOL : Inst<"svreinterpret[_b]", "P}", "Pc", MergeNone, "", [IsStreamingCompatible], []>; } let TargetGuard = "sve2p1" in { diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp index 4384ace6b6be5..9cf206ecc212a 100644 --- a/clang/lib/AST/Interp/InterpBuiltin.cpp +++ b/clang/lib/AST/Interp/InterpBuiltin.cpp @@ -34,19 +34,6 @@ PrimType getIntPrimType(const InterpState &S) { llvm_unreachable("Int isn't 16 or 32 bit?"); } -PrimType getLongPrimType(const InterpState &S) { - const TargetInfo &TI = S.getCtx().getTargetInfo(); - unsigned LongWidth = TI.getLongWidth(); - - if (LongWidth == 64) - return PT_Sint64; - else if (LongWidth == 32) - return PT_Sint32; - else if (LongWidth == 16) - return PT_Sint16; - llvm_unreachable("long isn't 16, 32 or 64 bit?"); -} - /// Peek an integer value from the stack into an APSInt. static APSInt peekToAPSInt(InterpStack &Stk, PrimType T, size_t Offset = 0) { if (Offset == 0) @@ -123,19 +110,6 @@ static void pushAPSInt(InterpState &S, const APSInt &Val) { } } -/// Pushes \p Val to the stack, as a target-dependent 'long'. -static void pushLong(InterpState &S, int64_t Val) { - PrimType LongType = getLongPrimType(S); - if (LongType == PT_Sint64) - S.Stk.push<Integral<64, true>>(Integral<64, true>::from(Val)); - else if (LongType == PT_Sint32) - S.Stk.push<Integral<32, true>>(Integral<32, true>::from(Val)); - else if (LongType == PT_Sint16) - S.Stk.push<Integral<16, true>>(Integral<16, true>::from(Val)); - else - llvm_unreachable("Long isn't 16, 32 or 64 bit?"); -} - static void pushSizeT(InterpState &S, uint64_t Val) { const TargetInfo &TI = S.getCtx().getTargetInfo(); unsigned SizeTWidth = TI.getTypeWidth(TI.getSizeType()); @@ -559,26 +533,6 @@ static bool interp__builtin_classify_type(InterpState &S, CodePtr OpPC, return true; } -// __builtin_expect(long, long) -// __builtin_expect_with_probability(long, long, double) -static bool interp__builtin_expect(InterpState &S, CodePtr OpPC, - const InterpFrame *Frame, - const Function *Func, const CallExpr *Call) { - // The return value is simply the value of the first parameter. - // We ignore the probability. - unsigned NumArgs = Call->getNumArgs(); - assert(NumArgs == 2 || NumArgs == 3); - - PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); - unsigned Offset = align(primSize(getLongPrimType(S))) * 2; - if (NumArgs == 3) - Offset += align(primSize(PT_Float)); - - APSInt Val = peekToAPSInt(S.Stk, ArgT, Offset); - pushLong(S, Val.getSExtValue()); - return true; -} - bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, const CallExpr *Call) { InterpFrame *Frame = S.Current; @@ -748,12 +702,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, return false; break; - case Builtin::BI__builtin_expect: - case Builtin::BI__builtin_expect_with_probability: - if (!interp__builtin_expect(S, OpPC, Frame, F, Call)) - return false; - break; - default: return false; } diff --git a/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp b/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp index 2a7bfce535015..7430ef599e032 100644 --- a/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp +++ b/clang/lib/Analysis/FlowSensitive/HTMLLogger.cpp @@ -351,10 +351,9 @@ class HTMLLogger : public Logger { if (Invalid) return; + static constexpr unsigned Missing = -1; // TokenInfo stores the BB and set of elements that a token is part of.
struct TokenInfo { - enum : unsigned { Missing = static_cast<unsigned>(-1) }; - // The basic block this is part of. // This is the BB of the stmt with the smallest containing range. unsigned BB = Missing; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 83d0a72aac549..0d8b3e4aaad47 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10211,22 +10211,6 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, switch (BuiltinID) { default: return nullptr; - - case SVE::BI__builtin_sve_svreinterpret_b: { - auto SVCountTy = - llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); - Function *CastFromSVCountF = - CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool, SVCountTy); - return Builder.CreateCall(CastFromSVCountF, Ops[0]); - } - case SVE::BI__builtin_sve_svreinterpret_c: { - auto SVCountTy = - llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); - Function *CastToSVCountF = - CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, SVCountTy); - return Builder.CreateCall(CastToSVCountF, Ops[0]); - } - case SVE::BI__builtin_sve_svpsel_lane_b8: case SVE::BI__builtin_sve_svpsel_lane_b16: case SVE::BI__builtin_sve_svpsel_lane_b32: diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp index dc288bc3f6157..5906b14dd93cf 100644 --- a/clang/lib/CodeGen/CodeGenTBAA.cpp +++ b/clang/lib/CodeGen/CodeGenTBAA.cpp @@ -196,14 +196,11 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) { // Enum types are distinct types. In C++ they have "underlying types", // however they aren't related for TBAA. if (const EnumType *ETy = dyn_cast<EnumType>(Ty)) { - if (!Features.CPlusPlus) - return getTypeInfo(ETy->getDecl()->getIntegerType()); - // In C++ mode, types have linkage, so we can rely on the ODR and // on their mangled names, if they're external. // TODO: Is there a way to get a program-wide unique name for a // decl with local linkage or no linkage? - if (!ETy->getDecl()->isExternallyVisible()) + if (!Features.CPlusPlus || !ETy->getDecl()->isExternallyVisible()) return getChar(); SmallString<256> OutName; diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index f7f096762e91a..40b53dd180c79 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -84,7 +84,6 @@ OpenACCAtomicKind getOpenACCAtomicKind(Token Tok) { } enum class OpenACCSpecialTokenKind { - ReadOnly, DevNum, Queues, }; @@ -94,8 +93,6 @@ bool isOpenACCSpecialToken(OpenACCSpecialTokenKind Kind, Token Tok) { return false; switch (Kind) { - case OpenACCSpecialTokenKind::ReadOnly: - return Tok.getIdentifierInfo()->isStr("readonly"); case OpenACCSpecialTokenKind::DevNum: return Tok.getIdentifierInfo()->isStr("devnum"); case OpenACCSpecialTokenKind::Queues: @@ -206,7 +203,7 @@ OpenACCDirectiveKind ParseOpenACCDirectiveKind(Parser &P) { // Just #pragma acc can get us immediately to the end, make sure we don't // introspect on the spelling before then.
- if (FirstTok.isNot(tok::identifier)) { + if (FirstTok.isAnnotation()) { P.Diag(FirstTok, diag::err_acc_missing_directive); return OpenACCDirectiveKind::Invalid; } @@ -224,8 +221,11 @@ OpenACCDirectiveKind ParseOpenACCDirectiveKind(Parser &P) { if (ExDirKind >= OpenACCDirectiveKindEx::Invalid) { switch (ExDirKind) { case OpenACCDirectiveKindEx::Invalid: { - P.Diag(FirstTok, diag::err_acc_invalid_directive) - << 0 << FirstTok.getIdentifierInfo(); + if (!FirstTok.is(tok::identifier)) + P.Diag(FirstTok, diag::err_expected) << tok::identifier; + else + P.Diag(FirstTok, diag::err_acc_invalid_directive) + << 0 << FirstTok.getIdentifierInfo(); return OpenACCDirectiveKind::Invalid; } case OpenACCDirectiveKindEx::Enter: @@ -422,7 +422,8 @@ void Parser::ParseOpenACCCacheVarList() { // specifications. First, see if we have `readonly:`, else we back-out and // treat it like the beginning of a reference to a potentially-existing // `readonly` variable. - if (isOpenACCSpecialToken(OpenACCSpecialTokenKind::ReadOnly, Tok) && + if (getCurToken().is(tok::identifier) && + getCurToken().getIdentifierInfo()->isStr("readonly") && NextToken().is(tok::colon)) { // Consume both tokens. ConsumeToken(); diff --git a/clang/test/AST/Interp/builtin-functions.cpp b/clang/test/AST/Interp/builtin-functions.cpp index 35a1f9a75092a..0726dab37cb4e 100644 --- a/clang/test/AST/Interp/builtin-functions.cpp +++ b/clang/test/AST/Interp/builtin-functions.cpp @@ -331,11 +331,3 @@ namespace bitreverse { char bitreverse3[__builtin_bitreverse32(0x12345678) == 0x1E6A2C48 ? 1 : -1]; char bitreverse4[__builtin_bitreverse64(0x0123456789ABCDEFULL) == 0xF7B3D591E6A2C480 ? 1 : -1]; } - -namespace expect { - constexpr int a() { - return 12; - } - static_assert(__builtin_expect(a(),1) == 12, ""); - static_assert(__builtin_expect_with_probability(a(), 1, 1.0) == 12, ""); -} diff --git a/clang/test/CXX/drs/dr10xx.cpp b/clang/test/CXX/drs/dr10xx.cpp index 77c59078414c6..f30ed1cb3e496 100644 --- a/clang/test/CXX/drs/dr10xx.cpp +++ b/clang/test/CXX/drs/dr10xx.cpp @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -std=c++98 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++11 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++14 %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++98 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++11 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++14 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++17 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++20 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++23 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors namespace std { __extension__ typedef __SIZE_TYPE__ size_t; @@ -18,40 +18,30 @@ namespace dr1004 { // dr1004: 5 template<typename> struct A {}; template<typename> struct B1 {}; template<template<typename> class> struct B2 {}; - template<typename X> void f(); // #dr1004-f-1 - template<template<typename> class X> void f(); // #dr1004-f-2 - template<template<typename> class X> void g(); // #dr1004-g-1 - template<typename X> void g(); // #dr1004-g-2 + template<typename X> void f(); // expected-note {{[with X = dr1004::A<int>]}} + template<template<typename> class X> void f(); // expected-note {{[with X = dr1004::A]}} + template<template<typename> class X> void g(); // expected-note {{[with X = dr1004::A]}} + template<typename X> void g(); // expected-note {{[with X = dr1004::A<int>]}} struct C : A<int> { B1<int> b1a; B2<A> b2a; void h() { - f<A<int> >(); - // expected-error@-1 {{call to 'f' is ambiguous}} - // expected-note@#dr1004-f-1 {{candidate function [with X = dr1004::A<int>]}} - // expected-note@#dr1004-f-2 {{candidate function [with X = dr1004::A]}} - g<A<int> >(); - // expected-error@-1 {{call to 'g' is ambiguous}} - // expected-note@#dr1004-g-1 {{candidate function [with X = dr1004::A]}} - // expected-note@#dr1004-g-2 {{candidate function [with X = dr1004::A<int>]}} + f<A<int> >(); // expected-error {{ambiguous}} + g<A<int> >(); // expected-error {{ambiguous}} } }; // This example (from the standard) is actually ill-formed, because // name lookup of "T::template A" names the constructor. - template<class T, template<class> class U = T::template A> struct Third { }; - // expected-error@-1 {{is a constructor name}} - // expected-note@#dr1004-t {{in instantiation of default argument}} - Third<A<int> > t; // #dr1004-t + template<class T, template<class> class U = T::template A> struct Third { }; // expected-error {{is a constructor name}} + Third<A<int> > t; // expected-note {{in instantiation of default argument}} } namespace dr1042 { // dr1042: 3.5 #if __cplusplus >= 201402L // C++14 added an attribute that we can test the semantics of. - using foo [[deprecated]] = int; // #dr1042-using - foo f = 12; - // since-cxx14-warning@-1 {{'foo' is deprecated}} - // since-cxx14-note@#dr1042-using {{'foo' has been explicitly marked deprecated here}} + using foo [[deprecated]] = int; // expected-note {{'foo' has been explicitly marked deprecated here}} + foo f = 12; // expected-warning {{'foo' is deprecated}} #elif __cplusplus >= 201103L // C++11 did not have any attributes that could be applied to an alias // declaration, so the best we can test is that we accept an empty attribute @@ -86,8 +76,7 @@ namespace dr1054 { // dr1054: no // which copy-initializes a temporary from 'a'. Therefore this is // ill-formed because A does not have a volatile copy constructor. // (We might want to track this aspect under dr1383 instead?)
- a; - // expected-warning@-1 {{expression result unused; assign into a variable to force a volatile load}} + a; // expected-warning {{assign into a variable to force a volatile load}} } } diff --git a/clang/test/CXX/drs/dr11xx.cpp b/clang/test/CXX/drs/dr11xx.cpp index 86e726ae8c741..2e35535a18c4a 100644 --- a/clang/test/CXX/drs/dr11xx.cpp +++ b/clang/test/CXX/drs/dr11xx.cpp @@ -1,30 +1,30 @@ -// RUN: %clang_cc1 -std=c++98 %s -verify=expected,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++11 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++14 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++17 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++2a %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++98 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++11 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++14 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++17 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++2a %s -verify -fexceptions -fcxx-exceptions -pedantic-errors namespace dr1111 { // dr1111: 3.2 namespace example1 { -template <typename> struct set; // #dr1111-struct-set +template <typename> struct set; struct X { - template <typename T> void set(const T &value); // #dr1111-func-set + template <typename T> void set(const T &value); }; void foo() { X x; - // FIXME: should we backport C++11 behavior? +#pragma clang diagnostic push +#if __cplusplus < 201103L +#pragma clang diagnostic ignored "-Wambiguous-member-template" +#endif x.set<double>(3.2); - // cxx98-error@-1 {{lookup of 'set' in member access expression is ambiguous; using member of 'X'}} - // cxx98-note@#dr1111-func-set {{lookup in the object type 'X' refers here}} - // cxx98-note@#dr1111-struct-set {{lookup from the current scope refers here}} +#pragma clang diagnostic pop } struct Y {}; void bar() { Y y; - y.set<double>(3.2); - // expected-error@-1 {{no member named 'set' in 'dr1111::example1::Y'}} + y.set<double>(3.2); // expected-error {{no member named 'set' in 'dr1111::example1::Y'}} } } // namespace example1 @@ -46,10 +46,8 @@ void baz() { namespace dr1113 { // dr1113: partial namespace named { - extern int a; // #dr1113-a - static int a; - // expected-error@-1 {{static declaration of 'a' follows non-static}} - // expected-note@#dr1113-a {{previous declaration is here}} + extern int a; // expected-note {{previous}} + static int a; // expected-error {{static declaration of 'a' follows non-static}} } namespace { extern int a; diff --git a/clang/test/CXX/drs/dr12xx.cpp b/clang/test/CXX/drs/dr12xx.cpp index adf7f56711c45..339712be45b38 100644 --- a/clang/test/CXX/drs/dr12xx.cpp +++ b/clang/test/CXX/drs/dr12xx.cpp @@ -1,9 +1,9 @@ -// RUN: %clang_cc1 -std=c++98 %s -verify=expected,cxx98-14,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++11 %s -verify=expected,cxx98-14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++14 %s -verify=expected,cxx98-14,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx17,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx17,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx17,since-cxx14,since-cxx11,since-cxx23 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++98 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++11 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++14 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++17 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++20 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++23 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors // dr1200: na @@ -40,33 +40,28 @@ struct S { S* operator()(); int N; int M; -#if __cplusplus >= 202302L +#if __cplusplus > 202002L template <typename T> static constexpr auto V = 0; void f(char); void f(int); void mem(S s) { - auto(s)()->M; - // since-cxx23-warning@-1 {{expression result unused}} - auto(s)()->V<int>; - // since-cxx23-warning@-1 {{expression result unused}} + auto(s)()->M; //expected-warning {{expression result unused}} + auto(s)()->V<int>; //expected-warning {{expression result unused}} auto(s)()->f(0); } #endif }; void f(S s) { { -#if __cplusplus >= 202302L - auto(s)()->N; - //since-cxx23-warning@-1 {{expression result unused}} +#if __cplusplus > 202002L + auto(s)()->N; //expected-warning {{expression result unused}} #endif auto(s)()->M; } { - S(s)()->N; - // since-cxx11-warning@-1 {{expression result unused}} - S(s)()->M; - // since-cxx11-warning@-1 {{expression result unused}} + S(s)()->N; //expected-warning {{expression result unused}} + S(s)()->M; //expected-warning {{expression result unused}} } } @@ -79,8 +74,7 @@ void g() { A a(B ()->C); A b(auto ()->C); static_assert(sizeof(B ()->C[1] == sizeof(int)), ""); - sizeof(auto () -> C[1]); - // since-cxx11-error@-1 {{function cannot return array type 'C[1]' (aka 'dr1223::BB[1]')}} + sizeof(auto () -> C[1]); // expected-error{{function cannot return array type 'C[1]'}} } } @@ -88,18 +82,15 @@ void g() { #if __cplusplus >= 201103L namespace dr1227 { // dr1227: 3.0 -template <typename T> struct A { using X = typename T::X; }; -// since-cxx11-error@-1 {{type 'int' cannot be used prior to '::' because it has no members}} -// since-cxx11-note@#dr1227-g {{in instantiation of template class 'dr1227::A<int>' requested here}} -// since-cxx11-note@#dr1227-g-int {{while substituting explicitly-specified template arguments into function template 'g'}} +template <typename T> struct A { using X = typename T::X; }; // expected-error {{type 'int' cannot be used prior to '::' because it has no members}} template <typename T> typename T::X f(typename A<T>::X); template <typename T> void f(...) { } template <typename T> auto g(typename A<T>::X) -> typename T::X; // expected-note {{in instantiation of template class 'dr1227::A<int>' requested here}} template <typename T> void g(...)
{ } void h() { f<int>(0); // OK, substituting return type causes deduction to fail g<int>(0); // #dr1227-g-int } } #endif @@ -118,21 +109,15 @@ struct Derived : Base { namespace dr1265 { // dr1265: 5 #if __cplusplus >= 201103L - auto a = 0, b() -> int; - // since-cxx11-error@-1 {{declaration with trailing return type must be the only declaration in its group}} - auto b() -> int, d = 0; - // since-cxx11-error@-1 {{declaration with trailing return type must be the only declaration in its group}} - auto e() -> int, f() -> int; - // since-cxx11-error@-1 {{declaration with trailing return type must be the only declaration in its group}} + auto a = 0, b() -> int; // expected-error {{declaration with trailing return type must be the only declaration in its group}} + auto b() -> int, d = 0; // expected-error {{declaration with trailing return type must be the only declaration in its group}} + auto e() -> int, f() -> int; // expected-error {{declaration with trailing return type must be the only declaration in its group}} #endif #if __cplusplus >= 201402L - auto g(), h = 0; - // since-cxx14-error@-1 {{function with deduced return type must be the only declaration in its group}} - auto i = 0, j(); - // since-cxx14-error@-1 {{function with deduced return type must be the only declaration in its group}} - auto k(), l(); - // since-cxx14-error@-1 {{function with deduced return type must be the only declaration in its group}} + auto g(), h = 0; // expected-error {{function with deduced return type must be the only declaration in its group}} + auto i = 0, j(); // expected-error {{function with deduced return type must be the only declaration in its group}} + auto k(), l(); // expected-error {{function with deduced return type must be the only declaration in its group}} #endif } @@ -145,16 +130,16 @@ namespace dr1295 { // dr1295: 4 X x = {1}; - unsigned const &r1 = static_cast<X&&>(x).bitfield; - // cxx98-error@-1 {{rvalue references are a C++11 extension}} - unsigned const &r2 = static_cast<unsigned&&>(x.bitfield); - // cxx98-error@-1 {{rvalue references are a C++11 extension}} + unsigned const &r1 = static_cast<X&&>(x).bitfield; // expected-error 0-1{{C++11}} + unsigned const &r2 = static_cast<unsigned&&>(x.bitfield); // expected-error 0-1{{C++11}} - template<unsigned &r> struct Y {}; // #dr1295-Y - Y<x.bitfield> y; // #dr1295-y - // cxx98-14-error@-1 {{non-type template argument does not refer to any declaration}} - // cxx98-14-note@#dr1295-Y {{template parameter is declared here}} - // since-cxx17-error@#dr1295-y {{non-type template argument refers to subobject 'x.bitfield'}} + template<unsigned &r> struct Y {}; + Y<x.bitfield> y; +#if __cplusplus <= 201402L + // expected-error@-2 {{does not refer to any declaration}} expected-note@-3 {{here}} +#else + // expected-error@-4 {{refers to subobject}} #endif #if __cplusplus >= 201103L const unsigned other = 0; diff --git a/clang/test/CXX/drs/dr13xx.cpp b/clang/test/CXX/drs/dr13xx.cpp index 359c04b3e0f3d..abee100961602 100644 --- a/clang/test/CXX/drs/dr13xx.cpp +++ b/clang/test/CXX/drs/dr13xx.cpp @@ -1,10 +1,7 @@ -// RUN: %clang_cc1 -std=c++98 %s -verify=expected,cxx98-14,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++11 %s -verify=expected,cxx11-17,cxx11-14,cxx98-14,since-cxx11,cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++14 %s -verify=expected,cxx11-17,cxx11-14,since-cxx14,cxx98-14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1
-std=c++17 %s -verify=expected,cxx11-17,since-cxx14,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx14,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx14,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx14,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++98 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++11 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++14 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++17 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors __extension__ typedef __SIZE_TYPE__ size_t; @@ -18,12 +15,10 @@ namespace std { #if __cplusplus >= 201103L namespace dr1305 { // dr1305: 3.0 -struct Incomplete; // #dr1305-Incomplete +struct Incomplete; // expected-note {{forward declaration of 'dr1305::Incomplete'}} struct Complete {}; -int incomplete = alignof(Incomplete(&)[]); -// since-cxx11-error@-1 {{invalid application of 'alignof' to an incomplete type 'Incomplete'}} -// since-cxx11-note@#dr1305-Incomplete {{forward declaration of 'dr1305::Incomplete'}} +int incomplete = alignof(Incomplete(&)[]); // expected-error {{invalid application of 'alignof' to an incomplete type 'Incomplete'}} int complete = alignof(Complete(&)[]); } #endif @@ -41,11 +36,9 @@ void caller() { } // namespace dr1307 namespace dr1310 { // dr1310: 5 - struct S {} * sp = new S::S; - // expected-error@-1 {{qualified reference to 'S' is a constructor name rather than a type in this context}} + struct S {} * sp = new S::S; // expected-error {{qualified reference to 'S' is a constructor name}} void f() { - S::S(a); - // expected-error@-1 {{qualified reference to 'S' is a constructor name rather than a type in this context}} + S::S(a); // expected-error {{qualified reference to 'S' is a constructor name}} } struct T { int n; typedef int U; typedef T V; }; int k = T().T::T::n; @@ -71,67 +64,39 @@ namespace dr1310 { // dr1310: 5 template<typename T> struct W : WBase<T> { typedef int X; int n; }; void w_test() { - W<int>::W w1a; - // expected-error@-1 {{qualified reference to 'W' is a constructor name rather than a type in this context}} + W<int>::W w1a; // expected-error {{qualified reference to 'W' is a constructor name}} W<int>::W::X w1ax; - W<int>::W<int> w1b; - // expected-error@-1 {{qualified reference to 'W' is a constructor name rather than a template name in this context}} + W<int>::W<int> w1b; // expected-error {{qualified reference to 'W' is a constructor name}} W<int>::W<int>::X w1bx; - typename W<int>::W w2a; - // expected-error@-1 {{ISO C++ specifies that qualified reference to 'W' is a constructor name rather than a type in this context, despite preceding 'typename' keyword}} - // cxx98-error@-2 {{'typename' occurs outside of a template}} - typename W<int>::W::X w2ax; - // cxx98-error@-1 {{'typename' occurs outside of a template}} - typename W<int>::W<int> w2b; - // expected-error@-1 {{ISO C++ specifies that qualified reference to 'W' is a constructor name rather than a template name in this context, despite preceding 'typename' keyword}} - // cxx98-error@-2 {{'typename' occurs outside of a template}} - typename W<int>::W<int>::X w2bx; - // cxx98-error@-1 {{'typename' occurs outside of a template}} - W<int>::template W<int> w3; - // expected-error@-1 {{ISO C++ specifies that qualified reference to 'W' is a constructor name rather than a template name in this context, despite preceding 'template' keyword}} - // cxx98-error@-2 {{'template' keyword outside of a template}} - W<int>::template W<int>::X w3x; - // cxx98-error@-1 {{'template' keyword outside of a template}} - typename W<int>::template W<int> w4; - // expected-error@-1 {{ISO C++ specifies that qualified reference to 'W' is a constructor name rather than a template name in this context, despite preceding 'template' keyword}} - // cxx98-error@-2 {{'template' keyword outside of a template}} - // cxx98-error@-3 {{'typename' occurs outside of a template}} - typename W<int>::template W<int>::X w4x; - // cxx98-error@-1 {{'template' keyword outside of a template}} - // cxx98-error@-2 {{'typename' occurs outside of a template}} - - TT<W<int>::W> tt1; - // expected-error@-1 {{qualified reference to 'W' is a constructor name rather than a type in this context}} - TTy<W<int>::W> tt1a; - // expected-error@-1 {{qualified reference to 'W' is a constructor name rather than a type in this context}} - TT<W<int>::template W> tt2; - // expected-error@-1 {{ISO C++ specifies that qualified reference to 'W' is a constructor name rather than a template name in this context, despite preceding 'template' keyword}} - // cxx98-error@-2 {{'template' keyword outside of a template}} + typename W<int>::W w2a; // expected-error {{qualified reference to 'W' is a constructor name}} expected-error 0-1{{outside of a template}} + typename W<int>::W::X w2ax; // expected-error 0-1{{outside of a template}} + typename W<int>::W<int> w2b; // expected-error {{qualified reference to 'W' is a constructor name}} expected-error 0-1{{outside of a template}} + typename W<int>::W<int>::X w2bx; // expected-error 0-1{{outside of a template}} + W<int>::template W<int> w3; // expected-error {{qualified reference to 'W' is a constructor name}} expected-error 0-1{{outside of a template}} + W<int>::template W<int>::X w3x; // expected-error 0-1{{outside of a template}} + typename W<int>::template W<int> w4; // expected-error {{qualified reference to 'W' is a constructor name}} expected-error 0-2{{outside of a template}} + typename W<int>::template W<int>::X w4x; // expected-error 0-2{{outside of a template}} + + TT<W<int>::W> tt1; // expected-error {{qualified reference to 'W' is a constructor name}} + TTy<W<int>::W> tt1a; // expected-error {{qualified reference to 'W' is a constructor name}} + TT<W<int>::template W> tt2; // expected-error {{qualified reference to 'W' is a constructor name}} expected-error 0-1{{outside of a template}} TT<W<int>::WBase> tt3; TTy<W<int>::WBase> tt3a; - TT<W<int>::template WBase> tt4; - // cxx98-error@-1 {{'template' keyword outside of a template}} + TT<W<int>::template WBase> tt4; // expected-error 0-1{{outside of a template}} W<int> w; (void)w.W::W::n; (void)w.W<int>::W::n; (void)w.W<int>::W<int>::n; - (void)w.W::template W<int>::n; - // cxx98-error@-1 {{'template' keyword outside of a template}} + (void)w.W::template W<int>::n; // expected-error 0-1{{outside of a template}} } template<typename T> void wt_test() { - typename W<T>::W w2a; - // expected-error@-1 {{ISO C++ specifies that qualified reference to 'W' is a constructor name rather than a type in this context, despite preceding 'typename' keyword}} - // cxx98-note@#dr1310-W-int {{in instantiation of function template specialization 'dr1310::wt_test<W<int> >' requested here}} - // since-cxx11-note@#dr1310-W-int {{in instantiation of function template specialization 'dr1310::wt_test<W<int>>' requested here}} - typename W<T>::template W<T> w4; - // expected-error@-1 {{ISO C++ specifies that qualified reference to 'W' is a constructor name rather than a template name in this context, despite preceding 'template' keyword}} - TTy<typename W<T>::W> tt2; - // expected-error@-1 {{ISO C++ specifies that qualified reference to 'W' is a constructor name rather than a type in this context, despite preceding 'typename' keyword}} - TT<W<T>::template W> tt3; - // expected-error@-1 {{ISO C++ specifies that qualified reference to 'W' is a constructor name rather than a template name in this context, despite preceding 'template' keyword}} + typename W<T>::W w2a; // expected-error {{qualified reference to 'W' is a constructor name}} + typename W<T>::template W<T> w4; // expected-error {{qualified reference to 'W' is a constructor name}} + TTy<typename W<T>::W> tt2; // expected-error {{qualified reference to 'W' is a constructor name}} + TT<W<T>::template W> tt3; // expected-error {{qualified reference to 'W' is a constructor name}} } template<typename T> void wt_test_good() { @@ -146,19 +111,18 @@ namespace dr1310 { // dr1310: 5 (void)w.template W<T>::W::n; (void)w.template W<T>::template W<T>::n; } - template void wt_test<W<int> >(); // #dr1310-W-int + template void wt_test<W<int> >(); // expected-note {{instantiation of}} template void wt_test_good<W<int> >(); } namespace dr1315 { // dr1315: partial template<int I, int J> struct A {}; - template<int I> struct A<I + 5, I * 2> {}; - // expected-error@-1 {{class template partial specialization contains a template parameter that cannot be deduced; this partial specialization will never be used}} - // expected-note@-2 {{non-deducible template parameter 'I'}} + template<int I> // expected-note {{non-deducible template parameter 'I'}} struct A<I + 5, I * 2> {}; // expected-error {{contains a template parameter that cannot be deduced}} template<int I> struct A<I, I> {}; template<int I, int J, int K> struct B; - template<int I, int K> struct B<I, I * 2, K> {}; // #dr1315-B-1 + template<int I, int K> struct B<I, I * 2, K> {}; // expected-note {{matches}} B<1, 2, 3> b1; // Multiple declarations with the same dependent expression are equivalent @@ -167,11 +131,8 @@ namespace dr1315 { // dr1315: partial B<1, 2, 2>::type b2; // Multiple declarations with differing dependent expressions are unordered. - template<int I, int K> struct B<I, 2 * I, K> {}; // #dr1315-B-2 - B<1, 2, 4> b3; - // expected-error@-1 {{ambiguous partial specializations of 'B<1, 2, 4>'}} - // expected-note@#dr1315-B-1 {{partial specialization matches [with I = 1, K = 4]}} - // expected-note@#dr1315-B-2 {{partial specialization matches [with I = 1, K = 4]}} + template<int I, int K> struct B<I, 2 * I, K> {}; // expected-note {{matches}} + B<1, 2, 4> b3; // expected-error {{ambiguous}} // FIXME: Under dr1315, this is perhaps valid, but that is not clear: this // fails the "more specialized than the primary template" test because the @@ -185,9 +146,7 @@ namespace dr1315 { // dr1315: partial namespace dr1330 { // dr1330: 4 c++11 // exception-specifications are parsed in a context where the class is complete.
struct A { - void f() throw(T) {} - // since-cxx17-error@-1 {{ISO C++17 does not allow dynamic exception specifications}} - // since-cxx17-note@-2 {{use 'noexcept(false)' instead}} + void f() throw(T) {} // expected-error 0-1{{C++17}} expected-note 0-1{{noexcept}} struct T {}; #if __cplusplus >= 201103L @@ -197,12 +156,8 @@ namespace dr1330 { // dr1330: 4 c++11 #endif }; - void (A::*af1)() throw(A::T) = &A::f; - // since-cxx17-error@-1 {{ISO C++17 does not allow dynamic exception specifications}} - // since-cxx17-note@-2 {{use 'noexcept(false)' instead}} - void (A::*af2)() throw() = &A::f; - // cxx98-14-error@-1 {{target exception specification is not superset of source}} - // since-cxx17-error@-2 {{cannot initialize a variable of type 'void (dr1330::A::*)() throw()' with an rvalue of type 'void (dr1330::A::*)() throw(T)': different exception specifications}} + void (A::*af1)() throw(A::T) = &A::f; // expected-error 0-1{{C++17}} expected-note 0-1{{noexcept}} + void (A::*af2)() throw() = &A::f; // expected-error-re {{{{not superset|different exception spec}}}} #if __cplusplus >= 201103L static_assert(noexcept(A().g()), ""); @@ -211,9 +166,7 @@ namespace dr1330 { // dr1330: 4 c++11 // Likewise, they're instantiated separately from an enclosing class template. template<typename T, typename U> struct B { - void f() throw(T, typename U::type) {} - // since-cxx17-error@-1 {{ISO C++17 does not allow dynamic exception specifications}} - // since-cxx17-note@-2 {{use 'noexcept(false)' instead}} + void f() throw(T, typename U::type) {} // expected-error 0-1{{C++17}} expected-note 0-1{{noexcept}} struct T {}; #if __cplusplus >= 201103L @@ -230,6 +183,9 @@ static const int value = true; }; + void (B<P, P>::*bpf1)() throw(B<P, P>::T, int) = &B<P, P>::f; // expected-error 0-1{{C++17}} expected-note 0-1{{noexcept}} +#if __cplusplus < 201103L + // expected-error@-2 {{not superset}} // FIXME: We only delay instantiation in C++11 onwards. In C++98, something // weird happens: instantiation of B<P,P> fails because it references T before // it's instantiated, but the diagnostic is suppressed in @@ -237,20 +193,20 @@ // obviously a bad way to react to this situation; we should still producing // the "T has not yet been instantiated" error here, rather than giving // confusing errors later on. - void (B<P, P>::*bpf1)() throw(B<P, P>::T, int) = &B<P, P>::f; - // since-cxx17-error@-1 {{ISO C++17 does not allow dynamic exception specifications}} - // since-cxx17-note@-2 {{use 'noexcept(false)' instead}} - // cxx98-error@-3 {{target exception specification is not superset of source}} - - void (B<P, P>::*bpf2)() throw(int) = &B<P, P>::f; - // since-cxx17-error@-1 {{ISO C++17 does not allow dynamic exception specifications}} - // since-cxx17-note@-2 {{use 'noexcept(false)' instead}} - // cxx98-14-error@-3 {{target exception specification is not superset of source}} - // since-cxx17-warning@-4 {{target exception specification is not superset of source}} +#endif + void (B<P, P>::*bpf2)() throw(int) = &B<P, P>::f; // expected-error 0-1{{C++17}} expected-note 0-1{{noexcept}} +#if __cplusplus <= 201402L + // expected-error@-2 {{not superset}} +#else + // expected-warning@-4 {{not superset}} +#endif void (B<P, P>::*bpf3)() = &B<P, P>::f; void (B<P, P>::*bpf4)() throw() = &B<P, P>::f; - // cxx98-14-error@-1 {{target exception specification is not superset of source}} - // since-cxx17-error@-2 {{cannot initialize a variable of type 'void (B<P, P>::*)() throw()' with an rvalue of type 'void (dr1330::B<P, P>::*)() throw(T, typename P::type)': different exception specifications}} +#if __cplusplus <= 201402L + // expected-error@-2 {{not superset}} +#else + // expected-error@-4 {{different exception specifications}} +#endif #if __cplusplus >= 201103L static_assert(noexcept(B<P, P>
().g()), ""); @@ -258,64 +214,57 @@ static_assert(!noexcept(B().g()), ""); #endif - template<typename T> int f() throw(typename T::error) { return 0; } // #dr1330-f - // expected-error@#dr1330-f {{type 'int' cannot be used prior to '::' because it has no members}} - // cxx98-note@#dr1330-f-int {{in instantiation of function template specialization 'dr1330::f<int>' requested here}} - // since-cxx11-note@#dr1330-f-int {{in instantiation of exception specification for 'f<int>' requested here}} - // cxx98-14-error@#dr1330-f {{type 'short' cannot be used prior to '::' because it has no members}} - // cxx98-14-note@#dr1330-f-short {{in instantiation of function template specialization 'dr1330::f<short>' requested here}} - // cxx11-14-note@#dr1330-f {{in instantiation of exception specification for 'f<short>' requested here}} - // since-cxx11-error@#dr1330-f {{type 'char' cannot be used prior to '::' because it has no members}} - // since-cxx11-note@#dr1330-f-char {{in instantiation of exception specification for 'f<char>' requested here}} - // since-cxx11-error@#dr1330-f {{type 'float' cannot be used prior to '::' because it has no members}} - // since-cxx11-note@#dr1330-f-float {{in instantiation of exception specification for 'f<float>' requested here}} - // since-cxx17-error@#dr1330-f {{ISO C++17 does not allow dynamic exception specifications}} - // since-cxx17-note@#dr1330-f {{use 'noexcept(false)' instead}} - + template<typename T> int f() throw(typename T::error) { return 0; } // expected-error 1-4{{prior to '::'}} expected-note 0-1{{prior to '::'}} expected-note 0-1{{requested here}} +#if __cplusplus > 201402L + // expected-error@-2 0-1{{C++17}} expected-note@-2 0-1{{noexcept}} +#endif // An exception-specification is needed even if the function is only used in // an unevaluated operand. - int f1 = sizeof(f<int>()); // #dr1330-f-int + int f1 = sizeof(f<int>()); // expected-note {{instantiation of}} #if __cplusplus >= 201103L - decltype(f<char>()) f2; // #dr1330-f-char - bool f3 = noexcept(f<float>()); /// #dr1330-f-float + decltype(f<char>()) f2; // expected-note {{instantiation of}} + bool f3 = noexcept(f<float>()); // expected-note {{instantiation of}} #endif // In C++17 onwards, substituting explicit template arguments into the // function type substitutes into the exception specification (because it's // part of the type). In earlier languages, we don't notice there's a problem // until we've already started to instantiate.
- template int f<short>(); // #dr1330-f-short - // since-cxx17-error@-1 {{explicit instantiation of 'f' does not refer to a function template, variable template, member function, member class, or static data member}} - // since-cxx17-note@#dr1330-f {{candidate template ignored: substitution failure [with T = short]: type 'short' cannot be used prior to '::' because it has no members}} + template int f<short>(); +#if __cplusplus >= 201703L + // expected-error@-2 {{does not refer to a function template}} +#else + // expected-note@-4 {{instantiation of}} +#endif template<typename T> struct C { - C() throw(typename T::type); // #dr1330-C - // since-cxx17-error@-1 {{ISO C++17 does not allow dynamic exception specifications}} - // since-cxx17-note@-2 {{use 'noexcept(false)' instead}} - // cxx98-error@#dr1330-C {{type 'void' cannot be used prior to '::' because it has no members}} - // cxx98-note@#dr1330-C-void {{in instantiation of template class 'dr1330::C<void>' requested here}} - // expected-error@#dr1330-C {{type 'int' cannot be used prior to '::' because it has no members}} - // cxx98-note@#dr1330-C-int {{in instantiation of template class 'dr1330::C<int>' requested here}} - // since-cxx11-note@#dr1330-C-int {{in instantiation of exception specification for 'C' requested here}} - // since-cxx11-note@#dr1330-e {{in evaluation of exception specification for 'dr1330::E::E' needed here}} + C() throw(typename T::type); // expected-error 1-2{{prior to '::'}} +#if __cplusplus > 201402L + // expected-error@-2 0-1{{C++17}} expected-note@-2 0-1{{noexcept}} +#endif }; - struct D : C<void> {}; // #dr1330-C-void + struct D : C<void> {}; // ok +#if __cplusplus < 201103L + // expected-note@-2 {{instantiation of}} +#endif void f(D &d) { d = d; } // ok - struct E : C<int> {}; // #dr1330-C-int - E e; // #dr1330-e + struct E : C<int> {}; // expected-note {{in instantiation of}} +#if __cplusplus >= 201103L + E e; // expected-note {{needed here}} +#endif } namespace dr1341 { // dr1341: sup P0683R1 #if __cplusplus >= 202002L int a; const int b = 0; // #dr1341-b-decl struct S { int x1 : 8 = 42; int x2 : 8 { 42 }; int y1 : true ? 8 : a = 42; int y2 : true ? 8 : b = 42; - // since-cxx20-error@-1 {{cannot assign to variable 'b' with const-qualified type 'const int'}} - // since-cxx20-note@#dr1341-b {{variable 'b' declared const here}} + // expected-error@-1 {{cannot assign to variable 'b' with const-qualified type 'const int'}} + // expected-note@#dr1341-b-decl {{variable 'b' declared const here}} int y3 : (true ?
8 : b) = 42; int z : 1 || new int { 0 }; }; @@ -323,66 +272,44 @@ } namespace dr1346 { // dr1346: 3.5 - auto a(1); - // cxx98-error@-1 {{'auto' type specifier is a C++11 extension}} - auto b(1, 2); - // cxx98-error@-1 {{'auto' type specifier is a C++11 extension}} - // expected-error@-2 {{initializer for variable 'b' with type 'auto' contains multiple expressions}} + auto a(1); // expected-error 0-1{{extension}} + auto b(1, 2); // expected-error {{multiple expressions}} expected-error 0-1{{extension}} #if __cplusplus >= 201103L - auto c({}); - // since-cxx11-error@-1 {{cannot deduce type for variable 'c' with type 'auto' from parenthesized initializer list}} - auto d({1}); - // since-cxx11-error@-1 {{cannot deduce type for variable 'd' with type 'auto' from parenthesized initializer list}} - auto e({1, 2}); - // since-cxx11-error@-1 {{cannot deduce type for variable 'e' with type 'auto' from parenthesized initializer list}} + auto c({}); // expected-error {{parenthesized initializer list}} + auto d({1}); // expected-error {{parenthesized initializer list}} + auto e({1, 2}); // expected-error {{parenthesized initializer list}} #endif - template<typename ...Ts> void f(Ts ...ts) { - // cxx98-error@-1 {{variadic templates are a C++11 extension}} - auto x(ts...); - // cxx98-error@-1 {{'auto' type specifier is a C++11 extension}} - // expected-error@-2 {{initializer for variable 'x' with type 'auto' is empty}} - // expected-note@#dr1346-f {{in instantiation of function template specialization 'dr1346::f<>' requested here}} + template<typename ...Ts> void f(Ts ...ts) { // expected-error 0-1{{extension}} + auto x(ts...); // expected-error {{empty}} expected-error 0-1{{extension}} } - template void f(); // #dr1346-f + template void f(); // expected-note {{instantiation}} #if __cplusplus >= 201103L void init_capture() { - [a(1)] {} (); - // cxx11-error@-1 {{initialized lambda captures are a C++14 extension}} - [b(1, 2)] {} (); - // cxx11-error@-1 {{initialized lambda captures are a C++14 extension}} - // since-cxx11-error@-2 {{initializer for lambda capture 'b' contains multiple expressions}} - [c({})] {} (); - // cxx11-error@-1 {{initialized lambda captures are a C++14 extension}} - // since-cxx11-error@-2 {{cannot deduce type for lambda capture 'c' from parenthesized initializer list}} - [d({1})] {} (); - // cxx11-error@-1 {{initialized lambda captures are a C++14 extension}} - // since-cxx11-error@-2 {{cannot deduce type for lambda capture 'd' from parenthesized initializer list}} - [e({1, 2})] {} (); - // cxx11-error@-1 {{initialized lambda captures are a C++14 extension}} - // since-cxx11-error@-2 {{cannot deduce type for lambda capture 'e' from parenthesized initializer list}} + [a(1)] {} (); // expected-error 0-1{{extension}} + [b(1, 2)] {} (); // expected-error {{multiple expressions}} expected-error 0-1{{extension}} +#if __cplusplus >= 201103L + [c({})] {} (); // expected-error {{parenthesized initializer list}} expected-error 0-1{{extension}} + [d({1})] {} (); // expected-error {{parenthesized initializer list}} expected-error 0-1{{extension}} + [e({1, 2})] {} (); // expected-error {{parenthesized initializer list}} expected-error 0-1{{extension}} +#endif } #endif } namespace dr1347 { // dr1347: 3.1 - auto x = 5, *y = &x; - // cxx98-error@-1 {{'auto' type specifier is a C++11 extension}} - auto z = y, *q = y; - // cxx98-error@-1 {{'auto' type specifier is a C++11 extension}} - // expected-error@-2 {{'auto' deduced as 'int *' in declaration of 'z' and deduced as 'int' in declaration of 'q'}} + auto x = 5, *y =
&x; // expected-error 0-1{{extension}} + auto z = y, *q = y; // expected-error {{'auto' deduced as 'int *' in declaration of 'z' and deduced as 'int' in declaration of 'q'}} expected-error 0-1{{extension}} #if __cplusplus >= 201103L - auto a = 5, b = {1, 2}; - // since-cxx11-error@-1 {{'auto' deduced as 'int' in declaration of 'a' and deduced as 'std::initializer_list<int>' in declaration of 'b'}} - auto (*fp)(int) -> int, i = 0; - // since-cxx11-error@-1 {{declaration with trailing return type must be the only declaration in its group}} + auto a = 5, b = {1, 2}; // expected-error {{'auto' deduced as 'int' in declaration of 'a' and deduced as 'std::initializer_list<int>' in declaration of 'b'}} + auto (*fp)(int) -> int, i = 0; // expected-error {{declaration with trailing return type must be the only declaration in its group}} #endif } namespace dr1358 { // dr1358: 3.1 #if __cplusplus >= 201103L struct Lit { constexpr operator int() const { return 0; } }; - struct NonLit { NonLit(); operator int(); }; // #dr1358-NonLit + struct NonLit { NonLit(); operator int(); }; // expected-note 2{{no constexpr constructors}} struct NonConstexprConv { constexpr operator int() const; }; struct Virt { virtual int f(int) const; }; @@ -409,12 +336,8 @@ namespace dr1358 { // dr1358: 3.1 // But the corresponding non-template cases are rejected. struct B : Virt { int member; - constexpr B(NonLit u) : member(u) {} - // since-cxx11-error@-1 {{constexpr constructor's 1st parameter type 'NonLit' is not a literal type}} - // since-cxx11-note@#dr1358-NonLit {{'NonLit' is not literal because it is not an aggregate and has no constexpr constructors other than copy or move constructors}} - constexpr NonLit f(NonLit u) const { return NonLit(); } - // since-cxx11-error@-1 {{constexpr function's return type 'NonLit' is not a literal type}} - // since-cxx11-note@#dr1358-NonLit {{'NonLit' is not literal because it is not an aggregate and has no constexpr constructors other than copy or move constructors}} + constexpr B(NonLit u) : member(u) {} // expected-error {{not a literal type}} + constexpr NonLit f(NonLit u) const { return NonLit(); } // expected-error {{not a literal type}} }; #endif } @@ -422,113 +345,65 @@ namespace dr1359 { // dr1359: 3.5 #if __cplusplus >= 201103L union A { constexpr A() = default; }; - union B { constexpr B() = default; int a; }; // #dr1359-B - // cxx11-17-error@-1 {{defaulted definition of default constructor is not constexpr}} - union C { constexpr C() = default; int a, b; }; // #dr1359-C - // cxx11-17-error@-1 {{defaulted definition of default constructor is not constexpr}} - struct X { constexpr X() = default; union {}; }; - // since-cxx11-error@-1 {{declaration does not declare anything}} - struct Y { constexpr Y() = default; union { int a; }; }; // #dr1359-Y - // cxx11-17-error@-1 {{defaulted definition of default constructor is not constexpr}} + union B { constexpr B() = default; int a; }; // expected-error {{not constexpr}} expected-note 2{{candidate}} + union C { constexpr C() = default; int a, b; }; // expected-error {{not constexpr}} expected-note 2{{candidate}} + struct X { constexpr X() = default; union {}; }; // expected-error {{does not declare anything}} + struct Y { constexpr Y() = default; union { int a; }; }; // expected-error {{not constexpr}} expected-note 2{{candidate}} constexpr A a = A(); - constexpr B b = B(); - // cxx11-17-error@-1 {{no matching constructor for initialization of 'B'}} - // cxx11-17-note@#dr1359-B {{candidate constructor (the
implicit copy constructor) not viable: requires 1 argument, but 0 were provided}} - // cxx11-17-note@#dr1359-B {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 0 were provided}} - constexpr C c = C(); - // cxx11-17-error@-1 {{no matching constructor for initialization of 'C'}} - // cxx11-17-note@#dr1359-C {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 0 were provided}} - // cxx11-17-note@#dr1359-C {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 0 were provided}} + constexpr B b = B(); // expected-error {{no matching}} + constexpr C c = C(); // expected-error {{no matching}} constexpr X x = X(); - constexpr Y y = Y(); - // cxx11-17-error@-1 {{no matching constructor for initialization of 'Y'}} - // cxx11-17-note@#dr1359-Y {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 0 were provided}} - // cxx11-17-note@#dr1359-Y {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 0 were provided}} + constexpr Y y = Y(); // expected-error {{no matching}} #endif } namespace dr1388 { // dr1388: 4 - template<typename ...T, typename A> void f(T..., A); // #dr1388-f - // cxx98-error@-1 {{variadic templates are a C++11 extension}} - template<typename ...T> void g(T..., int); // #dr1388-g - // cxx98-error@-1 {{variadic templates are a C++11 extension}} - template<typename ...T, typename A> void h(T..., A); // #dr1388-h - // cxx98-error@-1 {{variadic templates are a C++11 extension}} + template<typename ...T, typename A> void f(T..., A); // expected-note 1+{{candidate}} expected-error 0-1{{C++11}} + template<typename ...T> void g(T..., int); // expected-note 1+{{candidate}} expected-error 0-1{{C++11}} + template<typename ...T, typename A> void h(T..., A); // expected-note 1+{{candidate}} expected-error 0-1{{C++11}} void test_f() { f(0); // ok, trailing parameter pack deduced to empty - f(0, 0); - // expected-error@-1 {{no matching function for call to 'f'}} - // expected-note@#dr1388-f {{candidate function [with A = int, T = <>] not viable: requires 1 argument, but 2 were provided}} + f(0, 0); // expected-error {{no matching}} f<int>(0); - f<int>(0, 0); - // expected-error@-1 {{no matching function for call to 'f'}} - // expected-note@#dr1388-f {{candidate function [with A = int, T = <>] not viable: requires 1 argument, but 2 were provided}} + f<int>(0, 0); // expected-error {{no matching}} f<int, int>(0, 0); - f<int, int, int>(0, 0); - // expected-error@-1 {{no matching function for call to 'f'}} - // expected-note@#dr1388-f {{candidate function [with A = int, T = <int, int>] not viable: requires 3 arguments, but 2 were provided}} + f<int, int, int>(0, 0); // expected-error {{no matching}} g(0); - g(0, 0); - // expected-error@-1 {{no matching function for call to 'g'}} - // expected-note@#dr1388-g {{candidate function [with T = <>] not viable: requires 1 argument, but 2 were provided}} + g(0, 0); // expected-error {{no matching}} g<>(0); - g<int>(0); - // expected-error@-1 {{no matching function for call to 'g'}} - // expected-note@#dr1388-g {{candidate function [with T = <int>] not viable: requires 2 arguments, but 1 was provided}} + g<int>(0); // expected-error {{no matching}} g<int>(0, 0); h(0); - h(0, 0); - // expected-error@-1 {{no matching function for call to 'h'}} - // expected-note@#dr1388-h {{candidate function [with T = <>, A = int] not viable: requires 1 argument, but 2 were provided}} + h(0, 0); // expected-error {{no matching}} h<int>(0, 0); - h<int, int>(0, 0); - // expected-error@-1 {{no matching function for call to 'h'}} - // expected-note@#dr1388-h {{candidate template ignored: couldn't infer template argument
'A'}} + h(0, 0); // expected-error {{no matching}} } // A non-trailing parameter pack is still a non-deduced context, even though // we know exactly how many arguments correspond to it. template struct pair {}; - template struct tuple { typedef char type; }; // - // cxx98-error@-1 {{variadic templates are a C++11 extension}} - template void f_pair_1(pair..., int); // #dr1388-f-1 - // cxx98-error@-1 {{variadic templates are a C++11 extension}} - // cxx98-error@-2 {{variadic templates are a C++11 extension}} - template void f_pair_2(pair..., U); - // cxx98-error@-1 {{variadic templates are a C++11 extension}} - template void f_pair_3(pair..., tuple); // #dr1388-f-3 - // cxx98-error@-1 {{variadic templates are a C++11 extension}} - // cxx98-error@-2 {{variadic templates are a C++11 extension}} - template void f_pair_4(pair..., T...); // #dr1388-f-4 - // cxx98-error@-1 {{variadic templates are a C++11 extension}} + template struct tuple { typedef char type; }; // expected-error 0-2{{C++11}} + template void f_pair_1(pair..., int); // expected-error 0-2{{C++11}} expected-note {{[with T = ]: deduced incomplete pack <(no value), (no value)> for template parameter 'U'}} + template void f_pair_2(pair..., U); // expected-error 0-2{{C++11}} + template void f_pair_3(pair..., tuple); // expected-error 0-2{{C++11}} expected-note {{deduced packs of different lengths for parameter 'U' (<(no value), (no value)> vs. )}} + template void f_pair_4(pair..., T...); // expected-error 0-2{{C++11}} expected-note {{ vs. }} void g(pair a, pair b, tuple c) { - f_pair_1(a, b, 0); - // expected-error@-1 {{no matching function for call to 'f_pair_1'}} - // expected-note@#dr1388-f-1 {{candidate template ignored: substitution failure [with T = ]: deduced incomplete pack <(no value), (no value)> for template parameter 'U'}} + f_pair_1(a, b, 0); // expected-error {{no match}} f_pair_2(a, b, 0); f_pair_3(a, b, c); - f_pair_3(a, b, tuple()); - // expected-error@-1 {{no matching function for call to 'f_pair_3'}} - // expected-note@#dr1388-f-3 {{candidate template ignored: deduced packs of different lengths for parameter 'U' (<(no value), (no value)> vs. )}} + f_pair_3(a, b, tuple()); // expected-error {{no match}} f_pair_4(a, b, 0, 0L); - f_pair_4(a, b, 0, 0L, "foo"); - // expected-error@-1 {{no matching function for call to 'f_pair_4'}} - // expected-note@#dr1388-f-4 {{candidate template ignored: deduced packs of different lengths for parameter 'T' ( vs. )}} + f_pair_4(a, b, 0, 0L, "foo"); // expected-error {{no match}} } } namespace dr1391 { // dr1391: partial struct A {}; struct B : A {}; - template struct C { C(int); typename T::error error; }; // #dr1391-C - // expected-error@#dr1391-C {{type 'int' cannot be used prior to '::' because it has no members}} - // expected-note@#dr1391-b {{in instantiation of template class 'dr1391::C' requested here}} - // expected-note@#dr1391-b {{while substituting deduced template arguments into function template 'b' [with T = int]}} - // expected-error@#dr1391-C {{type 'double' cannot be used prior to '::' because it has no members}} - // expected-note@#dr1391-c {{in instantiation of template class 'dr1391::C' requested here}} + template struct C { C(int); typename T::error error; }; // expected-error 2{{'::'}} template struct D {}; // No deduction is performed for parameters with no deducible template-parameters, therefore types do not need to match. 
@@ -569,36 +444,28 @@ namespace dr1391 { // dr1391: partial
   void test_b() {
     b(0, 0); // ok, deduction fails prior to forming a conversion sequence and instantiating C
     // FIXME: The "while substituting" note should point at the overload candidate.
-    b(0, 0); // #dr1391-b
+    b(0, 0); // expected-note {{instantiation of}} expected-note {{while substituting}}
   }

   template struct Id { typedef T type; };
   template void c(T, typename Id >::type);
   void test_c() {
     // Implicit conversion sequences for dependent types are checked later.
-    c(0.0, 0); // #dr1391-c
+    c(0.0, 0); // expected-note {{instantiation of}}
   }

   namespace partial_ordering {
     // FIXME: Second template should be considered more specialized because non-dependent parameter is ignored.
-    template int a(T, short) = delete; // #dr1391-a-short
-    // cxx98-error@-1 {{deleted function definitions are a C++11 extension}}
-    template int a(T*, char); // #dr1391-a-char
-    int test_a = a((int*)0, 0);
-    // expected-error@-1 {{call to 'a' is ambiguous}} FIXME
-    // expected-note@#dr1391-a-short {{candidate function [with T = int *] has been explicitly deleted}}
-    // expected-note@#dr1391-a-char {{candidate function [with T = int]}}
+    template int a(T, short) = delete; // expected-error 0-1{{extension}} expected-note {{candidate}}
+    template int a(T*, char); // expected-note {{candidate}}
+    int test_a = a((int*)0, 0); // FIXME: expected-error {{ambiguous}}

    // FIXME: Second template should be considered more specialized:
    // deducing #1 from #2 ignores the second P/A pair, so deduction succeeds,
    // deducing #2 from #1 fails to deduce T, so deduction fails.
-    template int b(T, int) = delete; // #dr1391-b-int
-    // cxx98-error@-1 {{deleted function definitions are a C++11 extension}}
-    template int b(T*, U); // #dr1391-b-U
-    int test_b = b((int*)0, 0);
-    // expected-error@-1 {{call to 'b' is ambiguous}} FIXME
-    // expected-note@#dr1391-b-int {{candidate function [with T = int *] has been explicitly deleted}}
-    // expected-note@#dr1391-b-U {{candidate function [with T = int, U = int]}}
+    template int b(T, int) = delete; // expected-error 0-1{{extension}} expected-note {{candidate}}
+    template int b(T*, U); // expected-note {{candidate}}
+    int test_b = b((int*)0, 0); // FIXME: expected-error {{ambiguous}}

     // Unintended consequences: because partial ordering does not consider
     // explicit template arguments, and deduction from a non-dependent type
@@ -632,26 +499,26 @@ namespace dr1395 { // dr1395: 16

 namespace dr1397 { // dr1397: 3.2
 #if __cplusplus >= 201103L
-struct A {
-// cxx11-error@-1 {{default member initializer for 'p' needed within definition of enclosing class 'A' outside of member functions}}
-// cxx11-note@#dr1397-p {{in evaluation of exception specification for 'dr1397::A::A' needed here}}
-// cxx11-note@#dr1397-p {{default member initializer declared here}}
-  void *p = A{}; // #dr1397-p
-  // since-cxx14-error@-1 {{default member initializer for 'p' needed within definition of enclosing class 'A' outside of member functions}}
-  // since-cxx14-note@-2 {{default member initializer declared here}}
+struct A { // #dr1397-struct-A
+  void *p = A{}; // #dr1397-void-p
+#if __cplusplus == 201103L
+  // expected-error@#dr1397-struct-A {{default member initializer for 'p' needed within definition of enclosing class 'A' outside of member functions}}
+  // expected-note@#dr1397-void-p {{in evaluation of exception specification for 'dr1397::A::A' needed here}}
+  // expected-note@#dr1397-void-p {{default member initializer declared here}}
+#elif __cplusplus >= 201402L
+  // expected-error@#dr1397-void-p {{default member initializer for 'p' needed within definition of enclosing class 'A' outside of member functions}}
+  // expected-note@#dr1397-void-p {{default member initializer declared here}}
+#endif
   operator void*() const { return nullptr; }
 };
 #endif
 } // namespace dr1397

 namespace dr1399 { // dr1399: dup 1388
-  template void f(T..., int, T...) {} // #dr1399-f
-  // cxx98-error@-1 {{variadic templates are a C++11 extension}}
+  template void f(T..., int, T...) {} // expected-note {{candidate}} expected-error 0-1{{C++11}}
   void g() {
     f(0);
     f(0, 0, 0);
-    f(0, 0, 0);
-    // expected-error@-1 {{no matching function for call to 'f'}}
-    // expected-note@#dr1399-f {{candidate template ignored: deduced packs of different lengths for parameter 'T' (<> vs. )}}
+    f(0, 0, 0); // expected-error {{no match}}
   }
 }
diff --git a/clang/test/CXX/drs/dr14xx.cpp b/clang/test/CXX/drs/dr14xx.cpp
index d262f6f9dcab7..7dd7da95d034d 100644
--- a/clang/test/CXX/drs/dr14xx.cpp
+++ b/clang/test/CXX/drs/dr14xx.cpp
@@ -1,10 +1,8 @@
-// RUN: %clang_cc1 -std=c++98 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++11 %s -verify=expected,cxx11-17,since-cxx11, -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++14 %s -verify=expected,cxx14-17,cxx11-17,since-cxx11,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++17 %s -verify=expected,cxx14-17,cxx11-17,since-cxx11,since-cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx11,since-cxx14,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx14,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11,since-cxx14,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++98 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++11 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++14 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++17 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++2a %s -verify -fexceptions -fcxx-exceptions -pedantic-errors

 namespace dr1413 { // dr1413: 12
   template struct Check {
@@ -17,41 +15,24 @@ namespace dr1413 { // dr1413: 12
     void d();
     void f() {
-      Check::type *var1;
-      // expected-error@-1 {{use of undeclared identifier 'var1'}}
-
-      // ok, variable declaration
-      Check::type *var2; // #dr1413-var2
-      Check::type *var3;
-      // expected-error@-1 {{use of undeclared identifier 'var3'}}
-      // expected-note@#dr1413-var2 {{'var2' declared here}}
-      Check::type *var4;
-      // expected-error@-1 {{use of undeclared identifier 'var4'}}
-      // expected-note@#dr1413-var2 {{'var2' declared here}}
-
+      Check::type *var1; // expected-error {{undeclared identifier 'var1'}}
+      Check::type *var2; // ok, variable declaration expected-note 0+{{here}}
+      Check::type *var3; // expected-error {{undeclared identifier 'var3'}}
+      Check::type *var4; // expected-error {{undeclared identifier 'var4'}}
       // value-dependent because of the implied type-dependent 'this->', not because of 'd'
-      Check::type *var5;
-      // expected-error@-1 {{use of undeclared identifier 'var5'}}
-      // expected-note@#dr1413-var2 {{'var2' declared here}}
-
+      Check::type *var5; // expected-error {{undeclared identifier 'var5'}}
       // value-dependent because of the value-dependent '&' operator, not because of 'A::d'
-      Check::type *var5;
-      // expected-error@-1 {{use of undeclared identifier 'var5'}}
-      // expected-note@#dr1413-var2 {{'var2' declared here}}
+      Check::type *var5; // expected-error {{undeclared identifier 'var5'}}
     }
   };
 }

 namespace dr1423 { // dr1423: 11
 #if __cplusplus >= 201103L
-  bool b1 = nullptr;
-  // since-cxx11-error@-1 {{cannot initialize a variable of type 'bool' with an rvalue of type 'std::nullptr_t'}}
-  bool b2(nullptr);
-  // since-cxx11-warning@-1 {{implicit conversion of nullptr constant to 'bool'}}
-  bool b3 = {nullptr};
-  // since-cxx11-error@-1 {{cannot initialize a variable of type 'bool' with an rvalue of type 'std::nullptr_t'}}
-  bool b4{nullptr};
-  // since-cxx11-warning@-1 {{implicit conversion of nullptr constant to 'bool'}}
+  bool b1 = nullptr; // expected-error {{cannot initialize}}
+  bool b2(nullptr); // expected-warning {{implicit conversion of nullptr constant to 'bool'}}
+  bool b3 = {nullptr}; // expected-error {{cannot initialize}}
+  bool b4{nullptr}; // expected-warning {{implicit conversion of nullptr constant to 'bool'}}
 #endif
 }

@@ -81,8 +62,7 @@ namespace dr1432 { // dr1432: 16
 namespace dr1443 { // dr1443: yes
 struct A {
   int i;
-  A() { void foo(int=i); }
-  // expected-error@-1 {{default argument references 'this'}}
+  A() { void foo(int=i); } // expected-error {{default argument references 'this'}}
 };
 }

@@ -90,54 +70,49 @@ namespace dr1460 { // dr1460: 3.5
 #if __cplusplus >= 201103L
   namespace DRExample {
     union A {
-      union {};
-      // expected-error@-1 {{declaration does not declare anything}}
-      union {};
-      // expected-error@-1 {{declaration does not declare anything}}
+      union {}; // expected-error {{does not declare anything}}
+      union {}; // expected-error {{does not declare anything}}
      constexpr A() {}
    };
    constexpr A a = A();
    union B {
-      union {};
-      // expected-error@-1 {{declaration does not declare anything}}
-      union {};
-      // expected-error@-1 {{declaration does not declare anything}}
+      union {}; // expected-error {{does not declare anything}}
+      union {}; // expected-error {{does not declare anything}}
      constexpr B() = default;
    };
    constexpr B b = B();
    union C {
-      union {};
-      // expected-error@-1 {{declaration does not declare anything}}
-      union {};
-      // expected-error@-1 {{declaration does not declare anything}}
+      union {}; // expected-error {{does not declare anything}}
+      union {}; // expected-error {{does not declare anything}}
    };
    constexpr C c = C();
-#if __cplusplus >= 201403L
+#if __cplusplus > 201103L
    constexpr void f() { C c; }
    static_assert((f(), true), "");
 #endif
  }

  union A {};
-  union B { int n; }; // #dr1460-B
+  union B { int n; }; // expected-note 0+{{here}}
  union C { int n = 0; };
-  struct D { union {}; };
-  // expected-error@-1 {{declaration does not declare anything}}
-  struct E { union { int n; }; }; // #dr1460-E
+  struct D { union {}; }; // expected-error {{does not declare anything}}
+  struct E { union { int n; }; }; // expected-note 0+{{here}}
  struct F { union { int n = 0; }; };

  struct X {
    friend constexpr A::A() noexcept;
    friend constexpr B::B() noexcept;
-    // cxx11-17-error@-1 {{constexpr declaration of 'B' follows non-constexpr declaration}}
-    // cxx11-17-note@#dr1460-B {{previous declaration is here}}
+#if __cplusplus <= 201703L
+    // expected-error@-2 {{follows non-constexpr declaration}}
+#endif
    friend constexpr C::C() noexcept;
    friend constexpr D::D() noexcept;
    friend constexpr E::E() noexcept;
-    // cxx11-17-error@-1 {{constexpr declaration of 'E' follows non-constexpr declaration}}
-    // cxx11-17-note@#dr1460-E {{previous declaration is here}}
+#if __cplusplus <= 201703L
+    // expected-error@-2 {{follows non-constexpr declaration}}
+#endif
    friend constexpr F::F() noexcept;
  };

@@ -153,77 +128,79 @@ namespace dr1460 { // dr1460: 3.5
  namespace Defaulted {
    union A { constexpr A() = default; };
    union B { int n; constexpr B() = default; };
-    // cxx11-17-error@-1 {{defaulted definition of default constructor is not constexpr}}
+#if __cplusplus <= 201703L
+    // expected-error@-2 {{not constexpr}}
+#endif
    union C { int n = 0; constexpr C() = default; };
-    struct D { union {}; constexpr D() = default; };
-    // expected-error@-1 {{declaration does not declare anything}}
+    struct D { union {}; constexpr D() = default; }; // expected-error {{does not declare anything}}
    struct E { union { int n; }; constexpr E() = default; };
-    // cxx11-17-error@-1 {{defaulted definition of default constructor is not constexpr}}
+#if __cplusplus <= 201703L
+    // expected-error@-2 {{not constexpr}}
+#endif
    struct F { union { int n = 0; }; constexpr F() = default; };

    struct G { union { int n = 0; }; union { int m; }; constexpr G() = default; };
-    // cxx11-17-error@-1 {{defaulted definition of default constructor is not constexpr}}
+#if __cplusplus <= 201703L
+    // expected-error@-2 {{not constexpr}}
+#endif
    struct H {
      union { int n = 0; };
-      union { // #dr1460-H-union
+      union { // expected-note 0-2{{member not initialized}}
        int m;
      };
      constexpr H() {}
-      // cxx11-17-error@-1 {{constexpr constructor that does not initialize all members is a C++20 extension}}
-      // cxx11-17-note@#dr1460-H-union {{member not initialized by constructor}}
+#if __cplusplus <= 201703L
+      // expected-error@-2 {{initialize all members}}
+#endif
      constexpr H(bool) : m(1) {}
      constexpr H(char) : n(1) {}
-      // cxx11-17-error@-1 {{constexpr constructor that does not initialize all members is a C++20 extension}}
-      // cxx11-17-note@#dr1460-H-union {{member not initialized by constructor}}
+#if __cplusplus <= 201703L
+      // expected-error@-2 {{initialize all members}}
+#endif
      constexpr H(double) : m(1), n(1) {}
    };
  }

-#if __cplusplus >= 201403L
+#if __cplusplus > 201103L
  template constexpr bool check() {
-    T t; // #dr1460-t
+    T t;
+#if __cplusplus <= 201703L
+    // expected-note-re@-2 2{{non-constexpr constructor '{{[BE]}}'}}
+#endif
    return true;
  }
  static_assert(check(), "");
-  static_assert(check(), ""); // #dr1460-check-B
-  // cxx14-17-error@-1 {{static assertion expression is not an integral constant expression}}
-  // cxx14-17-note@#dr1460-t {{non-constexpr constructor 'B' cannot be used in a constant expression}}
-  // cxx14-17-note@#dr1460-check-B {{in call to 'check()'}}
-  // cxx14-17-note@#dr1460-B {{declared here}}
+  static_assert(check(), "");
+#if __cplusplus <= 201703L
+  // expected-error@-2 {{constant}} expected-note@-2 {{in call}}
+#endif
  static_assert(check(), "");
  static_assert(check(), "");
-  static_assert(check(), ""); // #dr1460-check-E
-  // cxx14-17-error@-1 {{static assertion expression is not an integral constant expression}}
-  // cxx14-17-note@#dr1460-t {{non-constexpr constructor 'E' cannot be used in a constant expression}}
-  // cxx14-17-note@#dr1460-check-E {{in call to 'check()'}}
-  // cxx14-17-note@#dr1460-E {{declared here}}
+  static_assert(check(), "");
+#if __cplusplus <= 201703L
+  // expected-error@-2 {{constant}} expected-note@-2 {{in call}}
+#endif
  static_assert(check(), "");
 #endif

  union G {
-    int a = 0; // #dr1460-G-a
-    int b = 0;
-    // expected-error@-1 {{initializing multiple members of union}}
-    // expected-note@#dr1460-G-a {{previous initialization is here}}
+    int a = 0; // expected-note {{previous initialization is here}}
+    int b = 0; // expected-error {{initializing multiple members of union}}
  };
  union H {
    union {
-      int a = 0; // #dr1460-H-a
+      int a = 0; // expected-note {{previous initialization is here}}
    };
    union {
-      int b = 0;
-      // expected-error@-1 {{initializing multiple members of union}}
-      // expected-note@#dr1460-H-a {{previous initialization is here}}
+      int b = 0; // expected-error {{initializing multiple members of union}}
    };
  };
  struct I {
    union {
-      int a = 0; // #dr1460-I-a
-      int b = 0;
-      // expected-error@-1 {{initializing multiple members of union}}
-      // expected-note@#dr1460-I-a {{previous initialization is here}}
+      int a = 0; // expected-note {{previous initialization is here}}
+      int b = 0; // expected-error {{initializing multiple members of union}}
    };
  };
  struct J {
@@ -246,24 +223,14 @@ namespace dr1460 { // dr1460: 3.5
    constexpr B(const char*) {}
  };
  static_assert(B().a == 1, "");
-  static_assert(B().b == 2, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'b' of union with active member 'a' is not allowed in a constant expression}}
-  static_assert(B('x').a == 0, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'a' of union with active member 'b' is not allowed in a constant expression}}
+  static_assert(B().b == 2, ""); // expected-error {{constant}} expected-note {{read of}}
+  static_assert(B('x').a == 0, ""); // expected-error {{constant}} expected-note {{read of}}
  static_assert(B('x').b == 4, "");
-  static_assert(B(123).b == 2, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'b' of union with active member 'c' is not allowed in a constant expression}}
+  static_assert(B(123).b == 2, ""); // expected-error {{constant}} expected-note {{read of}}
  static_assert(B(123).c == 3, "");
-  static_assert(B("").a == 1, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'a' of union with active member 'b' is not allowed in a constant expression}}
+  static_assert(B("").a == 1, ""); // expected-error {{constant}} expected-note {{read of}}
  static_assert(B("").b == 2, "");
-  static_assert(B("").c == 3, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'c' of union with active member 'b' is not allowed in a constant expression}}
+  static_assert(B("").c == 3, ""); // expected-error {{constant}} expected-note {{read of}}

  struct C {
    union { int a, b = 2, c; };
@@ -276,55 +243,31 @@ namespace dr1460 { // dr1460: 3.5
  };

  static_assert(C().a == 1, "");
-  static_assert(C().b == 2, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'b' of union with active member 'a' is not allowed in a constant expression}}
-  static_assert(C().d == 4, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'd' of union with active member 'e' is not allowed in a constant expression}}
+  static_assert(C().b == 2, ""); // expected-error {{constant}} expected-note {{read of}}
+  static_assert(C().d == 4, ""); // expected-error {{constant}} expected-note {{read of}}
  static_assert(C().e == 5, "");
-  static_assert(C('x').b == 2, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'b' of union with active member 'c' is not allowed in a constant expression}}
+  static_assert(C('x').b == 2, ""); // expected-error {{constant}} expected-note {{read of}}
  static_assert(C('x').c == 3, "");
-  static_assert(C('x').d == 4, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'd' of union with active member 'e' is not allowed in a constant expression}}
+  static_assert(C('x').d == 4, ""); // expected-error {{constant}} expected-note {{read of}}
  static_assert(C('x').e == 5, "");
  static_assert(C(1).b == 2, "");
-  static_assert(C(1).c == 3, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'c' of union with active member 'b' is not allowed in a constant expression}}
+  static_assert(C(1).c == 3, ""); // expected-error {{constant}} expected-note {{read of}}
  static_assert(C(1).d == 4, "");
-  static_assert(C(1).e == 5, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'e' of union with active member 'd' is not allowed in a constant expression}}
+  static_assert(C(1).e == 5, ""); // expected-error {{constant}} expected-note {{read of}}
  static_assert(C(1.f).b == 2, "");
-  static_assert(C(1.f).c == 3, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'c' of union with active member 'b' is not allowed in a constant expression}}
-  static_assert(C(1.f).e == 5, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'e' of union with active member 'f' is not allowed in a constant expression}}
+  static_assert(C(1.f).c == 3, ""); // expected-error {{constant}} expected-note {{read of}}
+  static_assert(C(1.f).e == 5, ""); // expected-error {{constant}} expected-note {{read of}}
  static_assert(C(1.f).f == 6, "");
-  static_assert(C("").a == 1, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'a' of union with active member 'b' is not allowed in a constant expression}}
+  static_assert(C("").a == 1, ""); // expected-error {{constant}} expected-note {{read of}}
  static_assert(C("").b == 2, "");
-  static_assert(C("").c == 3, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'c' of union with active member 'b' is not allowed in a constant expression}}
-  static_assert(C("").d == 4, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'd' of union with active member 'e' is not allowed in a constant expression}}
+  static_assert(C("").c == 3, ""); // expected-error {{constant}} expected-note {{read of}}
+  static_assert(C("").d == 4, ""); // expected-error {{constant}} expected-note {{read of}}
  static_assert(C("").e == 5, "");
-  static_assert(C("").f == 6, "");
-  // expected-error@-1 {{static assertion expression is not an integral constant expression}}
-  // expected-note@-2 {{read of member 'f' of union with active member 'e' is not allowed in a constant expression}}
+  static_assert(C("").f == 6, ""); // expected-error {{constant}} expected-note {{read of}}

  struct D;
  extern const D d;
@@ -372,10 +315,8 @@ namespace std {
    const _E* end()   const {return __begin_ + __size_;}
  };
 } // std
-#endif

 namespace dr1467 { // dr1467: 3.7 c++11
-#if __cplusplus >= 201103L
  // Note that the change to [over.best.ics] was partially undone by DR2076;
  // the resulting rule is tested with the tests for that change.

@@ -441,12 +382,9 @@ namespace dr1467 { // dr1467: 3.7 c++11
    X x;
    X x2{x};

-    void f1(int); // #dr1467-f1
-    void f1(std::initializer_list) = delete; // #dr1467-f1-deleted
-    void g1() { f1({42}); }
-    // since-cxx11-error@-1 {{call to deleted function 'f1'}}
-    // since-cxx11-note@#dr1467-f1 {{candidate function}}
-    // since-cxx11-note@#dr1467-f1-deleted {{candidate function has been explicitly deleted}}
+    void f1(int); // expected-note {{candidate function}}
+    void f1(std::initializer_list) = delete; // expected-note {{candidate function has been explicitly deleted}}
+    void g1() { f1({42}); } // expected-error {{call to deleted function 'f1'}}

    template struct Pair {
@@ -456,12 +394,9 @@ namespace dr1467 { // dr1467: 3.7 c++11
      String(const char *);
    };

-    void f2(Pair); // #dr1467-f2
-    void f2(std::initializer_list) = delete; // #dr1467-f2-deleted
-    void g2() { f2({"foo", "bar"}); }
-    // since-cxx11-error@-1 {{call to deleted function 'f2'}}
-    // since-cxx11-note@#dr1467-f2 {{candidate function}}
-    // since-cxx11-note@#dr1467-f2-deleted {{candidate function has been explicitly deleted}}
+    void f2(Pair); // expected-note {{candidate function}}
+    void f2(std::initializer_list) = delete; // expected-note {{candidate function has been explicitly deleted}}
+    void g2() { f2({"foo", "bar"}); } // expected-error {{call to deleted function 'f2'}}
  } // dr_example

  namespace nonaggregate {
@@ -518,89 +453,39 @@ namespace dr1467 { // dr1467: 3.7 c++11
    }
  } // namespace NonAmbiguous

+#if __cplusplus >= 201103L
 namespace StringLiterals {
 // When the array size is 4 the call will attempt to bind an lvalue to an
 // rvalue and fail. Therefore #2 will be called. (rsmith will bring this
 // issue to CWG)
-  void f(const char(&&)[4]); // #dr1467-f-char-4
-  void f(const char(&&)[5]) = delete; // #dr1467-f-char-5
-  void f(const wchar_t(&&)[4]); // #dr1467-f-wchar-4
-  void f(const wchar_t(&&)[5]) = delete; // #dr1467-f-wchar-5
+  void f(const char(&&)[4]); // expected-note 2 {{expects an rvalue}} expected-note 3 {{no known conversion}}
+  void f(const char(&&)[5]) = delete; // expected-note 2 {{candidate function has been explicitly deleted}} expected-note 3 {{no known conversion}}
+  void f(const wchar_t(&&)[4]); // expected-note {{expects an rvalue}} expected-note 4 {{no known conversion}}
+  void f(const wchar_t(&&)[5]) = delete; // expected-note {{candidate function has been explicitly deleted}} expected-note 4 {{no known conversion}}
 #if __cplusplus >= 202002L
-  void f2(const char8_t(&&)[4]); // #dr1467-f2-char8-4
-  void f2(const char8_t(&&)[5]) = delete; // #dr1467-f2-char8-5
+  void f2(const char8_t(&&)[4]); // expected-note {{expects an rvalue}}
+  void f2(const char8_t(&&)[5]) = delete; // expected-note {{candidate function has been explicitly deleted}}
 #endif
-  void f(const char16_t(&&)[4]); // #dr1467-f-char16-4
-  void f(const char16_t(&&)[5]) = delete; // #dr1467-f-char16-5
-  void f(const char32_t(&&)[4]); // #dr1467-f-char32-4
-  void f(const char32_t(&&)[5]) = delete; // #dr1467-f-char32-5
+  void f(const char16_t(&&)[4]); // expected-note {{expects an rvalue}} expected-note 4 {{no known conversion}}
+  void f(const char16_t(&&)[5]) = delete; // expected-note {{candidate function has been explicitly deleted}} expected-note 4 {{no known conversion}}
+  void f(const char32_t(&&)[4]); // expected-note {{expects an rvalue}} expected-note 4 {{no known conversion}}
+  void f(const char32_t(&&)[5]) = delete; // expected-note {{candidate function has been explicitly deleted}} expected-note 4 {{no known conversion}}
  void g() {
-    f({"abc"});
-    // since-cxx11-error@-1 {{call to deleted function 'f'}}
-    // since-cxx11-note@#dr1467-f-char-5 {{candidate function has been explicitly deleted}}
-    // since-cxx11-note@#dr1467-f-char-4 {{candidate function not viable: expects an rvalue for 1st argument}}
-    // since-cxx11-note@#dr1467-f-wchar-4 {{candidate function not viable: no known conversion from 'const char[4]' to 'const wchar_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-wchar-5 {{candidate function not viable: no known conversion from 'const char[4]' to 'const wchar_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char16-4 {{candidate function not viable: no known conversion from 'const char[4]' to 'const char16_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char16-5 {{candidate function not viable: no known conversion from 'const char[4]' to 'const char16_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char32-4 {{candidate function not viable: no known conversion from 'const char[4]' to 'const char32_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char32-5 {{candidate function not viable: no known conversion from 'const char[4]' to 'const char32_t' for 1st argument}}
-    f({((("abc")))});
-    // since-cxx11-error@-1 {{call to deleted function 'f'}}
-    // since-cxx11-note@#dr1467-f-char-5 {{candidate function has been explicitly deleted}}
-    // since-cxx11-note@#dr1467-f-char-4 {{candidate function not viable: expects an rvalue for 1st argument}}
-    // since-cxx11-note@#dr1467-f-wchar-4 {{candidate function not viable: no known conversion from 'const char[4]' to 'const wchar_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-wchar-5 {{candidate function not viable: no known conversion from 'const char[4]' to 'const wchar_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char16-4 {{candidate function not viable: no known conversion from 'const char[4]' to 'const char16_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char16-5 {{candidate function not viable: no known conversion from 'const char[4]' to 'const char16_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char32-4 {{candidate function not viable: no known conversion from 'const char[4]' to 'const char32_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char32-5 {{candidate function not viable: no known conversion from 'const char[4]' to 'const char32_t' for 1st argument}}
-    f({L"abc"});
-    // since-cxx11-error@-1 {{call to deleted function 'f'}}
-    // since-cxx11-note@#dr1467-f-wchar-5 {{candidate function has been explicitly deleted}}
-    // since-cxx11-note@#dr1467-f-char-4 {{candidate function not viable: no known conversion from 'const wchar_t[4]' to 'const char' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char-5 {{candidate function not viable: no known conversion from 'const wchar_t[4]' to 'const char' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-wchar-4 {{candidate function not viable: expects an rvalue for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char16-4 {{candidate function not viable: no known conversion from 'const wchar_t[4]' to 'const char16_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char16-5 {{candidate function not viable: no known conversion from 'const wchar_t[4]' to 'const char16_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char32-4 {{candidate function not viable: no known conversion from 'const wchar_t[4]' to 'const char32_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char32-5 {{candidate function not viable: no known conversion from 'const wchar_t[4]' to 'const char32_t' for 1st argument}}
+    f({"abc"}); // expected-error {{call to deleted function 'f'}}
+    f({((("abc")))}); // expected-error {{call to deleted function 'f'}}
+    f({L"abc"}); // expected-error {{call to deleted function 'f'}}
 #if __cplusplus >= 202002L
-    f2({u8"abc"});
-    // since-cxx20-error@-1 {{call to deleted function 'f2'}}
-    // since-cxx20-note@#dr1467-f2-char8-5 {{candidate function has been explicitly deleted}}
-    // since-cxx20-note@#dr1467-f2-char8-4 {{candidate function not viable: expects an rvalue for 1st argument}}
+    f2({u8"abc"}); // expected-error {{call to deleted function 'f2'}}
 #endif
-    f({uR"(abc)"});
-    // since-cxx11-error@-1 {{call to deleted function 'f'}}
-    // since-cxx11-note@#dr1467-f-char16-5 {{candidate function has been explicitly deleted}}
-    // since-cxx11-note@#dr1467-f-char-4 {{candidate function not viable: no known conversion from 'const char16_t[4]' to 'const char' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char-5 {{candidate function not viable: no known conversion from 'const char16_t[4]' to 'const char' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-wchar-4 {{candidate function not viable: no known conversion from 'const char16_t[4]' to 'const wchar_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-wchar-5 {{candidate function not viable: no known conversion from 'const char16_t[4]' to 'const wchar_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char16-4 {{candidate function not viable: expects an rvalue for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char32-4 {{candidate function not viable: no known conversion from 'const char16_t[4]' to 'const char32_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char32-5 {{candidate function not viable: no known conversion from 'const char16_t[4]' to 'const char32_t' for 1st argument}}
-    f({(UR"(abc)")});
-    // since-cxx11-error@-1 {{call to deleted function 'f'}}
-    // since-cxx11-note@#dr1467-f-char32-5 {{candidate function has been explicitly deleted}}
-    // since-cxx11-note@#dr1467-f-char-4 {{candidate function not viable: no known conversion from 'const char32_t[4]' to 'const char' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char-5 {{candidate function not viable: no known conversion from 'const char32_t[4]' to 'const char' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-wchar-4 {{candidate function not viable: no known conversion from 'const char32_t[4]' to 'const wchar_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-wchar-5 {{candidate function not viable: no known conversion from 'const char32_t[4]' to 'const wchar_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char16-4 {{candidate function not viable: no known conversion from 'const char32_t[4]' to 'const char16_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char16-5 {{candidate function not viable: no known conversion from 'const char32_t[4]' to 'const char16_t' for 1st argument}}
-    // since-cxx11-note@#dr1467-f-char32-4 {{candidate function not viable: expects an rvalue for 1st argument}}
+    f({uR"(abc)"}); // expected-error {{call to deleted function 'f'}}
+    f({(UR"(abc)")}); // expected-error {{call to deleted function 'f'}}
  }
 } // namespace StringLiterals
 #endif
 } // dr1467

 namespace dr1479 { // dr1479: 3.1
-#if __cplusplus >= 201103L
-  int operator"" _a(const char*, std::size_t = 0);
-  // since-cxx11-error@-1 {{literal operator cannot have a default argument}}
-#endif
+  int operator"" _a(const char*, std::size_t = 0); // expected-error {{literal operator cannot have a default argument}}
 }

 namespace dr1482 { // dr1482: 3.0
@@ -610,59 +495,42 @@ template struct S {
   typedef char I;
 };
 enum E2 : S::I { e };
-// since-cxx11-error@-1 {{use of undeclared identifier 'E2'}}
+// expected-error@-1 {{use of undeclared identifier 'E2'}}
 #endif
 } // namespace dr1482

 namespace dr1490 { // dr1490: 3.7 c++11
-#if __cplusplus >= 201103L
   // List-initialization from a string literal
-  char s[4]{"abc"}; // Ok
-  std::initializer_list{"abc"};
-  // since-cxx11-error@-1 {{expected unqualified-id}}}
-#endif
+  char s[4]{"abc"}; // Ok
+  std::initializer_list{"abc"}; // expected-error {{expected unqualified-id}}}
 } // dr1490

 namespace dr1495 { // dr1495: 4
-#if __cplusplus >= 201103L
   // Deduction succeeds in both directions.
-  template struct A {}; // #dr1495-A
-  template struct A {};
-  // since-cxx11-error@-1 {{class template partial specialization is not more specialized than the primary template}}
-  // since-cxx11-note@#dr1495-A {{template is declared here}}
+  template struct A {}; // expected-note {{template is declared here}}
+  template struct A {}; // expected-error {{class template partial specialization is not more specialized}}

  // Primary template is more specialized.
-  template struct B {}; // #dr1495-B
-  template struct B {};
-  // since-cxx11-error@-1 {{class template partial specialization is not more specialized than the primary template}}
-  // since-cxx11-note@#dr1495-B {{template is declared here}}
+  template struct B {}; // expected-note {{template is declared here}}
+  template struct B {}; // expected-error {{not more specialized}}

  // Deduction fails in both directions.
-  template struct C {}; // #dr1495-C
-  template struct C<0, Ts...> {};
-  // since-cxx11-error@-1 {{class template partial specialization is not more specialized than the primary template}}
-  // since-cxx11-note@#dr1495-C {{template is declared here}}
+  template struct C {}; // expected-note {{template is declared here}}
+  template struct C<0, Ts...> {}; // expected-error {{not more specialized}}

 #if __cplusplus >= 201402L
  // Deduction succeeds in both directions.
-  template int a; // #dr1495-a
-  template int a;
-  // since-cxx14-error@-1 {{variable template partial specialization is not more specialized than the primary template}}
-  // since-cxx14-note@#dr1495-a {{template is declared here}}
+  template int a; // expected-note {{template is declared here}}
+  template int a; // expected-error {{variable template partial specialization is not more specialized}}

  // Primary template is more specialized.
-  template int b; // #dr1495-b
-  template int b;
-  // since-cxx14-error@-1 {{variable template partial specialization is not more specialized than the primary template}}
-  // since-cxx14-note@#dr1495-b {{template is declared here}}
+  template int b; // expected-note {{template is declared here}}
+  template int b; // expected-error {{not more specialized}}

  // Deduction fails in both directions.
-  template int c; // #dr1495-c
-  template int c<0, Ts...>;
-  // since-cxx14-error@-1 {{variable template partial specialization is not more specialized than the primary template}}
-  // since-cxx14-note@#dr1495-c {{template is declared here}}
-#endif
+  template int c; // expected-note {{template is declared here}}
+  template int c<0, Ts...>; // expected-error {{not more specialized}}
 #endif
 }

@@ -676,3 +544,5 @@ struct A {
 static_assert(__is_trivial(A), "");
 #endif
 }
+
+#endif
diff --git a/clang/test/CXX/drs/dr15xx.cpp b/clang/test/CXX/drs/dr15xx.cpp
index 007b42c74affb..a53b14694998b 100644
--- a/clang/test/CXX/drs/dr15xx.cpp
+++ b/clang/test/CXX/drs/dr15xx.cpp
@@ -1,18 +1,13 @@
 // RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,cxx11-14 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,cxx11-14,cxx14-17 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx20,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx20,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx20,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected,cxx14_17 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,cxx17 -fexceptions -fcxx-exceptions -pedantic-errors

 namespace dr1512 { // dr1512: 4
   void f(char *p) {
-    if (p > 0) {}
-    // expected-error@-1 {{ordered comparison between pointer and zero ('char *' and 'int')}}
+    if (p > 0) {} // expected-error {{ordered comparison between pointer and zero}}
 #if __cplusplus >= 201103L
-    if (p > nullptr) {}
-    // since-cxx11-error@-1 {{invalid operands to binary expression ('char *' and 'std::nullptr_t')}}
+    if (p > nullptr) {} // expected-error {{invalid operands}}
 #endif
   }
   bool g(int **x, const int **y) {
@@ -33,20 +28,10 @@ namespace dr1512 { // dr1512: 4
   template void composite_pointer_type_is_ord() {
     composite_pointer_type_is_base();

-    typedef __typeof(val() < val()) cmp; // #dr1512-lt
-    // since-cxx17-warning@#dr1512-lt {{ordered comparison of function pointers ('int (*)() noexcept' and 'int (*)()')}}
-    // since-cxx17-note@#dr1512-noexcept-1st {{in instantiation of function template specialization 'dr1512::composite_pointer_type_is_ord' requested here}}
-    // since-cxx17-warning@#dr1512-lt {{ordered comparison of function pointers ('int (*)()' and 'int (*)() noexcept')}}
-    // since-cxx17-note@#dr1512-noexcept-2nd {{in instantiation of function template specialization 'dr1512::composite_pointer_type_is_ord' requested here}}
-    typedef __typeof(val() <= val()) cmp;
-    // since-cxx17-warning@-1 {{ordered comparison of function pointers ('int (*)() noexcept' and 'int (*)()')}}
-    // since-cxx17-warning@-2 {{ordered comparison of function pointers ('int (*)()' and 'int (*)() noexcept')}}
-    typedef __typeof(val() > val()) cmp;
-    // since-cxx17-warning@-1 {{ordered comparison of function pointers ('int (*)() noexcept' and 'int (*)()')}}
-    // since-cxx17-warning@-2 {{ordered comparison of function pointers ('int (*)()' and 'int (*)() noexcept')}}
-    typedef __typeof(val() >= val()) cmp;
-    // since-cxx17-warning@-1 {{ordered comparison of function pointers ('int (*)() noexcept' and 'int (*)()')}}
-    // since-cxx17-warning@-2 {{ordered comparison of function pointers ('int (*)()' and 'int (*)() noexcept')}}
+    typedef __typeof(val() < val()) cmp; // cxx17-warning 2 {{ordered comparison of function pointers}}
+    typedef __typeof(val() <= val()) cmp; // cxx17-warning 2 {{ordered comparison of function pointers}}
+    typedef __typeof(val() > val()) cmp; // cxx17-warning 2 {{ordered comparison of function pointers}}
+    typedef __typeof(val() >= val()) cmp; // cxx17-warning 2 {{ordered comparison of function pointers}}
     typedef bool cmp;
   }

@@ -92,11 +77,10 @@ namespace dr1512 { // dr1512: 4
     composite_pointer_type_is_unord();
     no_composite_pointer_type();
     no_composite_pointer_type();
-    // since-cxx20-warning@-1 {{volatile-qualified return type 'volatile int' is deprecated}}
-#if __cplusplus >= 201703L
-    composite_pointer_type_is_ord(); // #dr1512-noexcept-1st
-    composite_pointer_type_is_ord(); // #dr1512-noexcept-2nd
+#if __cplusplus > 201402
+    composite_pointer_type_is_ord(); // expected-note {{requested here}}
+    composite_pointer_type_is_ord(); // expected-note {{requested here}}
     composite_pointer_type_is_unord();
     composite_pointer_type_is_unord();
     // FIXME: This looks like a standard defect; these should probably all have type 'int (B::*)()'.
@@ -129,40 +113,24 @@ namespace dr1512 { // dr1512: 4 } #if __cplusplus >= 201103L - template struct Wrap { operator T(); }; // #dr1512-Wrap + template struct Wrap { operator T(); }; // expected-note 4{{converted to type 'std::nullptr_t'}} expected-note 4{{converted to type 'int *'}} void test_overload() { using nullptr_t = decltype(nullptr); void(Wrap() == Wrap()); void(Wrap() != Wrap()); - void(Wrap() < Wrap()); - // since-cxx11-error@-1 {{invalid operands to binary expression ('Wrap' (aka 'Wrap') and 'Wrap' (aka 'Wrap'))}} - void(Wrap() > Wrap()); - // since-cxx11-error@-1 {{invalid operands to binary expression ('Wrap' (aka 'Wrap') and 'Wrap' (aka 'Wrap'))}} - void(Wrap() <= Wrap()); - // since-cxx11-error@-1 {{invalid operands to binary expression ('Wrap' (aka 'Wrap') and 'Wrap' (aka 'Wrap'))}} - void(Wrap() >= Wrap()); - // since-cxx11-error@-1 {{invalid operands to binary expression ('Wrap' (aka 'Wrap') and 'Wrap' (aka 'Wrap'))}} + void(Wrap() < Wrap()); // expected-error {{invalid operands}} + void(Wrap() > Wrap()); // expected-error {{invalid operands}} + void(Wrap() <= Wrap()); // expected-error {{invalid operands}} + void(Wrap() >= Wrap()); // expected-error {{invalid operands}} // Under dr1213, this is ill-formed: we select the builtin operator<(int*, int*) // but then only convert as far as 'nullptr_t', which we then can't convert to 'int*'. void(Wrap() == Wrap()); void(Wrap() != Wrap()); - void(Wrap() < Wrap()); - // since-cxx11-error@-1 {{invalid operands to binary expression ('Wrap' (aka 'Wrap') and 'Wrap')}} - // since-cxx11-note@#dr1512-Wrap {{first operand was implicitly converted to type 'std::nullptr_t'}} - // since-cxx11-note@#dr1512-Wrap {{second operand was implicitly converted to type 'int *'}} - void(Wrap() > Wrap()); - // since-cxx11-error@-1 {{invalid operands}} - // since-cxx11-note@#dr1512-Wrap {{first operand was implicitly converted to type 'std::nullptr_t'}} - // since-cxx11-note@#dr1512-Wrap{{second operand was implicitly converted to type 'int *'}} - void(Wrap() <= Wrap()); - // since-cxx11-error@-1 {{invalid operands}} - // since-cxx11-note@#dr1512-Wrap {{first operand was implicitly converted to type 'std::nullptr_t'}} - // since-cxx11-note@#dr1512-Wrap {{second operand was implicitly converted to type 'int *'}} - void(Wrap() >= Wrap()); - // since-cxx11-error@-1 {{invalid operands}} - // since-cxx11-note@#dr1512-Wrap {{first operand was implicitly converted to type 'std::nullptr_t'}} - // since-cxx11-note@#dr1512-Wrap {{second operand was implicitly converted to type 'int *'}} + void(Wrap() < Wrap()); // expected-error {{invalid operands to binary expression ('Wrap' (aka 'Wrap') and 'Wrap')}} + void(Wrap() > Wrap()); // expected-error {{invalid operands}} + void(Wrap() <= Wrap()); // expected-error {{invalid operands}} + void(Wrap() >= Wrap()); // expected-error {{invalid operands}} } #endif } @@ -170,10 +138,8 @@ namespace dr1512 { // dr1512: 4 namespace dr1514 { // dr1514: 11 #if __cplusplus >= 201103L struct S { - enum E : int {}; // #dr1514-E - enum E : int {}; - // since-cxx11-error@-1 {{redefinition of 'E'}} - // since-cxx11-note@#dr1514-E {{previous definition is here}} + enum E : int {}; // expected-note {{previous}} + enum E : int {}; // expected-error {{redefinition}} }; S::E se; // OK, complete type, not zero-width bitfield. 
@@ -183,139 +149,88 @@ namespace dr1514 { // dr1514: 11 namespace dr1518 { // dr1518: 4 #if __cplusplus >= 201103L -struct Z0 { // #dr1518-Z0 - explicit Z0() = default; // #dr1518-Z0-ctor +struct Z0 { // expected-note 0+ {{candidate}} + explicit Z0() = default; // expected-note 0+ {{here}} }; -struct Z { // #dr1518-Z - explicit Z(); // #dr1518-Z-ctor - explicit Z(int); // #dr1518-Z-int - explicit Z(int, int); // #dr1518-Z-int-int +struct Z { // expected-note 0+ {{candidate}} + explicit Z(); // expected-note 0+ {{here}} + explicit Z(int); // expected-note {{not a candidate}} + explicit Z(int, int); // expected-note 0+ {{here}} }; -template int Eat(T); // #dr1518-Eat +template int Eat(T); // expected-note 0+ {{candidate}} Z0 a; Z0 b{}; -Z0 c = {}; -// since-cxx11-error@-1 {{chosen constructor is explicit in copy-initialization}} -// since-cxx11-note@#dr1518-Z0-ctor {{explicit constructor declared here}} -int i = Eat({}); -// since-cxx11-error@-1 {{no matching function for call to 'Eat'}} -// since-cxx11-note@#dr1518-Eat {{candidate function template not viable: cannot convert initializer list argument to 'Z0'}} - -Z c2 = {}; -// since-cxx11-error@-1 {{chosen constructor is explicit in copy-initialization}} -// since-cxx11-note@#dr1518-Z-ctor {{explicit constructor declared here}} -int i2 = Eat({}); -// since-cxx11-error@-1 {{no matching function for call to 'Eat'}} -// since-cxx11-note@#dr1518-Eat {{candidate function template not viable: cannot convert initializer list argument to 'Z'}} -Z a1 = 1; -// since-cxx11-error@-1 {{no viable conversion from 'int' to 'Z'}} -// since-cxx11-note@#dr1518-Z {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const Z &' for 1st argument}} -// since-cxx11-note@#dr1518-Z {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int' to 'Z &&' for 1st argument}} -// since-cxx11-note@#dr1518-Z-int {{explicit constructor is not a candidate}} +Z0 c = {}; // expected-error {{explicit in copy-initialization}} +int i = Eat({}); // expected-error {{no matching function for call to 'Eat'}} + +Z c2 = {}; // expected-error {{explicit in copy-initialization}} +int i2 = Eat({}); // expected-error {{no matching function for call to 'Eat'}} +Z a1 = 1; // expected-error {{no viable conversion}} Z a3 = Z(1); Z a2(1); Z *p = new Z(1); Z a4 = (Z)1; Z a5 = static_cast(1); -Z a6 = {4, 3}; -// since-cxx11-error@-1 {{chosen constructor is explicit in copy-initialization}} -// since-cxx11-note@#dr1518-Z-int-int {{explicit constructor declared here}} +Z a6 = {4, 3}; // expected-error {{explicit in copy-initialization}} -struct UserProvidedBaseCtor { // #dr1518-U +struct UserProvidedBaseCtor { // expected-note 0+ {{candidate}} UserProvidedBaseCtor() {} }; -struct DoesntInheritCtor : UserProvidedBaseCtor { // #dr1518-D-U +struct DoesntInheritCtor : UserProvidedBaseCtor { // expected-note 0+ {{candidate}} int x; }; DoesntInheritCtor I{{}, 42}; -// cxx11-14-error@-1 {{no matching constructor for initialization of 'DoesntInheritCtor'}} -// cxx11-14-note@#dr1518-D-U {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 2 were provided}} -// cxx11-14-note@#dr1518-D-U {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 2 were provided}} -// cxx11-14-note@#dr1518-D-U {{candidate constructor (the implicit default constructor) not viable: requires 0 arguments, but 2 were provided}} - -struct BaseCtor { BaseCtor() = default; }; 
// #dr1518-BC -struct InheritsCtor : BaseCtor { // #dr1518-I - using BaseCtor::BaseCtor; // #dr1518-I-using +#if __cplusplus <= 201402L +// expected-error@-2 {{no matching constructor}} +#endif + +struct BaseCtor { BaseCtor() = default; }; // expected-note 0+ {{candidate}} +struct InheritsCtor : BaseCtor { // expected-note 1+ {{candidate}} + using BaseCtor::BaseCtor; // expected-note 2 {{inherited here}} int x; }; -InheritsCtor II = {{}, 42}; -// since-cxx11-error@-1 {{no matching constructor for initialization of 'InheritsCtor'}} -// since-cxx11-note@#dr1518-BC {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 2 were provided}} -// since-cxx11-note@#dr1518-I-using {{constructor from base class 'BaseCtor' inherited here}} -// since-cxx11-note@#dr1518-BC {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 2 were provided}} -// since-cxx11-note@#dr1518-I-using {{constructor from base class 'BaseCtor' inherited here}} -// since-cxx11-note@#dr1518-I {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 2 were provided}} -// since-cxx11-note@#dr1518-I {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 2 were provided}} -// since-cxx11-note@#dr1518-I {{candidate constructor (the implicit default constructor) not viable: requires 0 arguments, but 2 were provided}} +InheritsCtor II = {{}, 42}; // expected-error {{no matching constructor}} namespace std_example { struct A { - explicit A() = default; // #dr1518-A + explicit A() = default; // expected-note 2{{declared here}} }; struct B : A { - explicit B() = default; // #dr1518-B + explicit B() = default; // expected-note 2{{declared here}} }; struct C { - explicit C(); // #dr1518-C + explicit C(); // expected-note 2{{declared here}} }; struct D : A { C c; - explicit D() = default; // #dr1518-D + explicit D() = default; // expected-note 2{{declared here}} }; template void f() { T t; // ok T u{}; // ok - T v = {}; // #dr1518-v - // since-cxx11-error@#dr1518-v {{chosen constructor is explicit in copy-initialization}} - // since-cxx11-note@#dr1518-f-A {{in instantiation of function template specialization 'dr1518::std_example::f' requested here}} - // since-cxx11-note@#dr1518-A {{explicit constructor declared here}} - // since-cxx11-error@#dr1518-v {{chosen constructor is explicit in copy-initialization}} - // since-cxx11-note@#dr1518-f-B {{in instantiation of function template specialization 'dr1518::std_example::f' requested here}} - // since-cxx11-note@#dr1518-B {{explicit constructor declared here}} - // since-cxx11-error@#dr1518-v {{chosen constructor is explicit in copy-initialization}} - // since-cxx11-note@#dr1518-f-C {{in instantiation of function template specialization 'dr1518::std_example::f' requested here}} - // since-cxx11-note@#dr1518-C {{explicit constructor declared here}} - // since-cxx11-error@#dr1518-v {{chosen constructor is explicit in copy-initialization}} - // since-cxx11-note@#dr1518-f-D {{in instantiation of function template specialization 'dr1518::std_example::f' requested here}} - // since-cxx11-note@#dr1518-D {{explicit constructor declared here}} + T v = {}; // expected-error 4{{explicit}} } template void g() { - void x(T t); // #dr1518-x - x({}); // #dr1518-x-call - // since-cxx11-error@#dr1518-x-call {{chosen constructor is explicit in copy-initialization}} - // since-cxx11-note@#dr1518-g-A {{in instantiation of function template specialization 
'dr1518::std_example::g' requested here}} - // since-cxx11-note@#dr1518-A {{explicit constructor declared here}} - // since-cxx11-note@#dr1518-x {{passing argument to parameter 't' here}} - // since-cxx11-error@#dr1518-x-call {{chosen constructor is explicit in copy-initialization}} - // since-cxx11-note@#dr1518-g-B {{in instantiation of function template specialization 'dr1518::std_example::g' requested here}} - // since-cxx11-note@#dr1518-B {{explicit constructor declared here}} - // since-cxx11-note@#dr1518-x {{passing argument to parameter 't' here}} - // since-cxx11-error@#dr1518-x-call {{chosen constructor is explicit in copy-initialization}} - // since-cxx11-note@#dr1518-g-C {{in instantiation of function template specialization 'dr1518::std_example::g' requested here}} - // since-cxx11-note@#dr1518-C {{explicit constructor declared here}} - // since-cxx11-note@#dr1518-x {{passing argument to parameter 't' here}} - // since-cxx11-error@#dr1518-x-call {{chosen constructor is explicit in copy-initialization}} - // since-cxx11-note@#dr1518-g-D {{in instantiation of function template specialization 'dr1518::std_example::g' requested here}} - // since-cxx11-note@#dr1518-D {{explicit constructor declared here}} - // since-cxx11-note@#dr1518-x {{passing argument to parameter 't' here}} + void x(T t); // expected-note 4{{parameter}} + x({}); // expected-error 4{{explicit}} } void test() { - f(); // #dr1518-f-A - f(); // #dr1518-f-B - f(); // #dr1518-f-C - f(); // #dr1518-f-D - g(); // #dr1518-g-A - g(); // #dr1518-g-B - g(); // #dr1518-g-C - g(); // #dr1518-g-D + f(); // expected-note {{instantiation of}} + f(); // expected-note {{instantiation of}} + f(); // expected-note {{instantiation of}} + f(); // expected-note {{instantiation of}} + g(); // expected-note {{instantiation of}} + g(); // expected-note {{instantiation of}} + g(); // expected-note {{instantiation of}} + g(); // expected-note {{instantiation of}} } } -#endif // __cplusplus >= 201103L +#endif // __cplusplus >= 201103L } namespace dr1550 { // dr1550: 3.4 @@ -327,16 +242,13 @@ namespace dr1550 { // dr1550: 3.4 namespace dr1558 { // dr1558: 12 #if __cplusplus >= 201103L template using first_of = T; - template first_of f(int); // #dr1558-f - template void f(...) = delete; // #dr1558-f-deleted + template first_of f(int); // expected-note {{'int' cannot be used prior to '::'}} + template void f(...) 
= delete; // expected-note {{deleted}} struct X { typedef void type; }; void test() { f(0); - f(0); - // since-cxx11-error@-1 {{call to deleted function 'f'}} - // since-cxx11-note@#dr1558-f-deleted {{candidate function [with T = int] has been explicitly deleted}} - // since-cxx11-note@#dr1558-f {{candidate template ignored: substitution failure [with T = int]: type 'int' cannot be used prior to '::' because it has no members}} + f(0); // expected-error {{deleted}} } #endif } @@ -371,25 +283,17 @@ namespace dr1573 { // dr1573: 3.9 struct C { C(); constexpr C(int) {} }; struct D : C { using C::C; }; constexpr D d = D(0); // ok - struct E : C { using C::C; A a; }; // #dr1573-E - constexpr E e = E(0); - // since-cxx11-error@-1 {{constexpr variable cannot have non-literal type 'const E'}} - // since-cxx11-note@#dr1573-E {{'E' is not literal because it has data member 'a' of non-literal type 'A'}} - + struct E : C { using C::C; A a; }; // expected-note {{non-literal type}} + constexpr E e = E(0); // expected-error {{non-literal type}} // FIXME: This diagnostic is pretty bad; we should explain that the problem // is that F::c would be initialized by a non-constexpr constructor. - struct F : C { using C::C; C c; }; // #dr1573-F - constexpr F f = F(0); - // since-cxx11-error@-1 {{constexpr variable 'f' must be initialized by a constant expression}} - // since-cxx11-note@-2 {{constructor inherited from base class 'C' cannot be used in a constant expression; derived class cannot be implicitly initialized}} - // since-cxx11-note@#dr1573-F {{declared here}} + struct F : C { using C::C; C c; }; // expected-note {{here}} + constexpr F f = F(0); // expected-error {{constant expression}} expected-note {{constructor inherited from base class 'C'}} // inherited constructor is effectively deleted if the user-written constructor would be struct G { G(int); }; - struct H : G { using G::G; G g; }; // #dr1573-H - H h(0); - // since-cxx11-error@-1 {{constructor inherited by 'H' from base class 'G' is implicitly deleted}} - // since-cxx11-note@#dr1573-H {{constructor inherited by 'H' is implicitly deleted because field 'g' has no default constructor}} + struct H : G { using G::G; G g; }; // expected-note {{constructor inherited by 'H' is implicitly deleted because field 'g' has no default constructor}} + H h(0); // expected-error {{constructor inherited by 'H' from base class 'G' is implicitly deleted}} #endif } @@ -432,15 +336,13 @@ namespace std { typedef basic_string string; } // std -#endif namespace dr1579 { // dr1579: 3.9 -#if __cplusplus >= 201103L template struct GenericMoveOnly { GenericMoveOnly(); - template GenericMoveOnly(const GenericMoveOnly &) = delete; // #dr1579-deleted-U - GenericMoveOnly(const int &) = delete; // #dr1579-deleted-int + template GenericMoveOnly(const GenericMoveOnly &) = delete; // expected-note 5 {{marked deleted here}} + GenericMoveOnly(const int &) = delete; // expected-note 2 {{marked deleted here}} template GenericMoveOnly(GenericMoveOnly &&); GenericMoveOnly(int &&); }; @@ -467,29 +369,17 @@ GenericMoveOnly DR1579_Ineligible(int &AnInt, extern GenericMoveOnly ExternMove; if (0) - return AnInt; - // since-cxx11-error@-1 {{conversion function from 'int' to 'GenericMoveOnly' invokes a deleted function}} - // since-cxx11-note@#dr1579-deleted-int {{'GenericMoveOnly' has been explicitly marked deleted here}} + return AnInt; // expected-error{{invokes a deleted function}} else if (0) - return GlobalMO; - // since-cxx11-error@-1 {{conversion function from 'GenericMoveOnly' to 
'GenericMoveOnly' invokes a deleted function}} - // since-cxx11-note@#dr1579-deleted-U {{'GenericMoveOnly' has been explicitly marked deleted here}} + return GlobalMO; // expected-error{{invokes a deleted function}} else if (0) - return StaticMove; - // since-cxx11-error@-1 {{conversion function from 'GenericMoveOnly' to 'GenericMoveOnly' invokes a deleted function}} - // since-cxx11-note@#dr1579-deleted-U {{'GenericMoveOnly' has been explicitly marked deleted here}} + return StaticMove; // expected-error{{invokes a deleted function}} else if (0) - return ExternMove; - // since-cxx11-error@-1 {{conversion function from 'GenericMoveOnly' to 'GenericMoveOnly' invokes a deleted function}} - // since-cxx11-note@#dr1579-deleted-U {{'GenericMoveOnly' has been explicitly marked deleted here}} + return ExternMove; // expected-error{{invokes a deleted function}} else if (0) - return AnInt; - // since-cxx11-error@-1 {{conversion function from 'int' to 'GenericMoveOnly' invokes a deleted function}} - // since-cxx11-note@#dr1579-deleted-int {{'GenericMoveOnly' has been explicitly marked deleted here}} + return AnInt; // expected-error{{invokes a deleted function}} else - return CharMO; - // since-cxx11-error@-1 {{conversion function from 'GenericMoveOnly' to 'GenericMoveOnly' invokes a deleted function}} - // since-cxx11-note@#dr1579-deleted-U {{'GenericMoveOnly' has been explicitly marked deleted here}} + return CharMO; // expected-error{{invokes a deleted function}} } auto DR1579_lambda_valid = [](GenericMoveOnly mo) -> @@ -499,34 +389,24 @@ auto DR1579_lambda_valid = [](GenericMoveOnly mo) -> auto DR1579_lambda_invalid = []() -> GenericMoveOnly { static GenericMoveOnly mo; - return mo; - // since-cxx11-error@-1 {{conversion function from 'GenericMoveOnly' to 'GenericMoveOnly' invokes a deleted function}} - // since-cxx11-note@#dr1579-deleted-U {{'GenericMoveOnly' has been explicitly marked deleted here}} + return mo; // expected-error{{invokes a deleted function}} }; -#endif } // end namespace dr1579 namespace dr1584 { -#if __cplusplus >= 201103L // Deducing function types from cv-qualified types - template void f(const T *); // #dr1584-f + template void f(const T *); // expected-note {{candidate template ignored}} template void g(T *, const T * = 0); - template void h(T *) { T::error; } - // since-cxx11-error@-1 {{type 'void ()' cannot be used prior to '::' because it has no members}} - // since-cxx11-note@#dr1584-h {{in instantiation of function template specialization 'dr1584::h' requested here}} + template void h(T *) { T::error; } // expected-error {{no members}} template void h(const T *); void i() { - f(&i); - // since-cxx11-error@-1 {{no matching function for call to 'f'}} - // since-cxx11-note@#dr1584-f {{candidate template ignored: could not match 'const T *' against 'void (*)()'}} + f(&i); // expected-error {{no matching function}} g(&i); - h(&i); // #dr1584-h + h(&i); // expected-note {{here}} } -#endif } namespace dr1589 { // dr1589: 3.7 c++11 -#if __cplusplus >= 201103L // Ambiguous ranking of list-initialization sequences void f0(long, int=0); // Would make selection of #0 ambiguous void g2() { f2({"foo","bar"}); } // chooses #4 namespace with_error { - void f0(long); - void f0(std::initializer_list); // #dr1589-f0-ilist - void f0(std::initializer_list, int = 0); // #dr1589-f0-ilist-int - void g0() { f0({1L}); } - // since-cxx11-error@-1 {{call to 'f0' is ambiguous}} - // since-cxx11-note@#dr1589-f0-ilist {{candidate 
function}} - // since-cxx11-note@#dr1589-f0-ilist-int {{candidate function}} - - void f1(int); - void f1(std::initializer_list); // #dr1589-f1-ilist - void f1(std::initializer_list, int = 0); // #dr1589-f1-ilist-long - void g1() { f1({42}); } - // since-cxx11-error@-1 {{call to 'f1' is ambiguous}} - // since-cxx11-note@#dr1589-f1-ilist {{candidate function}} - // since-cxx11-note@#dr1589-f1-ilist-long {{candidate function}} - - void f2(std::pair); - void f2(std::initializer_list); // #dr1589-f2-ilist - void f2(std::initializer_list, int = 0); // #dr1589-f2-ilist-int - void g2() { f2({"foo","bar"}); } - // since-cxx11-error@-1 {{call to 'f2' is ambiguous}} - // since-cxx11-note@#dr1589-f2-ilist {{candidate function}} - // since-cxx11-note@#dr1589-f2-ilist-int {{candidate function}} + void f0(long); // #0 + void f0(std::initializer_list); // #00 expected-note {{candidate function}} + void f0(std::initializer_list, int = 0); // expected-note {{candidate function}} + void g0() { f0({1L}); } // expected-error{{call to 'f0' is ambiguous}} + + void f1(int); // #1 + void f1(std::initializer_list); // #2 expected-note {{candidate function}} + void f1(std::initializer_list, int = 0); // expected-note {{candidate function}} + void g1() { f1({42}); } // expected-error{{call to 'f1' is ambiguous}} + + void f2(std::pair); // #3 + void f2(std::initializer_list); // #4 expected-note {{candidate function}} + void f2(std::initializer_list, int = 0); // expected-note {{candidate function}} + void g2() { f2({"foo","bar"}); } // expected-error{{call to 'f2' is ambiguous}} } -#endif + } // dr1589 -namespace dr1591 { //dr1591. Deducing array bound and element type from initializer list -#if __cplusplus >= 201103L +namespace dr1591 { //dr1591. Deducing array bound and element type from initializer list template int h(T const(&)[N]); int X = h({1,2,3}); // T deduced to int, N deduced to 3 @@ -581,10 +451,8 @@ namespace dr1591 { //dr1591. Deducing array bound and element type from initial int Y = j({42}); // T deduced to int, array bound not considered struct Aggr { int i; int j; }; - template int k(Aggr const(&)[N]); // #dr1591-k - int Y0 = k({1,2,3}); - // since-cxx11-error@-1 {{no matching function for call to 'k'}} - // since-cxx11-note@#dr1591-k {{candidate function [with N = 3] not viable: no known conversion from 'int' to 'const Aggr' for 1st argument}} + template int k(Aggr const(&)[N]); //expected-note{{not viable}} + int Y0 = k({1,2,3}); //expected-error{{no matching function}} int Z = k({{1},{2},{3}}); // OK, N deduced to 3 template int m(int const(&)[M][N]); @@ -595,64 +463,54 @@ namespace dr1591 { //dr1591. 
Deducing array bound and element type from initial namespace check_multi_dim_arrays { - template int ***f(const T (&a)[N][M][O]); // #dr1591-f-3 - template int **f(const T (&a)[N][M]); // #dr1591-f-2 + template int ***f(const T (&a)[N][M][O]); //expected-note{{deduced conflicting values}} + template int **f(const T (&a)[N][M]); //expected-note{{couldn't infer}} - template int *f(const T (&a)[N]); // #dr1591-f-1 + template int *f(const T (&a)[N]); //expected-note{{couldn't infer}} int ***p3 = f({ { {1,2}, {3, 4} }, { {5,6}, {7, 8} }, { {9,10}, {11, 12} } }); - int ***p33 = f({ { {1,2}, {3, 4} }, { {5,6}, {7, 8} }, { {9,10}, {11, 12, 13} } }); - // since-cxx11-error@-1 {{no matching function for call to 'f'}} - // since-cxx11-note@#dr1591-f-2 {{candidate template ignored: couldn't infer template argument 'T'}} - // since-cxx11-note@#dr1591-f-1 {{candidate template ignored: couldn't infer template argument 'T'}} - // since-cxx11-note@#dr1591-f-3 {{candidate template ignored: deduced conflicting values for parameter 'O' (2 vs. 3)}} + int ***p33 = f({ { {1,2}, {3, 4} }, { {5,6}, {7, 8} }, { {9,10}, {11, 12, 13} } }); //expected-error{{no matching}} int **p2 = f({ {1,2,3}, {3, 4, 5} }); int **p22 = f({ {1,2}, {3, 4} }); int *p1 = f({1, 2, 3}); } namespace check_multi_dim_arrays_rref { - template int ***g(T (&&a)[N][M][O]); // #dr1591-g-3 - template int **g(T (&&a)[N][M]); // #dr1591-g-2 + template int ***f(T (&&a)[N][M][O]); //expected-note{{deduced conflicting values}} + template int **f(T (&&a)[N][M]); //expected-note{{couldn't infer}} - template int *g(T (&&a)[N]); // #dr1591-g-1 - int ***p3 = g({ { {1,2}, {3, 4} }, { {5,6}, {7, 8} }, { {9,10}, {11, 12} } }); - int ***p33 = g({ { {1,2}, {3, 4} }, { {5,6}, {7, 8} }, { {9,10}, {11, 12, 13} } }); - // since-cxx11-error@-1 {{no matching function for call to 'g'}} - // since-cxx11-note@#dr1591-g-2 {{candidate template ignored: couldn't infer template argument 'T'}} - // since-cxx11-note@#dr1591-g-1 {{candidate template ignored: couldn't infer template argument 'T'}} - // since-cxx11-note@#dr1591-g-3 {{candidate template ignored: deduced conflicting values for parameter 'O' (2 vs. 
3)}} - int **p2 = g({ {1,2,3}, {3, 4, 5} }); - int **p22 = g({ {1,2}, {3, 4} }); - int *p1 = g({1, 2, 3}); + template int *f(T (&&a)[N]); //expected-note{{couldn't infer}} + int ***p3 = f({ { {1,2}, {3, 4} }, { {5,6}, {7, 8} }, { {9,10}, {11, 12} } }); + int ***p33 = f({ { {1,2}, {3, 4} }, { {5,6}, {7, 8} }, { {9,10}, {11, 12, 13} } }); //expected-error{{no matching}} + int **p2 = f({ {1,2,3}, {3, 4, 5} }); + int **p22 = f({ {1,2}, {3, 4} }); + int *p1 = f({1, 2, 3}); } namespace check_arrays_of_init_list { - template float *h(const std::initializer_list (&)[N]); - template double *h(const T(&)[N]); - double *p = h({1, 2, 3}); - float *fp = h({{1}, {1, 2}, {1, 2, 3}}); + template float *f(const std::initializer_list (&)[N]); + template double *f(const T(&)[N]); + double *p = f({1, 2, 3}); + float *fp = f({{1}, {1, 2}, {1, 2, 3}}); } namespace core_reflector_28543 { - template int *i(T (&&)[N]); // #1 - template char *i(std::initializer_list &&); // #2 - template int **i(T (&&)[N][M]); // #3 #dr1591-i-2 - template char **i(std::initializer_list (&&)[N]); // #4 #dr1591-i-1 + template int *f(T (&&)[N]); // #1 + template char *f(std::initializer_list &&); //#2 + template int **f(T (&&)[N][M]); //#3 expected-note{{candidate}} + template char **f(std::initializer_list (&&)[N]); //#4 expected-note{{candidate}} - template short *i(T (&&)[2]); // #5 + template short *f(T (&&)[2]); //#5 template using Arr = T[]; - char *pc = i({1, 2, 3}); // OK prefer #2 via 13.3.3.2 [over.ics.rank] - char *pc2 = i({1, 2}); // #2 also - int *pi = i(Arr{1, 2, 3}); // OK prefer #1 + char *pc = f({1, 2, 3}); // OK prefer #2 via 13.3.3.2 [over.ics.rank] + char *pc2 = f({1, 2}); // #2 also + int *pi = f(Arr{1, 2, 3}); // OK prefer #1 - void *pv1 = i({ {1, 2, 3}, {4, 5, 6} }); // ambiguous btw 3 & 4 - // since-cxx11-error@-1 {{call to 'i' is ambiguous}} - // since-cxx11-note@#dr1591-i-2 {{candidate function [with T = int, N = 2, M = 3]}} - // since-cxx11-note@#dr1591-i-1 {{candidate function [with T = int, N = 2]}} - char **pcc = i({ {1}, {2, 3} }); // OK #4 + void *pv1 = f({ {1, 2, 3}, {4, 5, 6} }); // expected-error{{ambiguous}} btw 3 & 4 + char **pcc = f({ {1}, {2, 3} }); // OK #4 - short *ps = i(Arr{1, 2}); // OK #5 + short *ps = f(Arr{1, 2}); // OK #5 } -#endif } // dr1591 + +#endif diff --git a/clang/test/CXX/drs/dr16xx.cpp b/clang/test/CXX/drs/dr16xx.cpp index 3f074c4d57354..cbf0b487ba155 100644 --- a/clang/test/CXX/drs/dr16xx.cpp +++ b/clang/test/CXX/drs/dr16xx.cpp @@ -1,14 +1,12 @@ -// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify=expected,cxx98-14,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected,cxx98-14,since-cxx11,cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,cxx98-14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s 
-verify=expected,since-cxx14,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors - -#if __cplusplus == 199711L +// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++2a -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors + +#if __cplusplus < 201103L +// expected-error@+1 {{variadic macro}} #define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) -// cxx98-error@-1 {{variadic macros are a C99 feature}} #endif #if __cplusplus >= 201103L @@ -27,7 +25,9 @@ namespace std { namespace dr1601 { // dr1601: 10 enum E : char { e }; -// cxx98-error@-1 {{enumeration types with a fixed underlying type are a C++11 extension}} +#if __cplusplus < 201103L + // expected-error@-2 {{enumeration types with a fixed underlying type are a C++11 extension}} +#endif void f(char); void f(int); void g() { @@ -53,20 +53,16 @@ namespace dr1631 { // dr1631: 3.7 void f(int, A); void test() { - f({0}, {{1}}); - // since-cxx11-warning@-1 {{braces around scalar initializer}} + f({0}, {{1}}); // expected-warning {{braces around scalar init}} } namespace with_error { void f(B, int); // TODO: expected- note {{candidate function}} - void f(int, A); // #dr1631-f - void f(int, A, int = 0); // #dr1631-f-int + void f(int, A); // expected-note {{candidate function}} + void f(int, A, int = 0); // expected-note {{candidate function}} void test() { - f({0}, {{1}}); - // since-cxx11-error@-1 {{call to 'f' is ambiguous}} - // since-cxx11-note@#dr1631-f {{candidate function}} - // since-cxx11-note@#dr1631-f-int {{candidate function}} + f({0}, {{1}}); // expected-error{{call to 'f' is ambiguous}} } } #endif @@ -75,8 +71,8 @@ namespace dr1631 { // dr1631: 3.7 namespace dr1638 { // dr1638: 3.1 #if __cplusplus >= 201103L template struct A { - enum class E; // #dr1638-E - enum class F : T; // #dr1638-F + enum class E; // expected-note {{previous}} + enum class F : T; // expected-note 2{{previous}} }; template<> enum class A::E; @@ -87,27 +83,16 @@ namespace dr1638 { // dr1638: 3.1 template<> enum class A::E : int; template<> enum class A::E : int {}; - template<> enum class A::F; - // since-cxx11-error@-1 {{enumeration redeclared with different underlying type 'int' (was 'short')}} - // since-cxx11-note@#dr1638-F {{previous declaration is here}} - template<> enum class A::E : char; - // since-cxx11-error@-1 {{enumeration redeclared with different underlying type 'char' (was 'int')}} - // since-cxx11-note@#dr1638-E {{previous declaration is here}} - template<> enum class A::F : int; - // since-cxx11-error@-1 {{enumeration redeclared with different underlying type 'int' (was 'char')}} - // since-cxx11-note@#dr1638-F {{previous declaration is here}} - - enum class A::E; - // since-cxx11-error@-1 {{template specialization requires 'template<>'}} - // since-cxx11-error@-2 {{forward declaration of enum class cannot have a nested name specifier}} - template enum class A::E; - // since-cxx11-error@-1 {{enumerations cannot be explicitly instantiated}} - enum class A::E *e; - // since-cxx11-error@-1 {{reference to enumeration must 
use 'enum' not 'enum class'}} + template<> enum class A::F; // expected-error {{different underlying type}} + template<> enum class A::E : char; // expected-error {{different underlying type}} + template<> enum class A::F : int; // expected-error {{different underlying type}} + + enum class A::E; // expected-error {{template specialization requires 'template<>'}} expected-error {{nested name specifier}} + template enum class A::E; // expected-error {{enumerations cannot be explicitly instantiated}} + enum class A::E *e; // expected-error {{must use 'enum' not 'enum class'}} struct B { - friend enum class A::E; - // since-cxx11-error@-1 {{reference to enumeration must use 'enum' not 'enum class'}} + friend enum class A::E; // expected-error {{must use 'enum' not 'enum class'}} }; #endif } @@ -115,50 +100,37 @@ namespace dr1638 { // dr1638: 3.1 namespace dr1645 { // dr1645: 3.9 #if __cplusplus >= 201103L struct A { - constexpr A(int, float = 0); // #dr1645-int-float - explicit A(int, int = 0); // #dr1645-int-int - A(int, int, int = 0) = delete; // #dr1645-int-int-int + constexpr A(int, float = 0); // expected-note {{candidate}} + explicit A(int, int = 0); // expected-note 2{{candidate}} + A(int, int, int = 0) = delete; // expected-note {{candidate}} }; struct B : A { - using A::A; // #dr1645-using + using A::A; // expected-note 4{{inherited here}} }; - constexpr B a(0); - // since-cxx11-error@-1 {{call to constructor of 'const B' is ambiguous}} - // since-cxx11-note@#dr1645-int-float {{candidate inherited constructor}} - // since-cxx11-note@#dr1645-using {{constructor from base class 'A' inherited here}} - // since-cxx11-note@#dr1645-int-int {{candidate inherited constructor}} - // since-cxx11-note@#dr1645-using {{constructor from base class 'A' inherited here}} - constexpr B b(0, 0); - // since-cxx11-error@-1 {{call to constructor of 'const B' is ambiguous}} - // since-cxx11-note@#dr1645-int-int {{candidate inherited constructor}} - // since-cxx11-note@#dr1645-using {{constructor from base class 'A' inherited here}} - // since-cxx11-note@#dr1645-int-int-int {{candidate inherited constructor has been explicitly deleted}} - // since-cxx11-note@#dr1645-using {{constructor from base class 'A' inherited here}} + constexpr B a(0); // expected-error {{ambiguous}} + constexpr B b(0, 0); // expected-error {{ambiguous}} #endif } namespace dr1652 { // dr1652: 3.6 int a, b; - int arr[&a + 1 == &b ? 1 : 2]; - // expected-error@-1 {{variable length arrays in C++ are a Clang extension}} - // expected-note@-2 {{comparison against pointer '&a + 1' that points past the end of a complete object has unspecified value}} - // expected-error@-3 {{variable length array declaration not allowed at file scope}} + int arr[&a + 1 == &b ? 
1 : 2]; // expected-error 2{{variable length array}} + // expected-note@-1 {{points past the end}} } namespace dr1653 { // dr1653: 4 c++17 void f(bool b) { ++b; - // cxx98-14-warning@-1 {{incrementing expression of type bool is deprecated and incompatible with C++17}} - // since-cxx17-error@-2 {{ISO C++17 does not allow incrementing expression of type bool}} b++; - // cxx98-14-warning@-1 {{incrementing expression of type bool is deprecated and incompatible with C++17}} - // since-cxx17-error@-2 {{ISO C++17 does not allow incrementing expression of type bool}} - --b; - // expected-error@-1 {{cannot decrement expression of type bool}} - b--; - // expected-error@-1 {{cannot decrement expression of type bool}} +#if __cplusplus <= 201402L + // expected-warning@-3 {{deprecated}} expected-warning@-2 {{deprecated}} +#else + // expected-error@-5 {{incrementing expression of type bool}} expected-error@-4 {{incrementing expression of type bool}} +#endif + --b; // expected-error {{cannot decrement expression of type bool}} + b--; // expected-error {{cannot decrement expression of type bool}} b += 1; // ok b -= 1; // ok } @@ -166,88 +138,71 @@ namespace dr1653 { // dr1653: 4 c++17 namespace dr1658 { // dr1658: 5 namespace DefCtor { - class A { A(); }; // #dr1658-A1 - class B { ~B(); }; // #dr1658-B1 + class A { A(); }; // expected-note 0-2{{here}} + class B { ~B(); }; // expected-note 0-2{{here}} // The stars align! An abstract class does not construct its virtual bases. struct C : virtual A { C(); virtual void foo() = 0; }; - C::C() = default; // ok, not deleted - // cxx98-error@-1 {{defaulted function definitions are a C++11 extension}} + C::C() = default; // ok, not deleted, expected-error 0-1{{extension}} struct D : virtual B { D(); virtual void foo() = 0; }; - D::D() = default; // ok, not deleted - // cxx98-error@-1 {{defaulted function definitions are a C++11 extension}} + D::D() = default; // ok, not deleted, expected-error 0-1{{extension}} // In all other cases, we are not so lucky. 
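// [Editorial sketch, not part of the patch.] The C/D cases above compile
// because a constructor of an abstract class never initializes its virtual
// bases: only the most-derived object's constructor does that, and an
// abstract class can never be the most-derived object. A hypothetical
// concrete derived class would still trip over A's private default
// constructor:
//   struct Concrete : C { void foo() {} };  // 'Concrete' is illustrative only
//   Concrete c; // ill-formed: Concrete's implicit default constructor
//               // must call A(), which is inaccessible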
- struct E : A { E(); virtual void foo() = 0; }; // #dr1658-E1 - E::E() = default; // #dr1658-E1-ctor - // cxx98-error@-1 {{defaulted function definitions are a C++11 extension}} - // cxx98-error@-2 {{base class 'A' has private default constructor}} - // cxx98-note@-3 {{in defaulted default constructor for 'dr1658::DefCtor::E' first required here}} - // cxx98-note@#dr1658-A1 {{implicitly declared private here}} - // since-cxx11-error@#dr1658-E1-ctor {{defaulting this default constructor would delete it after its first declaration}} - // since-cxx11-note@#dr1658-E1 {{default constructor of 'E' is implicitly deleted because base class 'A' has an inaccessible default constructor}} - struct F : virtual A { F(); }; // #dr1658-F1 - F::F() = default; // #dr1658-F1-ctor - // cxx98-error@-1 {{defaulted function definitions are a C++11 extension}} - // cxx98-error@-2 {{inherited virtual base class 'A' has private default constructor}} - // cxx98-note@-3 {{in defaulted default constructor for 'dr1658::DefCtor::F' first required here}} - // cxx98-note@#dr1658-A1 {{implicitly declared private here}} - // since-cxx11-error@#dr1658-F1-ctor {{defaulting this default constructor would delete it after its first declaration}} - // since-cxx11-note@#dr1658-F1 {{default constructor of 'F' is implicitly deleted because base class 'A' has an inaccessible default constructor}} - - struct G : B { G(); virtual void foo() = 0; }; // #dr1658-G1 - G::G() = default; // #dr1658-G1-ctor - // cxx98-error@-1 {{defaulted function definitions are a C++11 extension}} - // cxx98-error@#dr1658-G1 {{base class 'B' has private destructor}} - // cxx98-note@#dr1658-G1-ctor {{in defaulted default constructor for 'dr1658::DefCtor::G' first required here}} - // cxx98-note@#dr1658-B1 {{implicitly declared private here}} - // since-cxx11-error@#dr1658-G1-ctor {{defaulting this default constructor would delete it after its first declaration}} - // since-cxx11-note@#dr1658-G1 {{default constructor of 'G' is implicitly deleted because base class 'B' has an inaccessible destructor}} - struct H : virtual B { H(); }; // #dr1658-H1 - H::H() = default; // #dr1658-H1-ctor - // cxx98-error@-1 {{defaulted function definitions are a C++11 extension}} - // cxx98-error@#dr1658-H1 {{base class 'B' has private destructor}} - // cxx98-note@#dr1658-H1-ctor {{in defaulted default constructor for 'dr1658::DefCtor::H' first required here}} - // cxx98-note@#dr1658-B1 {{implicitly declared private here}} - // since-cxx11-error@#dr1658-H1-ctor {{defaulting this default constructor would delete it after its first declaration}} - // since-cxx11-note@#dr1658-H1 {{default constructor of 'H' is implicitly deleted because base class 'B' has an inaccessible destructor}} + struct E : A { E(); virtual void foo() = 0; }; +#if __cplusplus < 201103L + E::E() = default; // expected-error {{private default constructor}} expected-error {{extension}} expected-note {{here}} +#else + E::E() = default; // expected-error {{would delete}} expected-note@-4{{inaccessible default constructor}} +#endif + struct F : virtual A { F(); }; +#if __cplusplus < 201103L + F::F() = default; // expected-error {{private default constructor}} expected-error {{extension}} expected-note {{here}} +#else + F::F() = default; // expected-error {{would delete}} expected-note@-4{{inaccessible default constructor}} +#endif + + struct G : B { G(); virtual void foo() = 0; }; +#if __cplusplus < 201103L + G::G() = default; // expected-error@-2 {{private destructor}} expected-error {{extension}} expected-note 
{{here}} +#else + G::G() = default; // expected-error {{would delete}} expected-note@-4{{inaccessible destructor}} +#endif + struct H : virtual B { H(); }; +#if __cplusplus < 201103L + H::H() = default; // expected-error@-2 {{private destructor}} expected-error {{extension}} expected-note {{here}} +#else + H::H() = default; // expected-error {{would delete}} expected-note@-4{{inaccessible destructor}} +#endif } namespace Dtor { - class B { ~B(); }; // #dr1658-B2 + class B { ~B(); }; // expected-note 0-2{{here}} struct D : virtual B { ~D(); virtual void foo() = 0; }; - D::~D() = default; // ok, not deleted - // cxx98-error@-1 {{defaulted function definitions are a C++11 extension}} - - struct G : B { ~G(); virtual void foo() = 0; }; // #dr1658-G2 - G::~G() = default; // #dr1658-G2-dtor - // cxx98-error@-1 {{defaulted function definitions are a C++11 extension}} - // cxx98-error@#dr1658-G2 {{base class 'B' has private destructor}} - // cxx98-note@#dr1658-G2-dtor {{in defaulted destructor for 'dr1658::Dtor::G' first required here}} - // cxx98-note@#dr1658-B2 {{implicitly declared private here}} - // since-cxx11-error@#dr1658-G2-dtor {{defaulting this destructor would delete it after its first declaration}} - // since-cxx11-note@#dr1658-G2 {{destructor of 'G' is implicitly deleted because base class 'B' has an inaccessible destructor}} - struct H : virtual B { ~H(); }; // #dr1658-H2 - H::~H() = default; // #dr1658-H2-dtor - // cxx98-error@-1 {{defaulted function definitions are a C++11 extension}} - // cxx98-error@#dr1658-H2 {{base class 'B' has private destructor}} - // cxx98-note@#dr1658-H2-dtor {{in defaulted destructor for 'dr1658::Dtor::H' first required here}} - // cxx98-note@#dr1658-B2 {{implicitly declared private here}} - // since-cxx11-error@#dr1658-H2-dtor {{defaulting this destructor would delete it after its first declaration}} - // since-cxx11-note@#dr1658-H2 {{destructor of 'H' is implicitly deleted because base class 'B' has an inaccessible destructor}} + D::~D() = default; // ok, not deleted, expected-error 0-1{{extension}} + + struct G : B { ~G(); virtual void foo() = 0; }; +#if __cplusplus < 201103L + G::~G() = default; // expected-error@-2 {{private destructor}} expected-error {{extension}} expected-note {{here}} +#else + G::~G() = default; // expected-error {{would delete}} expected-note@-4{{inaccessible destructor}} +#endif + struct H : virtual B { ~H(); }; +#if __cplusplus < 201103L + H::~H() = default; // expected-error@-2 {{private destructor}} expected-error {{extension}} expected-note {{here}} +#else + H::~H() = default; // expected-error {{would delete}} expected-note@-4{{inaccessible destructor}} +#endif } namespace MemInit { - struct A { A(int); }; // #dr1658-A3 + struct A { A(int); }; // expected-note {{here}} struct B : virtual A { B() {} virtual void f() = 0; }; struct C : virtual A { - C() {} - // expected-error@-1 {{constructor for 'dr1658::MemInit::C' must explicitly initialize the base class 'A' which does not have a default constructor}} - // expected-note@#dr1658-A3 {{'dr1658::MemInit::A' declared here}} + C() {} // expected-error {{must explicitly initialize}} }; } @@ -265,51 +220,28 @@ namespace dr1658 { // dr1658: 5 } namespace CopyCtor { - class A { A(const A&); A(A&&); }; // #dr1658-A5 - // cxx98-error@-1 {{rvalue references are a C++11 extension}} - - struct C : virtual A { C(const C&); C(C&&); virtual void foo() = 0; }; - // cxx98-error@-1 {{rvalue references are a C++11 extension}} - C::C(const C&) = default; - // cxx98-error@-1 {{defaulted 
function definitions are a C++11 extension}} - C::C(C&&) = default; - // cxx98-error@-1 {{rvalue references are a C++11 extension}} - // cxx98-error@-2 {{defaulted function definitions are a C++11 extension}} - - struct E : A { E(const E&); E(E&&); virtual void foo() = 0; }; // #dr1658-E5 - // cxx98-error@-1 {{rvalue references are a C++11 extension}} - E::E(const E&) = default; // #dr1658-E5-copy-ctor - // cxx98-error@-1 {{defaulted function definitions are a C++11 extension}} - // cxx98-error@-2 {{base class 'A' has private copy constructor}} - // cxx98-note@-3 {{in defaulted copy constructor for 'dr1658::CopyCtor::E' first required here}} - // cxx98-note@#dr1658-A5 {{implicitly declared private here}} - // since-cxx11-error@#dr1658-E5-copy-ctor {{defaulting this copy constructor would delete it after its first declaration}} - // since-cxx11-note@#dr1658-E5 {{copy constructor of 'E' is implicitly deleted because base class 'A' has an inaccessible copy constructor}} - E::E(E&&) = default; // #dr1658-E5-move-ctor - // cxx98-error@-1 {{rvalue references are a C++11 extension}} - // cxx98-error@-2 {{defaulted function definitions are a C++11 extension}} - // cxx98-error@-3 {{base class 'A' has private move constructor}} - // cxx98-note@-4 {{in defaulted move constructor for 'dr1658::CopyCtor::E' first required here}} - // cxx98-note@#dr1658-A5 {{implicitly declared private here}} - // since-cxx11-error@#dr1658-E5-move-ctor {{defaulting this move constructor would delete it after its first declaration}} - // since-cxx11-note@#dr1658-E5 {{move constructor of 'E' is implicitly deleted because base class 'A' has an inaccessible move constructor}} - struct F : virtual A { F(const F&); F(F&&); }; // #dr1658-F5 - // cxx98-error@-1 {{rvalue references are a C++11 extension}} - F::F(const F&) = default; // #dr1658-F5-copy-ctor - // cxx98-error@-1 {{defaulted function definitions are a C++11 extension}} - // cxx98-error@-2 {{inherited virtual base class 'A' has private copy constructor}} - // cxx98-note@-3 {{in defaulted copy constructor for 'dr1658::CopyCtor::F' first required here}} - // cxx98-note@#dr1658-A5 {{implicitly declared private here}} - // since-cxx11-error@#dr1658-F5-copy-ctor {{defaulting this copy constructor would delete it after its first declaration}} - // since-cxx11-note@#dr1658-F5 {{copy constructor of 'F' is implicitly deleted because base class 'A' has an inaccessible copy constructor}} - F::F(F&&) = default; // #dr1658-F5-move-ctor - // cxx98-error@-1 {{rvalue references are a C++11 extension}} - // cxx98-error@-2 {{defaulted function definitions are a C++11 extension}} - // cxx98-error@-3 {{inherited virtual base class 'A' has private move constructor}} - // cxx98-note@-4 {{in defaulted move constructor for 'dr1658::CopyCtor::F' first required here}} - // cxx98-note@#dr1658-A5 {{implicitly declared private here}} - // since-cxx11-error@#dr1658-F5-move-ctor {{defaulting this move constructor would delete it after its first declaration}} - // since-cxx11-note@#dr1658-F5 {{move constructor of 'F' is implicitly deleted because base class 'A' has an inaccessible move constructor}} + class A { A(const A&); A(A&&); }; // expected-note 0-4{{here}} expected-error 0-1{{extension}} + + struct C : virtual A { C(const C&); C(C&&); virtual void foo() = 0; }; // expected-error 0-1{{extension}} + C::C(const C&) = default; // expected-error 0-1{{extension}} + C::C(C&&) = default; // expected-error 0-2{{extension}} + + struct E : A { E(const E&); E(E&&); virtual void foo() = 0; }; // 
expected-error 0-1{{extension}} +#if __cplusplus < 201103L + E::E(const E&) = default; // expected-error {{private copy constructor}} expected-error {{extension}} expected-note {{here}} + E::E(E&&) = default; // expected-error {{private move constructor}} expected-error 2{{extension}} expected-note {{here}} +#else + E::E(const E&) = default; // expected-error {{would delete}} expected-note@-5{{inaccessible copy constructor}} + E::E(E&&) = default; // expected-error {{would delete}} expected-note@-6{{inaccessible move constructor}} +#endif + struct F : virtual A { F(const F&); F(F&&); }; // expected-error 0-1{{extension}} +#if __cplusplus < 201103L + F::F(const F&) = default; // expected-error {{private copy constructor}} expected-error {{extension}} expected-note {{here}} + F::F(F&&) = default; // expected-error {{private move constructor}} expected-error 2{{extension}} expected-note {{here}} +#else + F::F(const F&) = default; // expected-error {{would delete}} expected-note@-5{{inaccessible copy constructor}} + F::F(F&&) = default; // expected-error {{would delete}} expected-note@-6{{inaccessible move constructor}} +#endif } // assignment case is superseded by dr2180 @@ -342,38 +274,31 @@ namespace dr1672 { // dr1672: 7 namespace dr1684 { // dr1684: 3.6 #if __cplusplus >= 201103L - struct NonLiteral { // #dr1684-struct + struct NonLiteral { // expected-note {{because}} NonLiteral(); - constexpr int f() { return 0; } - // cxx11-warning@-1 {{'constexpr' non-static member function will not be implicitly 'const' in C++14; add 'const' to avoid a change in behavior}} + constexpr int f() { return 0; } // expected-warning 0-1{{will not be implicitly 'const'}} }; constexpr int f(NonLiteral &) { return 0; } - constexpr int f(NonLiteral) { return 0; } - // since-cxx11-error@-1 {{constexpr function's 1st parameter type 'NonLiteral' is not a literal type}} - // since-cxx11-note@#dr1684-struct {{'NonLiteral' is not literal because it is not an aggregate and has no constexpr constructors other than copy or move constructors}} + constexpr int f(NonLiteral) { return 0; } // expected-error {{not a literal type}} #endif } namespace dr1687 { // dr1687: 7 template struct To { - operator T(); // #dr1687-op-T + operator T(); // expected-note 2{{first operand was implicitly converted to type 'int *'}} + // expected-note@-1 {{second operand was implicitly converted to type 'double'}} +#if __cplusplus > 201703L + // expected-note@-3 2{{operand was implicitly converted to type 'dr1687::E}} +#endif }; - int *a = To() + 100.0; - // expected-error@-1 {{invalid operands to binary expression ('To' and 'double')}} - // expected-note@#dr1687-op-T {{first operand was implicitly converted to type 'int *'}} - // since-cxx20-note@#dr1687-op-T {{second operand was implicitly converted to type 'dr1687::E2'}} - int *b = To() + To(); - // expected-error@-1 {{invalid operands to binary expression ('To' and 'To')}} - // expected-note@#dr1687-op-T {{first operand was implicitly converted to type 'int *'}} - // expected-note@#dr1687-op-T {{second operand was implicitly converted to type 'double'}} + int *a = To() + 100.0; // expected-error {{invalid operands to binary expression ('To' and 'double')}} + int *b = To() + To(); // expected-error {{invalid operands to binary expression ('To' and 'To')}} -#if __cplusplus >= 202002L +#if __cplusplus > 201703L enum E1 {}; enum E2 {}; - auto c = To() <=> To(); - // since-cxx20-error@-1 {{invalid operands to binary expression ('To' and 'To')}} - // since-cxx20-note@#dr1687-op-T {{operand was 
implicitly converted to type 'dr1687::E}} + auto c = To() <=> To(); // expected-error {{invalid operands to binary expression ('To' and 'To')}} #endif } @@ -400,14 +325,12 @@ namespace dr1691 { // dr1691: 9 void f(E); } enum M::E : int {}; - void g(M::E); // #dr1691-g + void g(M::E); // expected-note {{declared here}} } void test() { N::M::E e; f(e); // ok - g(e); - // since-cxx11-error@-1 {{use of undeclared identifier 'g'; did you mean 'N::g'?}} - // since-cxx11-note@#dr1691-g {{'N::g' declared here}} + g(e); // expected-error {{use of undeclared}} } #endif } @@ -433,9 +356,7 @@ namespace dr1696 { // dr1696: 7 extern struct A a; struct A { const A &x = { A{a, a} }; - const A &y = { A{} }; - // since-cxx14-error@-1 {{default member initializer for 'y' needed within definition of enclosing class 'A' outside of member functions}} - // since-cxx14-note@-2 {{default member initializer declared here}} + const A &y = { A{} }; // expected-error {{default member initializer for 'y' needed within definition of enclosing class 'A' outside of member functions}} expected-note {{here}} }; A a{a, a}; #endif @@ -444,20 +365,16 @@ namespace dr1696 { // dr1696: 7 struct A { A(); ~A(); }; #if __cplusplus >= 201103L struct B { - A &&a; // #dr1696-a - B() : a{} {} - // since-cxx11-error@-1 {{reference member 'a' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} - // since-cxx11-note@#dr1696-a {{reference member declared here}} + A &&a; // expected-note {{declared here}} + B() : a{} {} // expected-error {{reference member 'a' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} } b; #endif struct C { C(); - const A &a; // #dr1696-C-a + const A &a; // expected-note {{declared here}} }; - C::C() : a(A()) {} - // expected-error@-1 {{reference member 'a' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} - // expected-note@#dr1696-C-a {{reference member declared here}} + C::C() : a(A()) {} // expected-error {{reference member 'a' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} #if __cplusplus >= 201103L // This is OK in C++14 onwards, per DR1815, though we don't support that yet: @@ -466,62 +383,51 @@ namespace dr1696 { // dr1696: 7 // D1 d1 = {A()}; // ... which lifetime-extends the A temporary. 
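// [Editorial sketch, not part of the patch.] Under the DR1815 resolution,
// aggregate initialization that uses a default member initializer
// lifetime-extends the temporary bound to the reference member:
//   struct Agg { const A &a = A(); };  // 'Agg' is illustrative only
//   Agg agg = {};  // per DR1815, the A temporary lives as long as 'agg'
// Clang at this point does not implement that extension, which is why the
// D1 case below only warns (C++14 onwards) instead of extending the lifetime.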
struct D1 { - // cxx11-error@-1 {{reference member 'a' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} - // cxx11-note@#dr1696-d1 {{in implicit default constructor for 'dr1696::D1' first required here}} - // cxx11-note@#dr1696-D1-a {{initializing field 'a' with default member initializer}} - const A &a = A(); // #dr1696-D1-a +#if __cplusplus < 201402L + // expected-error@-2 {{binds to a temporary}} +#endif + const A &a = A(); // expected-note {{default member init}} }; - D1 d1 = {}; // #dr1696-d1 - // since-cxx14-warning@-1 {{sorry, lifetime extension of temporary created by aggregate initialization using default member initializer is not supported; lifetime of temporary will end at the end of the full-expression}} - // since-cxx14-note@#dr1696-D1-a {{initializing field 'a' with default member initializer}} + D1 d1 = {}; +#if __cplusplus < 201402L + // expected-note@-2 {{first required here}} +#else + // expected-warning-re@-4 {{sorry, lifetime extension {{.*}} not supported}} +#endif struct D2 { - const A &a = A(); // #dr1696-D2-a - D2() {} - // since-cxx11-error@-1 {{reference member 'a' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} - // since-cxx11-note@#dr1696-D2-a {{initializing field 'a' with default member initializer}} + const A &a = A(); // expected-note {{default member init}} + D2() {} // expected-error {{binds to a temporary}} }; - struct D3 { - // since-cxx11-error@-1 {{reference member 'a' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} - // since-cxx11-note@#dr1696-d3 {{in implicit default constructor for 'dr1696::D3' first required here}} - // since-cxx11-note@#dr1696-D3-a {{initializing field 'a' with default member initializer}} - const A &a = A(); // #dr1696-D3-a + struct D3 { // expected-error {{binds to a temporary}} + const A &a = A(); // expected-note {{default member init}} }; - D3 d3; // #dr1696-d3 + D3 d3; // expected-note {{first required here}} struct haslist1 { - std::initializer_list il; // #dr1696-il-1 - haslist1(int i) : il{i, 2, 3} {} - // since-cxx11-error@-1 {{backing array for 'std::initializer_list' member 'il' is a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} - // since-cxx11-note@#dr1696-il-1 {{'std::initializer_list' member declared here}} + std::initializer_list il; // expected-note {{'std::initializer_list' member}} + haslist1(int i) : il{i, 2, 3} {} // expected-error {{backing array for 'std::initializer_list' member 'il' is a temporary object}} }; struct haslist2 { - std::initializer_list il; // #dr1696-il-2 + std::initializer_list il; // expected-note {{'std::initializer_list' member}} haslist2(); }; - haslist2::haslist2() : il{1, 2} {} - // since-cxx11-error@-1 {{backing array for 'std::initializer_list' member 'il' is a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} - // since-cxx11-note@#dr1696-il-2 {{'std::initializer_list' member declared here}} + haslist2::haslist2() : il{1, 2} {} // expected-error {{backing array for 'std::initializer_list' member 'il' is a temporary object}} struct haslist3 { std::initializer_list il = {1, 2, 3}; }; - struct haslist4 { - // since-cxx11-error@-1 {{backing array for 'std::initializer_list' member 'il' is a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} - // since-cxx11-note@#dr1696-hl4 {{in 
implicit default constructor for 'dr1696::haslist4' first required here}} - // since-cxx11-note@#dr1696-il-4 {{initializing field 'il' with default member initializer}} - std::initializer_list il = {1, 2, 3}; // #dr1696-il-4 + struct haslist4 { // expected-error {{backing array for 'std::initializer_list' member 'il' is a temporary object}} + std::initializer_list il = {1, 2, 3}; // expected-note {{default member initializer}} }; - haslist4 hl4; // #dr1696-hl4 + haslist4 hl4; // expected-note {{in implicit default constructor}} struct haslist5 { - std::initializer_list il = {1, 2, 3}; // #dr1696-il-5 - haslist5() {} - // since-cxx11-error@-1 {{backing array for 'std::initializer_list' member 'il' is a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} - // since-cxx11-note@#dr1696-il-5 {{initializing field 'il' with default member initializer}} + std::initializer_list il = {1, 2, 3}; // expected-note {{default member initializer}} + haslist5() {} // expected-error {{backing array for 'std::initializer_list' member 'il' is a temporary object}} }; #endif } diff --git a/clang/test/CXX/drs/dr17xx.cpp b/clang/test/CXX/drs/dr17xx.cpp index 0c44fb231ce51..219119d1a4cd0 100644 --- a/clang/test/CXX/drs/dr17xx.cpp +++ b/clang/test/CXX/drs/dr17xx.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -std=c++98 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++11 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++14 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++98 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++11 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++14 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++17 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++20 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++23 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++2c %s -verify -fexceptions -fcxx-exceptions -pedantic-errors namespace dr1710 { // dr1710: no // FIXME: all of the following is well-formed @@ -32,17 +32,13 @@ namespace dr1715 { // dr1715: 3.9 struct D : B { using B::B; }; - struct E : B { // #dr1715-E - template E(T t, typename T::Q q) : B(t, q) {} // #dr1715-E-ctor + struct E : B { // expected-note 2{{candidate}} + template E(T t, typename T::Q q) : B(t, q) {} // expected-note {{'Q' is a private member}} }; B b(S(), 1); D d(S(), 2); - E e(S(), 3); - // since-cxx11-error@-1 {{no matching constructor for initialization of 'E'}} - // since-cxx11-note@#dr1715-E-ctor {{candidate template ignored: substitution failure [with T = S]: 'Q' is a private member of 'dr1715::S'}} - // since-cxx11-note@#dr1715-E 
{{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 2 were provided}} + E e(S(), 3); // expected-error {{no match}} #endif } @@ -77,15 +73,12 @@ struct S { struct L : S { using S::S; }; - typename T::type value; - // since-cxx11-error@-1 {{type 'int' cannot be used prior to '::' because it has no members}} - // since-cxx11-note@#dr1736-l {{in instantiation of function template specialization 'dr1736::S::S' requested here}} - // since-cxx11-note@#dr1736-s {{in instantiation of function template specialization 'dr1736::S::S' requested here}} - L l(value); // #dr1736-l + typename T::type value; // expected-error {{no member}} + L l(value); // expected-note {{instantiation of}} } }; struct Q { typedef int type; } q; -S s(q); // #dr1736-s +S s(q); // expected-note {{instantiation of}} #endif } @@ -98,23 +91,18 @@ namespace dr1753 { // dr1753: 11 n.~T(); n.T::~T(); - n.dr1753::~T(); - // expected-error@-1 {{'dr1753' does not refer to a type name in pseudo-destructor expression; expected the name of type 'T' (aka 'int')}} + n.dr1753::~T(); // expected-error {{'dr1753' does not refer to a type name in pseudo-destructor}} n.dr1753::T::~T(); - n.A::~T(); - // expected-error@-1 {{the type of object expression ('T' (aka 'int')) does not match the type being destroyed ('A') in pseudo-destructor expression}} + n.A::~T(); // expected-error {{the type of object expression ('T' (aka 'int')) does not match the type being destroyed ('A') in pseudo-destructor expression}} n.A::T::~T(); - n.B::~T(); - // expected-error@-1 {{'B' does not refer to a type name in pseudo-destructor expression; expected the name of type 'T' (aka 'int')}} + n.B::~T(); // expected-error {{'B' does not refer to a type name in pseudo-destructor expression}} n.B::T::~T(); #if __cplusplus >= 201103L - n.decltype(n)::~T(); - // since-cxx11-error@-1 {{'decltype(n)' (aka 'int') is not a class, namespace, or enumeration}} - n.T::~decltype(n)(); - // since-cxx11-error@-1 {{expected a class name after '~' to name a destructor}} + n.decltype(n)::~T(); // expected-error {{not a class, namespace, or enumeration}} + n.T::~decltype(n)(); // expected-error {{expected a class name after '~'}} n.~decltype(n)(); // OK #endif } @@ -153,9 +141,9 @@ namespace dr1758 { // dr1758: 3.7 namespace dr1762 { // dr1762: 14 #if __cplusplus >= 201103L float operator ""_E(const char *); + // expected-error@+2 {{invalid suffix on literal; C++11 requires a space between literal and identifier}} + // expected-warning@+1 {{user-defined literal suffixes not starting with '_' are reserved; no literal will invoke this operator}} float operator ""E(const char *); - // since-cxx11-error@-1 {{invalid suffix on literal; C++11 requires a space between literal and identifier}} - // since-cxx11-warning@-2 {{user-defined literal suffixes not starting with '_' are reserved; no literal will invoke this operator}} #endif } diff --git a/clang/test/CXX/drs/dr18xx.cpp b/clang/test/CXX/drs/dr18xx.cpp index fbe67bd0c2f6d..49c1167220875 100644 --- a/clang/test/CXX/drs/dr18xx.cpp +++ b/clang/test/CXX/drs/dr18xx.cpp @@ -1,14 +1,14 @@ -// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify=expected,cxx98 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected,cxx11-17,since-cxx11 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s 
-verify=expected,cxx11-17,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,cxx11-17,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors - -#if __cplusplus == 199711L +// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors + +#if __cplusplus < 201103L +// expected-error@+1 {{variadic macro}} #define static_assert(...) 
__extension__ _Static_assert(__VA_ARGS__) -// cxx98-error@-1 {{variadic macros are a C99 feature}} #endif namespace dr1812 { // dr1812: no @@ -16,7 +16,7 @@ namespace dr1812 { // dr1812: no #if __cplusplus >= 201103L template struct A { using B = typename T::C; - // since-cxx11-error@-1 {{use 'template' keyword to treat 'C' as a dependent template name}} + // expected-error@-1 {{use 'template' keyword to treat 'C' as a dependent template name}} }; #endif } // namespace dr1812 @@ -54,16 +54,11 @@ namespace dr1814 { // dr1814: yes namespace dr1815 { // dr1815: no #if __cplusplus >= 201402L // FIXME: needs codegen test - struct A { int &&r = 0; }; // #dr1815-A - A a = {}; - // since-cxx14-warning@-1 {{sorry, lifetime extension of temporary created by aggregate initialization using default member initializer is not supported; lifetime of temporary will end at the end of the full-expression}} FIXME - // since-cxx14-note@#dr1815-A {{initializing field 'r' with default member initializer}} - - struct B { int &&r = 0; }; // #dr1815-B - // since-cxx14-error@-1 {{reference member 'r' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} - // since-cxx14-note@#dr1815-B {{initializing field 'r' with default member initializer}} - // since-cxx14-note@#dr1815-b {{in implicit default constructor for 'dr1815::B' first required here}} - B b; // #dr1815-b + struct A { int &&r = 0; }; // expected-note {{default member init}} + A a = {}; // FIXME expected-warning {{not supported}} + + struct B { int &&r = 0; }; // expected-error {{binds to a temporary}} expected-note {{default member init}} + B b; // expected-note {{here}} #endif } @@ -85,9 +80,9 @@ struct A { namespace dr1822 { // dr1822: yes #if __cplusplus >= 201103L - double a; + int a; auto x = [] (int a) { - static_assert(__is_same(decltype(a), int), "should be resolved to lambda parameter"); +#pragma clang __debug dump a // CHECK: ParmVarDecl }; #endif } @@ -103,22 +98,16 @@ namespace dr1837 { // dr1837: 3.3 }; class Outer { - friend auto Other::q() -> decltype(this->p()) *; - // since-cxx11-error@-1 {{invalid use of 'this' outside of a non-static member function}} + friend auto Other::q() -> decltype(this->p()) *; // expected-error {{invalid use of 'this'}} int g(); int f() { extern void f(decltype(this->g()) *); struct Inner { - static_assert(Fishg())>::value, ""); - // since-cxx11-error@-1 {{invalid use of 'this' outside of a non-static member function}} - enum { X = Fishf())>::value }; - // since-cxx11-error@-1 {{invalid use of 'this' outside of a non-static member function}} - struct Inner2 : Fishg())> { }; - // since-cxx11-error@-1 {{invalid use of 'this' outside of a non-static member function}} - friend void f(decltype(this->g()) *); - // since-cxx11-error@-1 {{invalid use of 'this' outside of a non-static member function}} - friend auto Other::q() -> decltype(this->p()) *; - // since-cxx11-error@-1 {{invalid use of 'this' outside of a non-static member function}} + static_assert(Fishg())>::value, ""); // expected-error {{invalid use of 'this'}} + enum { X = Fishf())>::value }; // expected-error {{invalid use of 'this'}} + struct Inner2 : Fishg())> { }; // expected-error {{invalid use of 'this'}} + friend void f(decltype(this->g()) *); // expected-error {{invalid use of 'this'}} + friend auto Other::q() -> decltype(this->p()) *; // expected-error {{invalid use of 'this'}} }; return 0; } @@ -146,21 +135,19 @@ namespace dr1872 { // dr1872: 9 constexpr int x = A().f(); constexpr int y = A().f(); 
- // cxx11-17-error@-1 {{constexpr variable 'y' must be initialized by a constant expression}} - // cxx11-17-note@-2 {{cannot evaluate call to virtual function in a constant expression in C++ standards before C++20}} -#if __cplusplus >= 202002L +#if __cplusplus <= 201703L + // expected-error@-2 {{constant expression}} expected-note@-2 {{call to virtual function}} +#else static_assert(y == 0); #endif // Note, this is invalid even though it would not use virtual dispatch. constexpr int y2 = A().A::f(); - // cxx11-17-error@-1 {{constexpr variable 'y2' must be initialized by a constant expression}} - // cxx11-17-note@-2 {{cannot evaluate call to virtual function in a constant expression in C++ standards before C++20}} -#if __cplusplus >= 202002L +#if __cplusplus <= 201703L + // expected-error@-2 {{constant expression}} expected-note@-2 {{call to virtual function}} +#else static_assert(y == 0); #endif - constexpr int z = A().f(); - // since-cxx11-error@-1 {{constexpr variable 'z' must be initialized by a constant expression}} - // since-cxx11-note@-2 {{non-literal type 'A' cannot be used in a constant expression}} + constexpr int z = A().f(); // expected-error {{constant expression}} expected-note {{non-literal type}} #endif } @@ -179,38 +166,33 @@ namespace dr1881 { // dr1881: 7 void dr1891() { // dr1891: 4 #if __cplusplus >= 201103L int n; - auto a = []{}; // #dr1891-a - auto b = [=]{ return n; }; // #dr1891-b + auto a = []{}; // expected-note 0-4{{}} + auto b = [=]{ return n; }; // expected-note 0-4{{}} typedef decltype(a) A; typedef decltype(b) B; static_assert(!__has_trivial_constructor(A), ""); - // since-cxx20-error@-1 {{failed}} +#if __cplusplus > 201703L + // expected-error@-2 {{failed}} +#endif static_assert(!__has_trivial_constructor(B), ""); // C++20 allows default construction for non-capturing lambdas (P0624R2). A x; - // cxx11-17-error@-1 {{no matching constructor for initialization of 'A' (aka '(lambda at}} - // cxx11-17-note@#dr1891-a {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 0 were provided}} - // cxx11-17-note@#dr1891-a {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 0 were provided}} - B y; - // since-cxx11-error@-1 {{no matching constructor for initialization of 'B' (aka '(lambda at}} - // since-cxx11-note@#dr1891-b {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 0 were provided}} - // since-cxx11-note@#dr1891-b {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 0 were provided}} +#if __cplusplus <= 201703L + // expected-error@-2 {{no matching constructor}} +#endif + B y; // expected-error {{no matching constructor}} // C++20 allows assignment for non-capturing lambdas (P0624R2). 
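// [Editorial sketch, not part of the patch.] P0624R2 gives captureless
// closure types a defaulted default constructor and copy-assignment
// operator in C++20, so assignments like 'a = a;' below are rejected only
// before C++20, while the capturing closure 'b' stays non-assignable:
//   auto id = [](int v) { return v; };  // 'id' is illustrative only
//   decltype(id) other;  // OK in C++20, no matching constructor before
//   other = id;          // OK in C++20, deleted assignment before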
a = a; - // cxx11-17-error-re@-1 {{{{object of type '\(lambda at .+\)' cannot be assigned because its copy assignment operator is implicitly deleted}}}} - // cxx11-17-note@#dr1891-a {{lambda expression begins here}} a = static_cast(a); - // cxx11-17-error-re@-1 {{{{object of type '\(lambda at .+\)' cannot be assigned because its copy assignment operator is implicitly deleted}}}} - // cxx11-17-note@#dr1891-a {{lambda expression begins here}} - b = b; - // since-cxx11-error-re@-1 {{{{object of type '\(lambda at .+\)' cannot be assigned because its copy assignment operator is implicitly deleted}}}} - // since-cxx11-note@#dr1891-b {{lambda expression begins here}} - b = static_cast(b); - // since-cxx11-error-re@-1 {{{{object of type '\(lambda at .+\)' cannot be assigned because its copy assignment operator is implicitly deleted}}}} - // since-cxx11-note@#dr1891-b {{lambda expression begins here}} +#if __cplusplus <= 201703L + // expected-error@-3 {{copy assignment operator is implicitly deleted}} + // expected-error@-3 {{copy assignment operator is implicitly deleted}} +#endif + b = b; // expected-error {{copy assignment operator is implicitly deleted}} + b = static_cast(b); // expected-error {{copy assignment operator is implicitly deleted}} #endif } diff --git a/clang/test/CXX/drs/dr19xx.cpp b/clang/test/CXX/drs/dr19xx.cpp index 716b1476831ed..73aa20cc58e2d 100644 --- a/clang/test/CXX/drs/dr19xx.cpp +++ b/clang/test/CXX/drs/dr19xx.cpp @@ -1,36 +1,41 @@ -// RUN: %clang_cc1 -std=c++98 %s -verify=expected,cxx98-11,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++11 %s -verify=expected,cxx98-11,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++14 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++98 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++11 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++14 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++1z %s -verify -fexceptions -fcxx-exceptions -pedantic-errors namespace std { struct type_info; } namespace dr1902 { // dr1902: 3.7 struct A {}; struct B { - B(A); // #dr1902-B-A - B() = delete; // #dr1902-B-ctor - // cxx98-error@-1 {{deleted function definitions are a C++11 extension}} - B(const B&) = delete; // #dr1902-B-copy-ctor - // cxx98-error@-1 {{deleted function definitions are a C++11 extension}} + B(A); +#if __cplusplus >= 201103L + // expected-note@-2 {{candidate}} +#endif + + B() = delete; +#if __cplusplus < 201103L + // expected-error@-2 {{extension}} +#endif + + B(const B&) // expected-note {{deleted here}} +#if __cplusplus >= 201103L + // expected-note@-2 {{candidate}} +#else + // expected-error@+2 {{extension}} +#endif + = delete; + operator A(); }; extern B b1; - B b2(b1); - // expected-error@-1 {{call to deleted constructor of 'B'}} - // expected-note@#dr1902-B-copy-ctor {{'B' has been explicitly marked deleted 
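[Editorial note, not part of the patch] The dr1891 hunk above leans on P0624R2: in C++20 a captureless lambda's closure type gains a default constructor and a defaulted copy assignment operator, which is why the restored old-style comments only expect errors up to C++17. A minimal sketch of that behavior, assuming a C++20 compiler:

void p0624r2_demo() {
  auto cmp = [](int lhs, int rhs) { return lhs < rhs; }; // captureless closure
  decltype(cmp) other; // OK since C++20; "no matching constructor" before
  other = cmp;         // OK since C++20; copy assignment was implicitly deleted before
}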
diff --git a/clang/test/CXX/drs/dr19xx.cpp b/clang/test/CXX/drs/dr19xx.cpp
index 716b1476831ed..73aa20cc58e2d 100644
--- a/clang/test/CXX/drs/dr19xx.cpp
+++ b/clang/test/CXX/drs/dr19xx.cpp
@@ -1,36 +1,41 @@
-// RUN: %clang_cc1 -std=c++98 %s -verify=expected,cxx98-11,cxx98 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++11 %s -verify=expected,cxx98-11,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++14 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++98 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++11 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++14 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++1z %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
 
 namespace std { struct type_info; }
 
 namespace dr1902 { // dr1902: 3.7
   struct A {};
   struct B {
-    B(A); // #dr1902-B-A
-    B() = delete; // #dr1902-B-ctor
-    // cxx98-error@-1 {{deleted function definitions are a C++11 extension}}
-    B(const B&) = delete; // #dr1902-B-copy-ctor
-    // cxx98-error@-1 {{deleted function definitions are a C++11 extension}}
+    B(A);
+#if __cplusplus >= 201103L
+    // expected-note@-2 {{candidate}}
+#endif
+
+    B() = delete;
+#if __cplusplus < 201103L
+    // expected-error@-2 {{extension}}
+#endif
+
+    B(const B&) // expected-note {{deleted here}}
+#if __cplusplus >= 201103L
+    // expected-note@-2 {{candidate}}
+#else
+    // expected-error@+2 {{extension}}
+#endif
+    = delete;
+
     operator A();
   };
 
   extern B b1;
-  B b2(b1);
-  // expected-error@-1 {{call to deleted constructor of 'B'}}
-  // expected-note@#dr1902-B-copy-ctor {{'B' has been explicitly marked deleted here}}
+  B b2(b1); // expected-error {{call to deleted}}
 
 #if __cplusplus >= 201103L
   // This is ambiguous, even though calling the B(const B&) constructor would
   // both directly and indirectly call a deleted function.
-  B b({});
-  // since-cxx11-error@-1 {{call to constructor of 'B' is ambiguous}}
-  // since-cxx11-note@#dr1902-B-A {{candidate constructor}}
-  // since-cxx11-note@#dr1902-B-copy-ctor {{candidate constructor has been explicitly deleted}}
+  B b({}); // expected-error {{ambiguous}}
 #endif
 }
@@ -61,22 +66,16 @@ namespace dr1903 {
 namespace dr1909 { // dr1909: 3.7
   struct A {
-    template<typename T> struct A {};
-    // expected-error@-1 {{member 'A' has the same name as its class}}
+    template<typename T> struct A {}; // expected-error {{member 'A' has the same name as its class}}
   };
   struct B {
-    template<typename T> void B() {}
-    // expected-error@-1 {{constructor cannot have a return type}}
+    template<typename T> void B() {} // expected-error {{constructor cannot have a return type}}
   };
   struct C {
-    template<typename T> static int C;
-    // expected-error@-1 {{member 'C' has the same name as its class}}
-    // cxx98-11-error@-2 {{variable templates are a C++14 extension}}
+    template<typename T> static int C; // expected-error {{member 'C' has the same name as its class}} expected-error 0-1{{extension}}
   };
   struct D {
-    template<typename T> using D = int;
-    // cxx98-error@-1 {{alias declarations are a C++11 extension}}
-    // expected-error@-2 {{member 'D' has the same name as its class}}
+    template<typename T> using D = int; // expected-error {{member 'D' has the same name as its class}} expected-error 0-1{{extension}}
   };
 }
@@ -84,8 +83,7 @@ namespace dr1940 { // dr1940: 3.5
 #if __cplusplus >= 201103L
 static union {
   static_assert(true, ""); // ok
-  static_assert(false, "");
-  // since-cxx11-error@-1 {{static assertion failed}}
+  static_assert(false, ""); // expected-error {{static assertion failed}}
   int not_empty;
 };
 #endif
@@ -124,18 +122,15 @@ derived d2(42, 9);
 
 namespace dr1947 { // dr1947: 3.5
 #if __cplusplus >= 201402L
 unsigned o = 0'01;  // ok
-unsigned b = 0b'01;
-// since-cxx14-error@-1 {{invalid digit 'b' in octal constant}}
-unsigned x = 0x'01;
-// since-cxx14-error@-1 {{invalid suffix 'x'01' on integer constant}}
+unsigned b = 0b'01; // expected-error {{invalid digit 'b' in octal constant}}
+unsigned x = 0x'01; // expected-error {{invalid suffix 'x'01' on integer constant}}
 #endif
 }
 
 #if __cplusplus >= 201103L
 // dr1948: 3.5
 // FIXME: This diagnostic could be improved.
-void *operator new(__SIZE_TYPE__) noexcept { return nullptr; }
-// since-cxx11-error@-1 {{exception specification in declaration does not match previous declaration}}
+void *operator new(__SIZE_TYPE__) noexcept { return nullptr; } // expected-error{{exception specification in declaration does not match previous declaration}}
 #endif
 
 namespace dr1959 { // dr1959: 3.9
@@ -144,31 +139,22 @@ namespace dr1959 { // dr1959: 3.9
   struct c;
   struct a {
     a() = default;
-    a(const a &) = delete; // #dr1959-copy-ctor
+    a(const a &) = delete; // expected-note {{deleted}}
     a(const b &) = delete; // not inherited
-    a(c &&) = delete; // #dr1959-move-ctor
-    template<typename T> a(T) = delete; // #dr1959-temp-ctor
+    a(c &&) = delete; // expected-note {{not viable}}
+    template<typename T> a(T) = delete; // expected-note {{would take its own class type by value}}
   };
 
-  struct b : a { // #dr1959-b
-    using a::a; // #dr1959-using-a
+  struct b : a { // expected-note {{cannot bind}} expected-note {{deleted because}}
+    using a::a; // expected-note 2{{inherited here}}
   };
 
   a x;
   // FIXME: As a resolution to an open DR against P0136R0, we disallow
   // use of inherited constructors to construct from a single argument
   // where the base class is reference-related to the argument type.
-  b y = x;
-  // since-cxx11-error@-1 {{no viable conversion from 'a' to 'b'}}
-  // since-cxx11-note@#dr1959-move-ctor {{candidate inherited constructor not viable: no known conversion from 'a' to 'c &&' for 1st argument}}
-  // since-cxx11-note@#dr1959-using-a {{constructor from base class 'a' inherited here}}
-  // since-cxx11-note@#dr1959-b {{candidate constructor (the implicit copy constructor) not viable: cannot bind base class object of type 'a' to derived class reference 'const b &' for 1st argument}}
-  // since-cxx11-note@#dr1959-temp-ctor {{candidate template ignored: instantiation would take its own class type by value}}
-  // since-cxx11-note@#dr1959-using-a {{constructor from base class 'a' inherited here}}
-  b z = z;
-  // since-cxx11-error@-1 {{call to implicitly-deleted copy constructor of 'b'}}
-  // since-cxx11-note@#dr1959-b {{copy constructor of 'b' is implicitly deleted because base class 'a' has a deleted copy constructor}}
-  // since-cxx11-note@#dr1959-copy-ctor {{'a' has been explicitly marked deleted here}}
+  b y = x; // expected-error {{no viable conversion}}
+  b z = z; // expected-error {{deleted}}
 
   struct c : a {
     using a::a;
@@ -205,28 +191,16 @@ using A::g;
 
 namespace dr1966 { // dr1966: 11
 #if __cplusplus >= 201103L
   struct A {
-    enum E : int {1};
-    // since-cxx11-error@-1 {{expected identifier}} (not bit-field)
+    enum E : int {1}; // expected-error {{expected identifier}} (not bit-field)
   };
-  auto *p1 = new enum E : int;
-  // since-cxx11-error@-1 {{non-defining declaration of enumeration with a fixed underlying type is only permitted as a standalone declaration}}
-  auto *p2 = new enum F : int {};
-  // since-cxx11-error@-1 {{non-defining declaration of enumeration with a fixed underlying type is only permitted as a standalone declaration}}
-  auto *p3 = true ? new enum G : int {};
-  // since-cxx11-error@-1 {{ISO C++ forbids forward references to 'enum' types}}
-  // since-cxx11-error@-2 {{allocation of incomplete type 'enum G'}}
-  // since-cxx11-note@-3 {{forward declaration of 'dr1966::G'}}
-  auto h() -> enum E : int {};
-  // since-cxx11-error@-1 {{non-defining declaration of enumeration with a fixed underlying type is only permitted as a standalone declaration}}
-
-  enum X : enum Y : int {} {};
-  // since-cxx11-error@-1 {{'dr1966::Y' cannot be defined in a type specifier}}
+  auto *p1 = new enum E : int; // expected-error {{only permitted as a standalone declaration}}
+  auto *p2 = new enum F : int {}; // expected-error {{only permitted as a standalone declaration}}
+  auto *p3 = true ? new enum G : int {}; // expected-error {{forward reference}} expected-error {{incomplete}} expected-note {{declaration}}
+  auto h() -> enum E : int {}; // expected-error {{only permitted as a standalone declaration}}
+
+  enum X : enum Y : int {} {}; // expected-error {{cannot be defined in a type specifier}}
   struct Q {
-    // FIXME: can we emit something nicer than that?
-    enum X : enum Y : int {} {};
-    // since-cxx11-error@-1 {{non-defining declaration of enumeration with a fixed underlying type is only permitted as a standalone declaration; missing list of enumerators?}}
-    // since-cxx11-error@-2 {{non-integral type 'enum Y' is an invalid underlying type}}
-    // since-cxx11-error@-3 {{anonymous bit-field cannot have a default member initializer}}
+    enum X : enum Y : int {} {}; // expected-error +{{}}
   };
 #endif
 }
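[Editorial note, not part of the patch] For context on the dr1947 hunk: a C++14 digit separator is only valid between the digits of a single literal, which is why `0b'01` does not lex as a binary literal at all. A small sketch of well-formed uses:

unsigned ok_octal  = 0'01;     // as accepted in the test above
unsigned ok_binary = 0b0'1;    // separator sits between binary digits
unsigned ok_hex    = 0x1'000;  // and between hex digits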
diff --git a/clang/test/CXX/drs/dr412.cpp b/clang/test/CXX/drs/dr412.cpp
index 8ea29135d1df8..af385ff463c73 100644
--- a/clang/test/CXX/drs/dr412.cpp
+++ b/clang/test/CXX/drs/dr412.cpp
@@ -1,10 +1,7 @@
 // RUN: %clang_cc1 -std=c++98 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors -DNOEXCEPT="throw()" -DBAD_ALLOC="throw(std::bad_alloc)"
 // RUN: %clang_cc1 -std=c++11 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors -DNOEXCEPT=noexcept -DBAD_ALLOC=
 // RUN: %clang_cc1 -std=c++14 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors -DNOEXCEPT=noexcept -DBAD_ALLOC=
-// RUN: %clang_cc1 -std=c++17 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors -DNOEXCEPT=noexcept -DBAD_ALLOC=
-// RUN: %clang_cc1 -std=c++20 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors -DNOEXCEPT=noexcept -DBAD_ALLOC=
-// RUN: %clang_cc1 -std=c++23 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors -DNOEXCEPT=noexcept -DBAD_ALLOC=
-// RUN: %clang_cc1 -std=c++2c %s -verify -fexceptions -fcxx-exceptions -pedantic-errors -DNOEXCEPT=noexcept -DBAD_ALLOC=
+// RUN: %clang_cc1 -std=c++1z %s -verify -fexceptions -fcxx-exceptions -pedantic-errors -DNOEXCEPT=noexcept -DBAD_ALLOC=
 
 // dr412: 3.4
 // lwg404: yes
@@ -14,17 +11,11 @@ __extension__ typedef __SIZE_TYPE__ size_t;
 
 namespace std { struct bad_alloc {}; }
 
-inline void* operator new(size_t) BAD_ALLOC;
-// expected-error@-1 {{replacement function 'operator new' cannot be declared 'inline'}}
-inline void* operator new[](size_t) BAD_ALLOC;
-// expected-error@-1 {{replacement function 'operator new[]' cannot be declared 'inline'}}
-inline void operator delete(void*) NOEXCEPT;
-// expected-error@-1 {{replacement function 'operator delete' cannot be declared 'inline'}}
-inline void operator delete[](void*) NOEXCEPT;
-// expected-error@-1 {{replacement function 'operator delete[]' cannot be declared 'inline'}}
+inline void* operator new(size_t) BAD_ALLOC; // expected-error {{cannot be declared 'inline'}}
+inline void* operator new[](size_t) BAD_ALLOC; // expected-error {{cannot be declared 'inline'}}
+inline void operator delete(void*) NOEXCEPT; // expected-error {{cannot be declared 'inline'}}
+inline void operator delete[](void*) NOEXCEPT; // expected-error {{cannot be declared 'inline'}}
 #ifdef __cpp_sized_deallocation
-inline void operator delete(void*, size_t) NOEXCEPT;
-// expected-error@-1 {{replacement function 'operator delete' cannot be declared 'inline'}}
-inline void operator delete[](void*, size_t) NOEXCEPT;
-// expected-error@-1 {{replacement function 'operator delete[]' cannot be declared 'inline'}}
+inline void operator delete(void*, size_t) NOEXCEPT; // expected-error {{cannot be declared 'inline'}}
+inline void operator delete[](void*, size_t) NOEXCEPT; // expected-error {{cannot be declared 'inline'}}
 #endif
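[Editorial note, not part of the patch] dr412 rejects `inline` on replacement allocation functions: a replacement must be a single definition with external linkage so it can actually replace the library's version. A hedged sketch of a valid, non-inline replacement (simplified; a production version would also consult the installed new-handler):

#include <cstdlib>
#include <new>

void *operator new(std::size_t n) { // not inline: accepted
  if (void *p = std::malloc(n ? n : 1))
    return p;
  throw std::bad_alloc();
}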
diff --git a/clang/test/CXX/drs/dr7xx.cpp b/clang/test/CXX/drs/dr7xx.cpp
index 926bff1cc479c..11901b80d6462 100644
--- a/clang/test/CXX/drs/dr7xx.cpp
+++ b/clang/test/CXX/drs/dr7xx.cpp
@@ -1,53 +1,37 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++98 %s -verify=expected,cxx98-14,cxx98-11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 %s -verify=expected,cxx98-14,cxx98-11,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++14 %s -verify=expected,cxx98-14,since-cxx14,since-cxx11,cxx14 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++17 %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++2a %s -verify=expected,since-cxx14,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++98 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++14 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++17 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++2a %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
 
 namespace dr705 { // dr705: yes
   namespace N {
     struct S {};
-    void f(S); // #dr705-f
+    void f(S); // expected-note {{declared here}}
   }
 
   void g() {
     N::S s;
     f(s);      // ok
-    (f)(s);
-    // expected-error@-1 {{use of undeclared identifier 'f'}}
-    // expected-note@#dr705-f {{'N::f' declared here}}
+    (f)(s);    // expected-error {{use of undeclared}}
   }
 }
 
 namespace dr712 { // dr712: partial
   void use(int);
   void f() {
-    const int a = 0; // #dr712-f-a
+    const int a = 0; // expected-note 5{{here}}
     struct X {
       void g(bool cond) {
         use(a);
         use((a));
         use(cond ? a : a);
-        // FIXME: should only warn once
-        use((cond, a));
-        // expected-warning@-1 {{left operand of comma operator has no effect}}
-        // expected-warning@-2 {{left operand of comma operator has no effect}}
-
-        (void)a;
-        // expected-error@-1 {{reference to local variable 'a' declared in enclosing function 'dr712::f'}} FIXME
-        // expected-note@#dr712-f-a {{'a' declared here}}
-        (void)(a);
-        // expected-error@-1 {{reference to local variable 'a' declared in enclosing function 'dr712::f'}} FIXME
-        // expected-note@#dr712-f-a {{'a' declared here}}
-        (void)(cond ? a : a); // #dr712-ternary
-        // expected-error@#dr712-ternary {{reference to local variable 'a' declared in enclosing function 'dr712::f'}} FIXME
-        // expected-note@#dr712-f-a {{'a' declared here}}
-        // expected-error@#dr712-ternary {{reference to local variable 'a' declared in enclosing function 'dr712::f'}} FIXME
-        // expected-note@#dr712-f-a {{'a' declared here}}
-        (void)(cond, a); // #dr712-comma
-        // expected-error@-1 {{reference to local variable 'a' declared in enclosing function 'dr712::f'}} FIXME
-        // expected-note@#dr712-f-a {{'a' declared here}}
-        // expected-warning@#dr712-comma {{left operand of comma operator has no effect}}
+        use((cond, a)); // expected-warning 2{{left operand of comma operator has no effect}} FIXME: should only warn once
+
+        (void)a; // FIXME: expected-error {{declared in enclosing}}
+        (void)(a); // FIXME: expected-error {{declared in enclosing}}
+        (void)(cond ? a : a); // FIXME: expected-error 2{{declared in enclosing}}
+        (void)(cond, a); // FIXME: expected-error {{declared in enclosing}} expected-warning {{left operand of comma operator has no effect}}
       }
     };
   }
@@ -55,18 +39,14 @@ namespace dr712 { // dr712: partial
 #if __cplusplus >= 201103L
   void g() {
     struct A { int n; };
-    constexpr A a = {0}; // #dr712-g-a
+    constexpr A a = {0}; // expected-note 2{{here}}
     struct X {
       void g(bool cond) {
         use(a.n);
         use(a.*&A::n);
-        (void)a.n;
-        // since-cxx11-error@-1 {{reference to local variable 'a' declared in enclosing function 'dr712::g'}} FIXME
-        // since-cxx11-note@#dr712-g-a {{'a' declared here}}
-        (void)(a.*&A::n);
-        // since-cxx11-error@-1 {{reference to local variable 'a' declared in enclosing function 'dr712::g'}} FIXME
-        // since-cxx11-note@#dr712-g-a {{'a' declared here}}
+        (void)a.n; // FIXME: expected-error {{declared in enclosing}}
+        (void)(a.*&A::n); // FIXME: expected-error {{declared in enclosing}}
      }
    };
  }
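[Editorial note, not part of the patch] The dr712 hunk tracks which reads of a local `const` variable inside a nested local class clang already treats as non-odr-uses (no capture or member access needed). The accepted cases from the test, in isolation:

void use(int);
void enclosing() {
  const int a = 0;  // a constant; reading it need not odr-use it
  struct X {
    void g(bool cond) {
      use(a);            // OK: immediate lvalue-to-rvalue conversion
      use(cond ? a : a); // OK for the same reason
      // &a, by contrast, would odr-use 'a' and be ill-formed here
    }
  };
}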
@@ -75,10 +55,9 @@ namespace dr727 { // dr727: partial
   struct A {
-    template<typename T> struct C; // #dr727-C
-    template<typename T> void f(); // #dr727-f
-    template<typename T> static int N; // #dr727-N
-    // cxx98-11-error@-1 {{variable templates are a C++14 extension}}
+    template<typename T> struct C; // expected-note 6{{here}}
+    template<typename T> void f(); // expected-note {{here}}
+    template<typename T> static int N; // expected-error 0-1{{C++14}} expected-note 6{{here}}
 
     template<> struct C<int>;
     template<> void f<int>();
@@ -88,40 +67,19 @@ namespace dr727 { // dr727: partial
     template<typename T> static int N<T*>;
 
     struct B {
-      template<> struct C<float>;
-      // expected-error@-1 {{class template specialization of 'C' not in class 'A' or an enclosing namespace}}
-      // expected-note@#dr727-C {{explicitly specialized declaration is here}}
-      template<> void f<float>();
-      // expected-error@-1 {{no function template matches function template specialization 'f'}}
-      template<> static int N<float>;
-      // expected-error@-1 {{variable template specialization of 'N' not in class 'A' or an enclosing namespace}}
-      // expected-note@#dr727-N {{explicitly specialized declaration is here}}
-
-      template<typename T> struct C<T**>;
-      // expected-error@-1 {{class template partial specialization of 'C' not in class 'A' or an enclosing namespace}}
-      // expected-note@#dr727-C {{explicitly specialized declaration is here}}
-      template<typename T> static int N<T**>;
-      // expected-error@-1 {{variable template partial specialization of 'N' not in class 'A' or an enclosing namespace}}
-      // expected-note@#dr727-N {{explicitly specialized declaration is here}}
-
-      template<> struct A::C<double>;
-      // expected-error@-1 {{class template specialization of 'C' not in class 'A' or an enclosing namespace}}
-      // expected-note@#dr727-C {{explicitly specialized declaration is here}}
-      template<> void A::f<double>();
-      // expected-error@-1 {{no function template matches function template specialization 'f'}}
-      // expected-error@-2 {{non-friend class member 'f' cannot have a qualified name}}
-      template<> static int A::N<double>;
-      // expected-error@-1 {{non-friend class member 'N' cannot have a qualified name}}
-      // expected-error@-2 {{variable template specialization of 'N' not in class 'A' or an enclosing namespace}}
-      // expected-note@#dr727-N {{explicitly specialized declaration is here}}
-
-      template<typename T> struct A::C<T***>;
-      // expected-error@-1 {{class template partial specialization of 'C' not in class 'A' or an enclosing namespace}}
-      // expected-note@#dr727-C {{explicitly specialized declaration is here}}
-      template<typename T> static int A::N<T***>;
-      // expected-error@-1 {{non-friend class member 'N' cannot have a qualified name}}
-      // expected-error@-2 {{variable template partial specialization of 'N' not in class 'A' or an enclosing namespace}}
-      // expected-note@#dr727-N {{explicitly specialized declaration is here}}
+      template<> struct C<float>; // expected-error {{not in class 'A' or an enclosing namespace}}
+      template<> void f<float>(); // expected-error {{no function template matches}}
+      template<> static int N<float>; // expected-error {{not in class 'A' or an enclosing namespace}}
+
+      template<typename T> struct C<T**>; // expected-error {{not in class 'A' or an enclosing namespace}}
+      template<typename T> static int N<T**>; // expected-error {{not in class 'A' or an enclosing namespace}}
+
+      template<> struct A::C<double>; // expected-error {{not in class 'A' or an enclosing namespace}}
+      template<> void A::f<double>(); // expected-error {{no function template matches}} expected-error {{cannot have a qualified name}}
+      template<> static int A::N<double>; // expected-error {{not in class 'A' or an enclosing namespace}} expected-error {{cannot have a qualified name}}
+
+      template<typename T> struct A::C<T***>; // expected-error {{not in class 'A' or an enclosing namespace}}
+      template<typename T> static int A::N<T***>; // expected-error {{not in class 'A' or an enclosing namespace}} expected-error {{cannot have a qualified name}}
     };
   };
@@ -133,36 +91,19 @@ namespace dr727 { // dr727: partial
   template<typename T> int A::N<T****>;
 
   namespace C {
-    template<> struct A::C<long>;
-    // expected-error@-1 {{class template specialization of 'C' not in class 'A' or an enclosing namespace}}
-    // expected-note@#dr727-C {{explicitly specialized declaration is here}}
-    template<> void A::f<long>();
-    // expected-error@-1 {{function template specialization of 'f' not in class 'A' or an enclosing namespace}}
-    // expected-note@#dr727-f {{explicitly specialized declaration is here}}
-    template<> int A::N<long>;
-    // expected-error@-1 {{variable template specialization of 'N' not in class 'A' or an enclosing namespace}}
-    // expected-note@#dr727-N {{explicitly specialized declaration is here}}
-
-    template<typename T> struct A::C<T*****>;
-    // expected-error@-1 {{class template partial specialization of 'C' not in class 'A' or an enclosing namespace}}
-    // expected-note@#dr727-C {{explicitly specialized declaration is here}}
-    template<typename T> int A::N<T*****>;
-    // expected-error@-1 {{variable template partial specialization of 'N' not in class 'A' or an enclosing namespace}}
-    // expected-note@#dr727-N {{explicitly specialized declaration is here}}
+    template<> struct A::C<long>; // expected-error {{not in class 'A' or an enclosing namespace}}
+    template<> void A::f<long>(); // expected-error {{not in class 'A' or an enclosing namespace}}
+    template<> int A::N<long>; // expected-error {{not in class 'A' or an enclosing namespace}}
+
+    template<typename T> struct A::C<T*****>; // expected-error {{not in class 'A' or an enclosing namespace}}
+    template<typename T> int A::N<T*****>; // expected-error {{not in class 'A' or an enclosing namespace}}
   }
 
   template<typename T> struct D {
-    template<typename U> struct C { typename T::error e; };
-    // expected-error@-1 {{type 'float' cannot be used prior to '::' because it has no members}}
-    // expected-note@#dr727-C-float {{in instantiation of template class 'dr727::D<float>::C<float>' requested here}}
-    template<typename U> void f() { T::error; }
-    // expected-error@-1 {{type 'float' cannot be used prior to '::' because it has no members}}
-    // expected-note@#dr727-f-float {{in instantiation of function template specialization 'dr727::D<float>::f<float>' requested here}}
-    template<typename U> static const int N = T::error;
-    // cxx98-11-error@-1 {{variable templates are a C++14 extension}}
-    // expected-error@-2 {{type 'float' cannot be used prior to '::' because it has no members}}
-    // expected-note@#dr727-N-float {{in instantiation of static data member 'dr727::D<float>::N<float>' requested here}}
+    template<typename U> struct C { typename T::error e; }; // expected-error {{no members}}
+    template<typename U> void f() { T::error; } // expected-error {{no members}}
+    template<typename U> static const int N = T::error; // expected-error {{no members}} expected-error 0-1{{C++14}}
 
     template<> struct C<int> {};
     template<> void f<int>() {}
@@ -173,8 +114,7 @@
   template<typename T> struct E {
-    template<> void f() {}
-    // expected-error@-1 {{no candidate function template was found for dependent member function template specialization}}
+    template<> void f() {} // expected-error {{no candidate function template}}
   };
 };
@@ -186,9 +126,9 @@ namespace dr727 { // dr727: partial
     D<int>::C<int>();
     int b = D<int>::N<int>;
 
-    D<float>::C<float>(); // #dr727-C-float
-    di.f<float>(); // #dr727-f-float
-    int c = D<float>::N<float>; // #dr727-N-float
+    D<float>::C<float>(); // expected-note {{instantiation of}}
+    di.f<float>(); // expected-note {{instantiation of}}
+    int c = D<float>::N<float>; // expected-note {{instantiation of}}
   }
 
   namespace mixed_inner_outer_specialization {
@@ -208,30 +148,28 @@ namespace dr727 { // dr727: partial
 #if __cplusplus >= 201402L
     template<int> struct B {
       template<int> static const int u = 1;
-      template<> static const int u<0> = 2; // #dr727-u0
+      template<> static const int u<0> = 2; // expected-note {{here}}
 
       // Note that in C++17 onwards, these are implicitly inline, and so the
       // initializer of v<0> is not instantiated with the declaration. In
       // C++14, v<0> is a non-defining declaration and its initializer is
       // instantiated with the class.
       template<int> static constexpr int v = 1;
-      template<> static constexpr int v<0> = 2; // #dr727-v0
+      template<> static constexpr int v<0> = 2; // #v0
 
-      template<int> static const inline int w = 1;
-      // cxx14-error@-1 {{inline variables are a C++17 extension}}
-      template<> static const inline int w<0> = 2;
-      // cxx14-error@-1 {{inline variables are a C++17 extension}}
+      template<int> static const inline int w = 1; // expected-error 0-1{{C++17 extension}}
+      template<> static const inline int w<0> = 2; // expected-error 0-1{{C++17 extension}}
     };
 
     template<> template<int> constexpr int B<0>::u = 3;
-    template<> template<> constexpr int B<0>::u<0> = 4;
-    // since-cxx14-error@-1 {{static data member 'u' already has an initializer}}
-    // since-cxx14-note@#dr727-u0 {{previous initialization is here}}
+    template<> template<> constexpr int B<0>::u<0> = 4; // expected-error {{already has an initializer}}
 
     template<> template<int> constexpr int B<0>::v = 3;
     template<> template<> constexpr int B<0>::v<0> = 4;
-    // cxx14-error@-1 {{static data member 'v' already has an initializer}}
-    // cxx14-note@#dr727-v0 {{previous initialization is here}}
+#if __cplusplus < 201702L
+    // expected-error@-2 {{already has an initializer}}
+    // expected-note@#v0 {{here}}
+#endif
 
     template<> template<int> constexpr int B<0>::w = 3;
     template<> template<> constexpr int B<0>::w<0> = 4;
@@ -244,8 +182,10 @@ namespace dr727 { // dr727: partial
     static_assert(B<1>().v<0> == 2, "");
     static_assert(B<0>().v<1> == 3, "");
     static_assert(B<0>().v<0> == 4, "");
-    // cxx14-error@-1 {{static assertion failed due to requirement 'dr727::mixed_inner_outer_specialization::B<0>().v<0> == 4'}}
-    // cxx14-note@-2 {{expression evaluates to '2 == 4'}}
+#if __cplusplus < 201702L
+    // expected-error@-2 {{failed}} \
+    // expected-note@-2 {{evaluates to '2 == 4'}}
+#endif
 
     static_assert(B<1>().w<1> == 1, "");
     static_assert(B<1>().w<0> == 2, "");
@@ -265,23 +205,13 @@ namespace dr727 { // dr727: partial
     template<> int f2<T>() {}
     template<> int f2<U>() {}
 
-    template<typename> static int v1;
-    // cxx98-11-error@-1 {{variable templates are a C++14 extension}}
-    template<> static int v1<T>; // #dr727-v1-T
-    template<> static int v1<U>;
-    // expected-error@-1 {{duplicate member 'v1'}}
-    // expected-note@#dr727-Collision-int-int {{in instantiation of template class 'dr727::Collision<int, int>' requested here}}
-    // expected-note@#dr727-v1-T {{previous}}
-
-    template<typename> static inline int v2;
-    // cxx98-11-error@-1 {{variable templates are a C++14 extension}}
-    // cxx98-14-error@-2 {{inline variables are a C++17 extension}}
-    template<> static inline int v2<T>; // #dr727-v2-T
-    // cxx98-14-error@-1 {{inline variables are a C++17 extension}}
-    template<> static inline int v2<U>;
-    // cxx98-14-error@-1 {{inline variables are a C++17 extension}}
-    // expected-error@-2 {{duplicate member 'v2'}}
-    // expected-note@#dr727-v2-T {{previous declaration is here}}
+    template<typename> static int v1; // expected-error 0-1{{C++14 extension}}
+    template<> static int v1<T>; // expected-note {{previous}}
+    template<> static int v1<U>; // expected-error {{duplicate member}}
+
+    template<typename> static inline int v2; // expected-error 0-1{{C++17 extension}} expected-error 0-1{{C++14 extension}}
+    template<> static inline int v2<T>; // expected-error 0-1{{C++17 extension}} expected-note {{previous}}
+    template<> static inline int v2<U>; // expected-error 0-1{{C++17 extension}} expected-error {{duplicate member}}
 
     // FIXME: Missing diagnostic for duplicate class explicit specialization.
     template<typename> struct S1;
@@ -289,12 +219,10 @@ namespace dr727 { // dr727: partial
     template<> struct S1<U>;
 
     template<typename> struct S2;
-    template<> struct S2<T> {}; // #dr727-S2-T
-    template<> struct S2<U> {};
-    // expected-error@-1 {{redefinition of 'S2'}}
-    // expected-note@#dr727-S2-T {{previous}}
+    template<> struct S2<T> {}; // expected-note {{previous}}
+    template<> struct S2<U> {}; // expected-error {{redefinition}}
   };
-  Collision<int, int> c; // #dr727-Collision-int-int
+  Collision<int, int> c; // expected-note {{in instantiation of}}
 }
 
 namespace dr777 { // dr777: 3.7
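[Editorial note, not part of the patch] The dr727 hunks above all revolve around one allowance: an explicit specialization may be declared in class scope, inside the class that owns the primary template. Reduced to its simplest accepted form, mirroring the patterns in the test:

struct S {
  template <typename T> struct C {};
  template <> struct C<int> {};  // class-scope explicit specialization (CWG727)

  template <typename T> void f() {}
  template <> void f<int>() {}   // likewise for a member function template
};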
diff --git a/clang/test/CXX/drs/dr8xx.cpp b/clang/test/CXX/drs/dr8xx.cpp
index b031e65095cdb..11948d0410eb5 100644
--- a/clang/test/CXX/drs/dr8xx.cpp
+++ b/clang/test/CXX/drs/dr8xx.cpp
@@ -1,27 +1,30 @@
-// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s
 
-#if __cplusplus == 199711L
 // expected-no-diagnostics
-#endif
 
 namespace dr873 { // dr873: 3.0
 #if __cplusplus >= 201103L
 template <typename T> void f(T &&);
-template <> void f(int &) = delete; // #dr873-lvalue-ref
-template <> void f(int &&) = delete; // #dr873-rvalue-ref
+template <> void f(int &) {}  // #1
+template <> void f(int &&) {} // #2
 
 void g(int i) {
-  f(i); // calls f(int&)
-  // since-cxx11-error@-1 {{call to deleted function 'f'}}
-  // since-cxx11-note@#dr873-lvalue-ref {{candidate function [with T = int &] has been implicitly deleted}}
-  f(0); // calls f(int&&)
-  // since-cxx11-error@-1 {{call to deleted function 'f'}}
-  // since-cxx11-note@#dr873-rvalue-ref {{candidate function [with T = int] has been implicitly deleted}}
+  f(i); // calls f(int&), i.e., #1
+#pragma clang __debug dump f(i)
+  // CHECK: CallExpr {{.*}}
+  // CHECK-NEXT: |-ImplicitCastExpr {{.*}}
+  // CHECK-NEXT: | `-DeclRefExpr {{.*}} 'f' 'void (int &)' {{.*}}
+
+  f(0); // calls f(int&&), i.e., #2
+#pragma clang __debug dump f(0)
+  // CHECK: CallExpr {{.*}}
+  // CHECK-NEXT: |-ImplicitCastExpr {{.*}}
+  // CHECK-NEXT: | `-DeclRefExpr {{.*}} 'f' 'void (int &&)' {{.*}}
 }
 #endif
 } // namespace dr873
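[Editorial note, not part of the patch] The dr873 hunk swaps deleted specializations for definitions plus AST dumps, but the deduction being checked is unchanged: with `template <typename T> void f(T&&)`, an lvalue argument deduces T = int& (so T&& collapses to int&), while a prvalue deduces T = int (so T&& is int&&). In isolation:

template <typename T> void fwd(T &&);

void call(int i) {
  fwd(i); // T = int&  -> matches the f(int&) specialization, i.e. #1 above
  fwd(0); // T = int   -> matches the f(int&&) specialization, i.e. #2 above
}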
diff --git a/clang/test/CXX/drs/dr9xx.cpp b/clang/test/CXX/drs/dr9xx.cpp
index 7ea295a684b7a..e526582292480 100644
--- a/clang/test/CXX/drs/dr9xx.cpp
+++ b/clang/test/CXX/drs/dr9xx.cpp
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -std=c++98 %s -verify=expected -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++11 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++14 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++17 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
-// RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++98 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++11 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++14 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++17 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++20 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++23 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
 
 namespace std {
   __extension__ typedef __SIZE_TYPE__ size_t;
@@ -46,31 +46,27 @@ namespace dr948 { // dr948: 3.7
 namespace dr952 { // dr952: 2.8
 namespace example1 {
 struct A {
-  typedef int I; // #dr952-I
+  typedef int I; // #dr952-typedef-decl
 };
-struct B : private A { // #dr952-B
+struct B : private A { // #dr952-inheritance
 };
 struct C : B {
   void f() {
-    I i1;
-    // expected-error@-1 {{'I' is a private member of 'dr952::example1::A'}}
-    // expected-note@#dr952-B {{constrained by private inheritance here}}
-    // expected-note@#dr952-I {{member is declared here}}
+    I i1; // expected-error {{private member}}
+    // expected-note@#dr952-inheritance {{constrained by private inheritance}}
+    // expected-note@#dr952-typedef-decl {{declared here}}
   }
-  I i2;
-  // expected-error@-1 {{'I' is a private member of 'dr952::example1::A'}}
-  // expected-note@#dr952-B {{constrained by private inheritance here}}
-  // expected-note@#dr952-I {{member is declared here}}
+  I i2; // expected-error {{private member}}
+  // expected-note@#dr952-inheritance {{constrained by private inheritance}}
+  // expected-note@#dr952-typedef-decl {{declared here}}
   struct D {
-    I i3;
-    // expected-error@-1 {{'I' is a private member of 'dr952::example1::A'}}
-    // expected-note@#dr952-B {{constrained by private inheritance here}}
-    // expected-note@#dr952-I {{member is declared here}}
+    I i3; // expected-error {{private member}}
+    // expected-note@#dr952-inheritance {{constrained by private inheritance}}
+    // expected-note@#dr952-typedef-decl {{declared here}}
    void g() {
-      I i4;
-      // expected-error@-1 {{'I' is a private member of 'dr952::example1::A'}}
-      // expected-note@#dr952-B {{constrained by private inheritance here}}
-      // expected-note@#dr952-I {{member is declared here}}
+      I i4; // expected-error {{private member}}
+      // expected-note@#dr952-inheritance {{constrained by private inheritance}}
+      // expected-note@#dr952-typedef-decl {{declared here}}
    }
  };
 };
@@ -95,10 +91,10 @@ namespace dr974 { // dr974: yes
 }
 
 namespace dr977 { // dr977: yes
-enum E { e = E() }; // #dr977-E
+enum E { e = E() };
 #if !defined(_WIN32) || defined(__MINGW32__)
-// expected-error@#dr977-E {{invalid use of incomplete type 'E'}}
-// expected-note@#dr977-E {{definition of 'dr977::E' is not complete until the closing '}'}}
+// expected-error@-2 {{invalid use of incomplete type 'E'}}
+// expected-note@-3 {{definition of 'dr977::E' is not complete until the closing '}'}}
 #endif
 #if __cplusplus >= 201103L
 enum E2 : int { e2 = E2() };
 enum struct E4 : int { e = static_cast<int>(E4()) };
 
 namespace dr990 { // dr990: 3.5
@@ -109,29 +105,23 @@ namespace dr990 { // dr990: 3.5
 #if __cplusplus >= 201103L
-  struct A { // #dr990-A
-    A(std::initializer_list<int>); // #dr990-A-init-list
+  struct A { // expected-note 2{{candidate}}
+    A(std::initializer_list<int>); // expected-note {{candidate}}
   };
   struct B {
     A a;
   };
   B b1 { };
-  B b2 { 1 };
-  // since-cxx11-error@-1 {{no viable conversion from 'int' to 'A'}}
-  // since-cxx11-note@#dr990-A {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const A &' for 1st argument}}
-  // since-cxx11-note@#dr990-A {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int' to 'A &&' for 1st argument}}
-  // since-cxx11-note@#dr990-A-init-list {{candidate constructor not viable: no known conversion from 'int' to 'std::initializer_list<int>' for 1st argument}}
+  B b2 { 1 }; // expected-error {{no viable conversion from 'int' to 'A'}}
   B b3 { { 1 } };
 
   struct C {
     C();
     C(int);
-    C(std::initializer_list<int>) = delete; // #dr990-deleted
+    C(std::initializer_list<int>) = delete; // expected-note {{here}}
   };
   C c1[3] { 1 }; // ok
-  C c2[3] { 1, {2} };
-  // since-cxx11-error@-1 {{call to deleted constructor of 'C'}}
-  // since-cxx11-note@#dr990-deleted {{'C' has been explicitly marked deleted here}}
+  C c2[3] { 1, {2} }; // expected-error {{call to deleted}}
 
   struct D {
     D();
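[Editorial note, not part of the patch] dr990's point, restated outside the test: in aggregate initialization, an empty or nested braced initializer for a member can select that member's initializer-list constructor.

#include <initializer_list>

struct A { A(std::initializer_list<int>); };
struct B { A a; };

B b1{};    // OK: 'a' is initialized from an empty initializer list
B b3{{1}}; // OK: the inner braces feed A's initializer-list constructor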
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
deleted file mode 100644
index 82b30e8bbe9b9..0000000000000
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
+++ /dev/null
@@ -1,47 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-
-// REQUIRES: aarch64-registered-target
-
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-
-#include <arm_sme.h>
-
-#ifdef SVE_OVERLOADED_FORMS
-// A simple used,unused... macro, long enough to represent any SVE builtin.
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
-#else
-#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
-#endif
-
-// CHECK-LABEL: @test_svreinterpret_svbool_svcnt(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[CNT:%.*]])
-// CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
-//
-// CPP-CHECK-LABEL: @_Z31test_svreinterpret_svbool_svcntu11__SVCount_t(
-// CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.taarch64.svcountt(target("aarch64.svcount") [[CNT:%.*]])
-// CPP-CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP0]]
-//
-svbool_t test_svreinterpret_svbool_svcnt(svcount_t cnt) __arm_streaming_compatible
-{
-  return SVE_ACLE_FUNC(svreinterpret,_b,,)(cnt);
-}
-
-// CHECK-LABEL: @test_svreinterpret_svcnt_svbool(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt(<vscale x 16 x i1> [[PG:%.*]])
-// CHECK-NEXT:    ret target("aarch64.svcount") [[TMP0]]
-//
-// CPP-CHECK-LABEL: @_Z31test_svreinterpret_svcnt_svboolu10__SVBool_t(
-// CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call target("aarch64.svcount") @llvm.aarch64.sve.convert.from.svbool.taarch64.svcountt(<vscale x 16 x i1> [[PG:%.*]])
-// CPP-CHECK-NEXT:    ret target("aarch64.svcount") [[TMP0]]
-//
-svcount_t test_svreinterpret_svcnt_svbool(svbool_t pg) __arm_streaming_compatible
-{
-  return SVE_ACLE_FUNC(svreinterpret,_c,,)(pg);
-}
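[Editorial note, not part of the patch] All of the deleted SVE tests below share the SVE_ACLE_FUNC token-pasting trick quoted above: with SVE_OVERLOADED_FORMS defined, only the first and third arguments are glued together (yielding the overloaded ACLE name), otherwise all four are (yielding the fully suffixed name). A sketch of the expansion; the _OVERLOADED/_EXPLICIT names here are hypothetical renamings for side-by-side comparison:

#define SVE_ACLE_FUNC_OVERLOADED(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
#define SVE_ACLE_FUNC_EXPLICIT(A1,A2,A3,A4) A1##A2##A3##A4

// SVE_ACLE_FUNC(svcreate2,_b8,,) therefore expands to:
//   overloaded build: svcreate2     (argument types select the overload)
//   explicit build:   svcreate2_b8  (fully suffixed builtin name)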
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svcreate2_s8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[X0:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP0]], [[X1:%.*]], i64 16) -// CHECK-NEXT: ret [[TMP1]] -// -// CPP-CHECK-LABEL: @_Z17test_svcreate2_s8u10__SVBool_tS_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( poison, [[X0:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP0]], [[X1:%.*]], i64 16) -// CPP-CHECK-NEXT: ret [[TMP1]] -// -svboolx2_t test_svcreate2_s8(svbool_t x0, svbool_t x1) -{ - return SVE_ACLE_FUNC(svcreate2,_b8,,)(x0, x1); -} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_create4_bool.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_create4_bool.c deleted file mode 100644 index 3f99ee6e79c07..0000000000000 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_create4_bool.c +++ /dev/null @@ -1,42 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -// REQUIRES: aarch64-registered-target - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svcreate4_b8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( poison, [[X0:%.*]], i64 0) -// CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TMP0]], [[X1:%.*]], i64 16) -// CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TMP1]], [[X2:%.*]], i64 32) -// CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TMP2]], [[X4:%.*]], i64 48) -// CHECK-NEXT: ret [[TMP3]] -// -// CPP-CHECK-LABEL: @_Z17test_svcreate4_b8u10__SVBool_tS_S_S_( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( poison, [[X0:%.*]], i64 0) -// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TMP0]], [[X1:%.*]], i64 16) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TMP1]], [[X2:%.*]], i64 32) -// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.insert.nxv64i1.nxv16i1( [[TMP2]], [[X4:%.*]], i64 48) -// CPP-CHECK-NEXT: ret [[TMP3]] -// -svboolx4_t test_svcreate4_b8(svbool_t x0, svbool_t x1, svbool_t x2, svbool_t x4) -{ - return SVE_ACLE_FUNC(svcreate4,_b8,,)(x0, x1, x2, x4); -} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get2_bool.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get2_bool.c deleted file mode 100644 index 5d38f72b34b16..0000000000000 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get2_bool.c +++ /dev/null @@ -1,49 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// REQUIRES: aarch64-registered-target -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svget2_b8_0( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[TUPLE:%.*]], i64 0) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svget2_b8_010svboolx2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[TUPLE:%.*]], i64 0) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbool_t test_svget2_b8_0(svboolx2_t tuple) -{ - return SVE_ACLE_FUNC(svget2,_b8,,)(tuple, 0); -} - -// CHECK-LABEL: @test_svget2_b8_1( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[TUPLE:%.*]], i64 16) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svget2_b8_110svboolx2_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv32i1( [[TUPLE:%.*]], i64 16) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbool_t test_svget2_b8_1(svboolx2_t tuple) -{ - return SVE_ACLE_FUNC(svget2,_b8,,)(tuple, 1); -} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get4_bool.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get4_bool.c deleted file mode 100644 index 821a5be3254eb..0000000000000 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_get4_bool.c +++ /dev/null @@ -1,72 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -// REQUIRES: aarch64-registered-target - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. -#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// NOTE: For these tests clang converts the struct parameter into -// several parameters, one for each member of the original struct. 
-// CHECK-LABEL: @test_svget4_b8_0( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[TUPLE:%.*]], i64 0) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svget4_b8_010svboolx4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[TUPLE:%.*]], i64 0) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbool_t test_svget4_b8_0(svboolx4_t tuple) -{ - return SVE_ACLE_FUNC(svget4,_b8,,)(tuple, 0); -} - -// NOTE: For these tests clang converts the struct parameter into -// several parameters, one for each member of the original struct. -// CHECK-LABEL: @test_svget4_b8_1( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[TUPLE:%.*]], i64 16) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svget4_b8_110svboolx4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[TUPLE:%.*]], i64 16) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbool_t test_svget4_b8_1(svboolx4_t tuple) -{ - return SVE_ACLE_FUNC(svget4,_b8,,)(tuple, 1); -} - -// NOTE: For these tests clang converts the struct parameter into -// several parameters, one for each member of the original struct. -// CHECK-LABEL: @test_svget4_b8_3( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[TUPLE:%.*]], i64 48) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svget4_b8_310svboolx4_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv16i1.nxv64i1( [[TUPLE:%.*]], i64 48) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svbool_t test_svget4_b8_3(svboolx4_t tuple) -{ - return SVE_ACLE_FUNC(svget4,_b8,,)(tuple, 3); -} diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_set2_bool.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_set2_bool.c deleted file mode 100644 index 298e5b80ceb00..0000000000000 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_set2_bool.c +++ /dev/null @@ -1,52 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s - -// REQUIRES: aarch64-registered-target - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - -// CHECK-LABEL: @test_svset2_b8_0( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TUPLE:%.*]], [[X:%.*]], i64 0) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svset2_b8_010svboolx2_tu10__SVBool_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TUPLE:%.*]], [[X:%.*]], i64 0) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svboolx2_t test_svset2_b8_0(svboolx2_t tuple, svbool_t x) -{ - return SVE_ACLE_FUNC(svset2,_b8,,)(tuple, 0, x); -} - -// CHECK-LABEL: @test_svset2_b8_1( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TUPLE:%.*]], [[X:%.*]], i64 16) -// CHECK-NEXT: ret [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svset2_b8_110svboolx2_tu10__SVBool_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TUPLE:%.*]], [[X:%.*]], i64 16) -// CPP-CHECK-NEXT: ret [[TMP0]] -// -svboolx2_t test_svset2_b8_1(svboolx2_t tuple, svbool_t x) -{ - return SVE_ACLE_FUNC(svset2,_b8,,)(tuple, 1, x); -} - diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_set4_bool.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_set4_bool.c deleted file mode 100644 index 746a8280ac53f..0000000000000 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_set4_bool.c +++ /dev/null @@ -1,66 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s \ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s\ -// RUN: | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s -// REQUIRES: aarch64-registered-target - -#include - -#ifdef SVE_OVERLOADED_FORMS -// A simple used,unused... macro, long enough to represent any SVE builtin. 
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 -#else -#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 -#endif - - -// CHECK-LABEL: @test_svset4_b8_0( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 0) -// CHECK-NEXT: ret <vscale x 64 x i1> [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svset4_b8_010svboolx4_tu10__SVBool_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 0) -// CPP-CHECK-NEXT: ret <vscale x 64 x i1> [[TMP0]] -// -svboolx4_t test_svset4_b8_0(svboolx4_t tuple, svbool_t x) -{ - return SVE_ACLE_FUNC(svset4,_b8,,)(tuple, 0, x); -} - -// CHECK-LABEL: @test_svset4_b8_1( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 16) -// CHECK-NEXT: ret <vscale x 64 x i1> [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svset4_b8_110svboolx4_tu10__SVBool_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 16) -// CPP-CHECK-NEXT: ret <vscale x 64 x i1> [[TMP0]] -// -svboolx4_t test_svset4_b8_1(svboolx4_t tuple, svbool_t x) -{ - return SVE_ACLE_FUNC(svset4,_b8,,)(tuple, 1, x); -} - -// CHECK-LABEL: @test_svset4_b8_3( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 48) -// CHECK-NEXT: ret <vscale x 64 x i1> [[TMP0]] -// -// CPP-CHECK-LABEL: @_Z16test_svset4_b8_310svboolx4_tu10__SVBool_t( -// CPP-CHECK-NEXT: entry: -// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 64 x i1> @llvm.vector.insert.nxv64i1.nxv16i1(<vscale x 64 x i1> [[TUPLE:%.*]], <vscale x 16 x i1> [[X:%.*]], i64 48) -// CPP-CHECK-NEXT: ret <vscale x 64 x i1> [[TMP0]] -// -svboolx4_t test_svset4_b8_3(svboolx4_t tuple, svbool_t x) -{ - return SVE_ACLE_FUNC(svset4,_b8,,)(tuple, 3, x); -} diff --git a/clang/test/CodeGen/arm-target-features.c b/clang/test/CodeGen/arm-target-features.c index ad4bfd45c408b..bd051059fb0cc 100644 --- a/clang/test/CodeGen/arm-target-features.c +++ b/clang/test/CodeGen/arm-target-features.c @@ -113,9 +113,6 @@ // RUN: %clang_cc1 -triple thumb-linux-gnueabi -target-cpu cortex-m85 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV81M-CORTEX-M85-LINUX // CHECK-ARMV81M-CORTEX-M85-LINUX: "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+pacbti,+ras,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp" -// RUN: %clang_cc1 -triple thumb-linux-gnueabi -target-cpu cortex-m52 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV81M-CORTEX-M52-LINUX -// CHECK-ARMV81M-CORTEX-M52-LINUX: "target-features"="+armv8.1-m.main,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve,+mve.fp,+pacbti,+ras,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp" - // RUN: %clang_cc1 -triple thumbv9.3a-linux-gnueabihf -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARCH93 // CHECK-ARCH93: "target-features"="+armv9.3-a,+thumb-mode,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8.7a,+v8.8a,+v9.1a,+v9.2a,+v9.3a,+v9a" diff --git a/clang/test/CodeGen/tbaa.c b/clang/test/CodeGen/tbaa.c deleted file mode 100644 index 0ab81f60a7194..0000000000000 --- a/clang/test/CodeGen/tbaa.c +++ /dev/null @@ -1,116 +0,0 @@ -// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s
-check-prefixes=PATH -// RUN: %clang_cc1 -triple x86_64-apple-darwin -O0 -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA -// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -relaxed-aliasing -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA -// Test TBAA metadata generated by front-end. -// -// NO-TBAA-NOT: !tbaa - -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; -typedef unsigned long long uint64_t; - -typedef enum { - RED_AUTO_32, - GREEN_AUTO_32, - BLUE_AUTO_32 -} EnumAuto32; - -typedef enum { - RED_AUTO_64, - GREEN_AUTO_64, - BLUE_AUTO_64 = 0x100000000ull -} EnumAuto64; - -typedef enum : uint16_t { - RED_16, - GREEN_16, - BLUE_16 -} Enum16; - -typedef enum : uint8_t { - RED_8, - GREEN_8, - BLUE_8 -} Enum8; - -uint32_t g0(EnumAuto32 *E, uint32_t *val) { -// CHECK-LABEL: define{{.*}} i32 @g0( -// CHECK: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]] -// CHECK: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]] -// CHECK: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]] -// PATH-LABEL: define{{.*}} i32 @g0( -// PATH: store i32 5, ptr %{{.*}}, align 4, !tbaa [[TAG_i32:!.*]] -// PATH: store i32 0, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]] -// PATH: load i32, ptr %{{.*}}, align 4, !tbaa [[TAG_i32]] - *val = 5; - *E = RED_AUTO_32; - return *val; -} - -uint64_t g1(EnumAuto64 *E, uint64_t *val) { -// CHECK-LABEL: define{{.*}} i64 @g1( -// CHECK: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]] -// CHECK: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]] -// CHECK: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]] -// PATH-LABEL: define{{.*}} i64 @g1( -// PATH: store i64 5, ptr %{{.*}}, align 8, !tbaa [[TAG_i64:!.*]] -// PATH: store i64 0, ptr %{{.*}}, align 8, !tbaa [[TAG_long:!.*]] -// PATH: load i64, ptr %{{.*}}, align 8, !tbaa [[TAG_i64]] - *val = 5; - *E = RED_AUTO_64; - return *val; -} - -uint16_t g2(Enum16 *E, uint16_t *val) { -// CHECK-LABEL: define{{.*}} i16 @g2( -// CHECK: store i16 5, ptr %{{.*}}, align 2, !tbaa [[TAG_i16:!.*]] -// CHECK: store i16 0, ptr %{{.*}}, align 2, !tbaa [[TAG_i16]] -// CHECK: load i16, ptr %{{.*}}, align 2, !tbaa [[TAG_i16]] -// PATH-LABEL: define{{.*}} i16 @g2( -// PATH: store i16 5, ptr %{{.*}}, align 2, !tbaa [[TAG_i16:!.*]] -// PATH: store i16 0, ptr %{{.*}}, align 2, !tbaa [[TAG_i16]] -// PATH: load i16, ptr %{{.*}}, align 2, !tbaa [[TAG_i16]] - *val = 5; - *E = RED_16; - return *val; -} - -uint8_t g3(Enum8 *E, uint8_t *val) { -// CHECK-LABEL: define{{.*}} i8 @g3( -// CHECK: store i8 5, ptr %{{.*}}, align 1, !tbaa [[TAG_i8:!.*]] -// CHECK: store i8 0, ptr %{{.*}}, align 1, !tbaa [[TAG_i8]] -// CHECK: load i8, ptr %{{.*}}, align 1, !tbaa [[TAG_i8]] -// PATH-LABEL: define{{.*}} i8 @g3( -// PATH: store i8 5, ptr %{{.*}}, align 1, !tbaa [[TAG_i8:!.*]] -// PATH: store i8 0, ptr %{{.*}}, align 1, !tbaa [[TAG_i8]] -// PATH: load i8, ptr %{{.*}}, align 1, !tbaa [[TAG_i8]] - *val = 5; - *E = RED_8; - return *val; -} - -// CHECK: [[TYPE_char:!.*]] = !{!"omnipotent char", [[TAG_c_tbaa:!.*]], -// CHECK: [[TAG_c_tbaa]] = !{!"Simple C/C++ TBAA"} -// CHECK: [[TAG_i32]] = !{[[TYPE_i32:!.*]], [[TYPE_i32]], i64 0} -// CHECK: [[TYPE_i32]] = !{!"int", [[TYPE_char]], -// CHECK: [[TAG_i64]] = !{[[TYPE_i64:!.*]], [[TYPE_i64]], i64 0} -// CHECK: [[TYPE_i64]] = !{!"long long", [[TYPE_char]], -// CHECK: [[TAG_long]] = !{[[TYPE_long:!.*]], [[TYPE_long]], i64 0} -// CHECK: [[TYPE_long]] = !{!"long", [[TYPE_char]], -// CHECK: [[TAG_i16]] = 
!{[[TYPE_i16:!.*]], [[TYPE_i16]], i64 0} -// CHECK: [[TYPE_i16]] = !{!"short", [[TYPE_char]], -// CHECK: [[TAG_i8]] = !{[[TYPE_i8:!.*]], [[TYPE_char]], i64 0} - -// PATH: [[TYPE_char:!.*]] = !{!"omnipotent char", [[TAG_c_tbaa:!.*]], -// PATH: [[TAG_c_tbaa]] = !{!"Simple C/C++ TBAA"} -// PATH: [[TAG_i32]] = !{[[TYPE_i32:!.*]], [[TYPE_i32]], i64 0} -// PATH: [[TYPE_i32]] = !{!"int", [[TYPE_char]], -// PATH: [[TAG_i64]] = !{[[TYPE_i64:!.*]], [[TYPE_i64]], i64 0} -// PATH: [[TYPE_i64]] = !{!"long long", [[TYPE_char]], -// PATH: [[TAG_long]] = !{[[TYPE_long:!.*]], [[TYPE_long]], i64 0} -// PATH: [[TYPE_long]] = !{!"long", [[TYPE_char]], -// PATH: [[TAG_i16]] = !{[[TYPE_i16:!.*]], [[TYPE_i16]], i64 0} -// PATH: [[TYPE_i16]] = !{!"short", [[TYPE_char]], -// PATH: [[TAG_i8]] = !{[[TYPE_i8:!.*]], [[TYPE_char]], i64 0} diff --git a/clang/test/Driver/arm-cortex-cpus-2.c b/clang/test/Driver/arm-cortex-cpus-2.c index 4bf2b3a50412d..5b52488bc498f 100644 --- a/clang/test/Driver/arm-cortex-cpus-2.c +++ b/clang/test/Driver/arm-cortex-cpus-2.c @@ -562,9 +562,6 @@ // RUN: %clang -target arm -mcpu=cortex-m85 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CORTEX-M85 %s // CHECK-CORTEX-M85: "-cc1"{{.*}} "-triple" "thumbv8.1m.main-{{.*}} "-target-cpu" "cortex-m85" -// RUN: %clang -target arm -mcpu=cortex-m52 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CORTEX-M52 %s -// CHECK-CORTEX-M52: "-cc1"{{.*}} "-triple" "thumbv8.1m.main-{{.*}} "-target-cpu" "cortex-m52" - // RUN: %clang -target arm -mcpu=neoverse-n2 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NEOVERSE-N2 %s // CHECK-NEOVERSE-N2: "-cc1"{{.*}} "-triple" "armv8.5a-{{.*}}" "-target-cpu" "neoverse-n2" diff --git a/clang/test/Driver/hip-offload-compress-zlib.hip b/clang/test/Driver/hip-offload-compress-zlib.hip index 7557fdde8786c..a29b6d037350d 100644 --- a/clang/test/Driver/hip-offload-compress-zlib.hip +++ b/clang/test/Driver/hip-offload-compress-zlib.hip @@ -4,13 +4,13 @@ // Test compress bundled bitcode. -// RUN: rm -rf %t.bc +// RUN: rm -rf %T/a.bc // RUN: %clang -c -v --target=x86_64-linux-gnu \ // RUN: -x hip --offload-arch=gfx1100 --offload-arch=gfx1101 \ // RUN: -fgpu-rdc -nogpuinc -nogpulib \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ // RUN: --offload-compress --offload-device-only --gpu-bundle-output \ -// RUN: -o %t.bc \ +// RUN: -o %T/a.bc \ // RUN: 2>&1 | FileCheck %s // CHECK: clang-offload-bundler{{.*}} -type=bc @@ -23,7 +23,7 @@ // RUN: %clang --hip-link -### -v --target=x86_64-linux-gnu \ // RUN: --offload-arch=gfx1100 --offload-arch=gfx1101 \ // RUN: -fgpu-rdc -nogpulib \ -// RUN: %t.bc --offload-device-only \ +// RUN: %T/a.bc --offload-device-only \ // RUN: 2>&1 | FileCheck -check-prefix=UNBUNDLE %s // UNBUNDLE: clang-offload-bundler{{.*}} "-type=bc" diff --git a/clang/test/Driver/hip-offload-compress-zstd.hip b/clang/test/Driver/hip-offload-compress-zstd.hip index 3680ae47974a6..688c2c85329c1 100644 --- a/clang/test/Driver/hip-offload-compress-zstd.hip +++ b/clang/test/Driver/hip-offload-compress-zstd.hip @@ -4,13 +4,13 @@ // Test compress bundled bitcode. 
-// RUN: rm -rf %t.bc +// RUN: rm -rf %T/a.bc // RUN: %clang -c -v --target=x86_64-linux-gnu \ // RUN: -x hip --offload-arch=gfx1100 --offload-arch=gfx1101 \ // RUN: -fgpu-rdc -nogpuinc -nogpulib \ // RUN: %S/Inputs/hip_multiple_inputs/a.cu \ // RUN: --offload-compress --offload-device-only --gpu-bundle-output \ -// RUN: -o %t.bc \ +// RUN: -o %T/a.bc \ // RUN: 2>&1 | FileCheck %s // CHECK: clang-offload-bundler{{.*}} -type=bc @@ -23,7 +23,7 @@ // RUN: %clang --hip-link -### -v --target=x86_64-linux-gnu \ // RUN: --offload-arch=gfx1100 --offload-arch=gfx1101 \ // RUN: -fgpu-rdc -nogpulib \ -// RUN: %t.bc --offload-device-only \ +// RUN: %T/a.bc --offload-device-only \ // RUN: 2>&1 | FileCheck -check-prefix=UNBUNDLE %s // UNBUNDLE: clang-offload-bundler{{.*}} "-type=bc" diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c index e8c5ecbcb5363..c7146e63add5f 100644 --- a/clang/test/Misc/target-invalid-cpu-note.c +++ b/clang/test/Misc/target-invalid-cpu-note.c @@ -1,7 +1,7 @@ // Use CHECK-NEXT instead of multiple CHECK-SAME to ensure we will fail if there is anything extra in the output. // RUN: not %clang_cc1 -triple armv5--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix ARM // ARM: error: unknown target CPU 'not-a-cpu' -// ARM-NEXT: note: valid target CPU values are: arm8, arm810, strongarm, strongarm110, strongarm1100, strongarm1110, arm7tdmi, arm7tdmi-s, arm710t, arm720t, arm9, arm9tdmi, arm920, arm920t, arm922t, arm940t, ep9312, arm10tdmi, arm1020t, arm9e, arm946e-s, arm966e-s, arm968e-s, arm10e, arm1020e, arm1022e, arm926ej-s, arm1136j-s, arm1136jf-s, mpcore, mpcorenovfp, arm1176jz-s, arm1176jzf-s, arm1156t2-s, arm1156t2f-s, cortex-m0, cortex-m0plus, cortex-m1, sc000, cortex-a5, cortex-a7, cortex-a8, cortex-a9, cortex-a12, cortex-a15, cortex-a17, krait, cortex-r4, cortex-r4f, cortex-r5, cortex-r7, cortex-r8, cortex-r52, sc300, cortex-m3, cortex-m4, cortex-m7, cortex-m23, cortex-m33, cortex-m35p, cortex-m55, cortex-m85, cortex-m52, cortex-a32, cortex-a35, cortex-a53, cortex-a55, cortex-a57, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-x1, cortex-x1c, neoverse-n1, neoverse-n2, neoverse-v1, cyclone, exynos-m3, exynos-m4, exynos-m5, kryo, iwmmxt, xscale, swift{{$}} +// ARM-NEXT: note: valid target CPU values are: arm8, arm810, strongarm, strongarm110, strongarm1100, strongarm1110, arm7tdmi, arm7tdmi-s, arm710t, arm720t, arm9, arm9tdmi, arm920, arm920t, arm922t, arm940t, ep9312, arm10tdmi, arm1020t, arm9e, arm946e-s, arm966e-s, arm968e-s, arm10e, arm1020e, arm1022e, arm926ej-s, arm1136j-s, arm1136jf-s, mpcore, mpcorenovfp, arm1176jz-s, arm1176jzf-s, arm1156t2-s, arm1156t2f-s, cortex-m0, cortex-m0plus, cortex-m1, sc000, cortex-a5, cortex-a7, cortex-a8, cortex-a9, cortex-a12, cortex-a15, cortex-a17, krait, cortex-r4, cortex-r4f, cortex-r5, cortex-r7, cortex-r8, cortex-r52, sc300, cortex-m3, cortex-m4, cortex-m7, cortex-m23, cortex-m33, cortex-m35p, cortex-m55, cortex-m85, cortex-a32, cortex-a35, cortex-a53, cortex-a55, cortex-a57, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-x1, cortex-x1c, neoverse-n1, neoverse-n2, neoverse-v1, cyclone, exynos-m3, exynos-m4, exynos-m5, kryo, iwmmxt, xscale, swift{{$}} // RUN: not %clang_cc1 -triple arm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AARCH64 // AARCH64: error: unknown target CPU 'not-a-cpu' diff 
--git a/clang/test/ParserOpenACC/parse-constructs.c b/clang/test/ParserOpenACC/parse-constructs.c index 83d9bd6070d41..b745f54bd715c 100644 --- a/clang/test/ParserOpenACC/parse-constructs.c +++ b/clang/test/ParserOpenACC/parse-constructs.c @@ -7,17 +7,6 @@ void func() { #pragma acc for(;;){} - // expected-error@+4{{expected OpenACC directive}} - // expected-error@+3{{expected clause-list or newline in OpenACC directive}} - // expected-warning@+2{{OpenACC clause parsing not yet implemented}} - // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} -#pragma acc(whatever) routine - - // expected-error@+3{{expected OpenACC directive}} - // expected-warning@+2{{OpenACC clause parsing not yet implemented}} - // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} -#pragma acc) routine - // expected-error@+2{{invalid OpenACC directive 'invalid'}} // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} #pragma acc invalid diff --git a/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp b/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp index f90ae0fb50e5f..35bf99bfcf4c5 100644 --- a/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp +++ b/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp @@ -167,21 +167,3 @@ void test_svpmov_lane(){ zn_u32 = svpmov_lane_u32_m(zn_u32, pn, 5); // expected-error {{argument value 5 is outside the valid range [1, 3]}} zn_u64 = svpmov_lane_u64_m(zn_u64, pn, 8); // expected-error {{argument value 8 is outside the valid range [1, 7]}} } - -__attribute__((target("+sve2p1"))) -void test_svget_svset_b(uint64_t idx, svboolx2_t tuple2, svboolx4_t tuple4, svbool_t res){ - svset2(tuple2, -1, res); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - svset2(tuple2, 2, res); // expected-error {{argument value 2 is outside the valid range [0, 1]}} - svset4(tuple4, -1, res); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - svset4(tuple4, 4, res); // expected-error {{argument value 4 is outside the valid range [0, 3]}} - - svget2(tuple2, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} - svget2(tuple2, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} - svget4(tuple4, -1); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} - svget4(tuple4, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} - - svset2(tuple2, idx, res); // expected-error {{argument to 'svset2' must be a constant integer}} - svset4(tuple4, idx, res); // expected-error {{argument to 'svset4' must be a constant integer}} - svget2(tuple2, idx); // expected-error {{argument to 'svget2' must be a constant integer}} - svget4(tuple4, idx); // expected-error {{argument to 'svget4' must be a constant integer}} -} diff --git a/clang/test/Sema/builtin-expect-with-probability.cpp b/clang/test/Sema/builtin-expect-with-probability.cpp index c55cde84b2548..2b72c7b27ae93 100644 --- a/clang/test/Sema/builtin-expect-with-probability.cpp +++ b/clang/test/Sema/builtin-expect-with-probability.cpp @@ -1,5 +1,4 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s -// RUN: %clang_cc1 -fsyntax-only -verify -fexperimental-new-constant-interpreter %s __attribute__((noreturn)) extern void bar(); diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index
17efa45b8667d..0b1e57e8f6c37 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -1132,13 +1132,8 @@ hlfir::genTypeAndKindConvert(mlir::Location loc, fir::FirOpBuilder &builder, std::optional<fir::KindTy> toKindCharConvert; if (auto toCharTy = mlir::dyn_cast<fir::CharacterType>(toType)) { if (auto fromCharTy = mlir::dyn_cast<fir::CharacterType>(fromType)) - if (toCharTy.getFKind() != fromCharTy.getFKind()) { + if (toCharTy.getFKind() != fromCharTy.getFKind()) toKindCharConvert = toCharTy.getFKind(); - // Preserve source length (padding/truncation will occur in assignment - // if needed). - toType = fir::CharacterType::get( - fromType.getContext(), toCharTy.getFKind(), fromCharTy.getLen()); - } // Do not convert in case of character length mismatch only, hlfir.assign // deals with it. if (!toKindCharConvert) diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 293208ce3b601..bf175c8ebadee 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -2387,9 +2387,6 @@ struct XArrayCoorOpConversion const bool baseIsBoxed = coor.getMemref().getType().isa<fir::BaseBoxType>(); TypePair baseBoxTyPair = baseIsBoxed ? getBoxTypePair(coor.getMemref().getType()) : TypePair{}; - mlir::LLVM::IntegerOverflowFlagsAttr nsw = - mlir::LLVM::IntegerOverflowFlagsAttr::get( - rewriter.getContext(), mlir::LLVM::IntegerOverflowFlags::nsw); // For each dimension of the array, generate the offset calculation. for (unsigned i = 0; i < rank; ++i, ++indexOffset, ++shapeOffset, @@ -2410,15 +2407,14 @@ struct XArrayCoorOpConversion if (normalSlice) step = integerCast(loc, rewriter, idxTy, operands[sliceOffset + 2]); } - auto idx = rewriter.create<mlir::LLVM::SubOp>(loc, idxTy, index, lb, nsw); + auto idx = rewriter.create<mlir::LLVM::SubOp>(loc, idxTy, index, lb); mlir::Value diff = - rewriter.create<mlir::LLVM::MulOp>(loc, idxTy, idx, step, nsw); + rewriter.create<mlir::LLVM::MulOp>(loc, idxTy, idx, step); if (normalSlice) { mlir::Value sliceLb = integerCast(loc, rewriter, idxTy, operands[sliceOffset]); - auto adj = - rewriter.create<mlir::LLVM::SubOp>(loc, idxTy, sliceLb, lb, nsw); - diff = rewriter.create<mlir::LLVM::AddOp>(loc, idxTy, diff, adj, nsw); + auto adj = rewriter.create<mlir::LLVM::SubOp>(loc, idxTy, sliceLb, lb); + diff = rewriter.create<mlir::LLVM::AddOp>(loc, idxTy, diff, adj); } // Update the offset given the stride and the zero based index `diff` // that was just computed. @@ -2426,21 +2422,17 @@ struct XArrayCoorOpConversion // Use stride in bytes from the descriptor. mlir::Value stride = getStrideFromBox(loc, baseBoxTyPair, operands[0], i, rewriter); - auto sc = - rewriter.create<mlir::LLVM::MulOp>(loc, idxTy, diff, stride, nsw); - offset = - rewriter.create<mlir::LLVM::AddOp>(loc, idxTy, sc, offset, nsw); + auto sc = rewriter.create<mlir::LLVM::MulOp>(loc, idxTy, diff, stride); + offset = rewriter.create<mlir::LLVM::AddOp>(loc, idxTy, sc, offset); } else { // Use stride computed at last iteration. - auto sc = - rewriter.create<mlir::LLVM::MulOp>(loc, idxTy, diff, prevExt, nsw); - offset = - rewriter.create<mlir::LLVM::AddOp>(loc, idxTy, sc, offset, nsw); + auto sc = rewriter.create<mlir::LLVM::MulOp>(loc, idxTy, diff, prevExt); + offset = rewriter.create<mlir::LLVM::AddOp>(loc, idxTy, sc, offset); // Compute next stride assuming contiguity of the base array // (in element number).
auto nextExt = integerCast(loc, rewriter, idxTy, operands[shapeOffset]); - prevExt = rewriter.create<mlir::LLVM::MulOp>(loc, idxTy, prevExt, - nextExt, nsw); + prevExt = + rewriter.create<mlir::LLVM::MulOp>(loc, idxTy, prevExt, nextExt); } } @@ -2499,8 +2491,8 @@ struct XArrayCoorOpConversion assert(coor.getLenParams().size() == 1); auto length = integerCast(loc, rewriter, idxTy, operands[coor.lenParamsOffset()]); - offset = rewriter.create<mlir::LLVM::MulOp>(loc, idxTy, offset, - length, nsw); + offset = + rewriter.create<mlir::LLVM::MulOp>(loc, idxTy, offset, length); } else { TODO(loc, "compute size of derived type with type parameters"); } @@ -2673,9 +2665,6 @@ struct CoordinateOpConversion auto cpnTy = fir::dyn_cast_ptrOrBoxEleTy(boxObjTy); mlir::Type llvmPtrTy = ::getLlvmPtrType(coor.getContext()); mlir::Type byteTy = ::getI8Type(coor.getContext()); - mlir::LLVM::IntegerOverflowFlagsAttr nsw = - mlir::LLVM::IntegerOverflowFlagsAttr::get( - rewriter.getContext(), mlir::LLVM::IntegerOverflowFlags::nsw); for (unsigned i = 1, last = operands.size(); i < last; ++i) { if (auto arrTy = cpnTy.dyn_cast<fir::SequenceType>()) { @@ -2691,9 +2680,9 @@ struct CoordinateOpConversion index < lastIndex; ++index) { mlir::Value stride = getStrideFromBox(loc, boxTyPair, operands[0], index - i, rewriter); - auto sc = rewriter.create<mlir::LLVM::MulOp>( - loc, idxTy, operands[index], stride, nsw); - off = rewriter.create<mlir::LLVM::AddOp>(loc, idxTy, sc, off, nsw); + auto sc = rewriter.create<mlir::LLVM::MulOp>(loc, idxTy, + operands[index], stride); + off = rewriter.create<mlir::LLVM::AddOp>(loc, idxTy, sc, off); } resultAddr = rewriter.create<mlir::LLVM::GEPOp>( loc, llvmPtrTy, byteTy, resultAddr, diff --git a/flang/test/Fir/array-coor.fir b/flang/test/Fir/array-coor.fir index a765670d20b28..738acd7dd91fd 100644 --- a/flang/test/Fir/array-coor.fir +++ b/flang/test/Fir/array-coor.fir @@ -9,12 +9,12 @@ func.func @array_coor_box_value(%29 : !fir.box>, } // CHECK-LABEL: define double @array_coor_box_value -// CHECK: %[[t3:.*]] = sub nsw i64 %{{.*}}, 1 -// CHECK: %[[t4:.*]] = mul nsw i64 %[[t3]], 1 +// CHECK: %[[t3:.*]] = sub i64 %{{.*}}, 1 +// CHECK: %[[t4:.*]] = mul i64 %[[t3]], 1 // CHECK: %[[t5:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i32 0, i32 2 // CHECK: %[[t6:.*]] = load i64, ptr %[[t5]] -// CHECK: %[[t7:.*]] = mul nsw i64 %[[t4]], %[[t6]] -// CHECK: %[[t8:.*]] = add nsw i64 %[[t7]], 0 +// CHECK: %[[t7:.*]] = mul i64 %[[t4]], %[[t6]] +// CHECK: %[[t8:.*]] = add i64 %[[t7]], 0 // CHECK: %[[t9:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 0 // CHECK: %[[t10:.*]] = load ptr, ptr %[[t9]] // CHECK: %[[t11:.*]] = getelementptr i8, ptr %[[t10]], i64 %[[t8]] @@ -36,8 +36,8 @@ func.func private @take_int(%arg0: !fir.ref) -> () // CHECK-SAME: ptr %[[VAL_0:.*]]) // CHECK: %[[VAL_1:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[VAL_0]], i32 0, i32 7, i32 0, i32 2 // CHECK: %[[VAL_2:.*]] = load i64, ptr %[[VAL_1]] -// CHECK: %[[VAL_3:.*]] = mul nsw i64 1, %[[VAL_2]] -// CHECK: %[[VAL_4:.*]] = add nsw i64 %[[VAL_3]], 0 +// CHECK: %[[VAL_3:.*]] = mul i64 1, %[[VAL_2]] +// CHECK: %[[VAL_4:.*]] = add i64 %[[VAL_3]], 0 // CHECK: %[[VAL_5:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]], ptr, [1 x i64] }, ptr %[[VAL_0]], i32 0, i32 0 // CHECK: %[[VAL_6:.*]] = load ptr, ptr %[[VAL_5]] // CHECK: %[[VAL_7:.*]] = getelementptr i8, ptr %[[VAL_6]], i64 %[[VAL_4]] diff --git a/flang/test/Fir/arrexp.fir b/flang/test/Fir/arrexp.fir index 5d265a5e3a08d..87a2763608250 100644 --- a/flang/test/Fir/arrexp.fir +++ b/flang/test/Fir/arrexp.fir
@@ -114,8 +114,8 @@ func.func @f5(%arg0: !fir.box>, %arg1: !fir.box (!fir.array) { // CHECK: %[[B_STRIDE_GEP:.*]] = getelementptr {{.*}}, ptr %[[B]], i32 0, i32 7, i32 0, i32 2 // CHECK: %[[B_STRIDE:.*]] = load i64, ptr %[[B_STRIDE_GEP]] - // CHECK: %[[B_DIM_OFFSET:.*]] = mul nsw i64 %{{.*}}, %[[B_STRIDE]] - // CHECK: %[[B_OFFSET:.*]] = add nsw i64 %[[B_DIM_OFFSET]], 0 + // CHECK: %[[B_DIM_OFFSET:.*]] = mul i64 %{{.*}}, %[[B_STRIDE]] + // CHECK: %[[B_OFFSET:.*]] = add i64 %[[B_DIM_OFFSET]], 0 // CHECK: %[[B_BASE_GEP:.*]] = getelementptr {{.*}}, ptr %{{.*}}, i32 0, i32 0 // CHECK: %[[B_BASE:.*]] = load ptr, ptr %[[B_BASE_GEP]] // CHECK: %[[B_VOID_ADDR:.*]] = getelementptr i8, ptr %[[B_BASE]], i64 %[[B_OFFSET]] @@ -172,7 +172,7 @@ func.func @f7(%arg0: !fir.ref, %arg1: !fir.box>) { %0 = fir.shift %c4 : (index) -> !fir.shift<1> // CHECK: %[[STRIDE_GEP:.*]] = getelementptr {{.*}}, ptr %[[Y]], i32 0, i32 7, i32 0, i32 2 // CHECK: %[[STRIDE:.*]] = load i64, ptr %[[STRIDE_GEP]] - // CHECK: mul nsw i64 96, %[[STRIDE]] + // CHECK: mul i64 96, %[[STRIDE]] %1 = fir.array_coor %arg1(%0) %c100 : (!fir.box>, !fir.shift<1>, index) -> !fir.ref %2 = fir.load %1 : !fir.ref fir.store %2 to %arg0 : !fir.ref @@ -202,7 +202,7 @@ func.func @f8(%a : !fir.ref>>, %i : i32) { func.func @f9(%i: i32, %e : i64, %j: i64, %c: !fir.ref>>) -> !fir.ref> { %s = fir.shape %e, %e : (i64, i64) -> !fir.shape<2> // CHECK: %[[CAST:.*]] = sext i32 %[[I]] to i64 - // CHECK: %[[OFFSET:.*]] = mul nsw i64 %{{.*}}, %[[CAST]] + // CHECK: %[[OFFSET:.*]] = mul i64 %{{.*}}, %[[CAST]] // CHECK: getelementptr i8, ptr %[[C]], i64 %[[OFFSET]] %a = fir.array_coor %c(%s) %j, %j typeparams %i : (!fir.ref>>, !fir.shape<2>, i64, i64, i32) -> !fir.ref> return %a : !fir.ref> diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir index be82ffab7e33e..993058ebb0a4d 100644 --- a/flang/test/Fir/convert-to-llvm.fir +++ b/flang/test/Fir/convert-to-llvm.fir @@ -2027,10 +2027,10 @@ func.func @ext_array_coor0(%arg0: !fir.ref>) { // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64 // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[C0_1:.*]] = llvm.mlir.constant(0 : i64) : i64 -// CHECK: %[[IDX:.*]] = llvm.sub %[[C0]], %[[C1]] overflow<nsw> : i64 -// CHECK: %[[DIFF0:.*]] = llvm.mul %[[IDX]], %[[C1]] overflow<nsw> : i64 -// CHECK: %[[SC:.*]] = llvm.mul %[[DIFF0]], %[[C1]] overflow<nsw> : i64 -// CHECK: %[[OFFSET:.*]] = llvm.add %[[SC]], %[[C0_1]] overflow<nsw> : i64 +// CHECK: %[[IDX:.*]] = llvm.sub %[[C0]], %[[C1]] : i64 +// CHECK: %[[DIFF0:.*]] = llvm.mul %[[IDX]], %[[C1]] : i64 +// CHECK: %[[SC:.*]] = llvm.mul %[[DIFF0]], %[[C1]] : i64 +// CHECK: %[[OFFSET:.*]] = llvm.add %[[SC]], %[[C0_1]] : i64 // CHECK: %{{.*}} = llvm.getelementptr %[[ARG0]][%[[OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i32 // Conversion with shift and slice.
@@ -2046,12 +2046,12 @@ func.func @ext_array_coor1(%arg0: !fir.ref>) { // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64 // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[C0_1:.*]] = llvm.mlir.constant(0 : i64) : i64 -// CHECK: %[[IDX:.*]] = llvm.sub %[[C0]], %[[C0]] overflow<nsw> : i64 -// CHECK: %[[DIFF0:.*]] = llvm.mul %[[IDX]], %[[C0]] overflow<nsw> : i64 -// CHECK: %[[ADJ:.*]] = llvm.sub %[[C0]], %[[C0]] overflow<nsw> : i64 -// CHECK: %[[DIFF1:.*]] = llvm.add %[[DIFF0]], %[[ADJ]] overflow<nsw> : i64 -// CHECK: %[[STRIDE:.*]] = llvm.mul %[[DIFF1]], %[[C1]] overflow<nsw> : i64 -// CHECK: %[[OFFSET:.*]] = llvm.add %[[STRIDE]], %[[C0_1]] overflow<nsw> : i64 +// CHECK: %[[IDX:.*]] = llvm.sub %[[C0]], %[[C0]] : i64 +// CHECK: %[[DIFF0:.*]] = llvm.mul %[[IDX]], %[[C0]] : i64 +// CHECK: %[[ADJ:.*]] = llvm.sub %[[C0]], %[[C0]] : i64 +// CHECK: %[[DIFF1:.*]] = llvm.add %[[DIFF0]], %[[ADJ]] : i64 +// CHECK: %[[STRIDE:.*]] = llvm.mul %[[DIFF1]], %[[C1]] : i64 +// CHECK: %[[OFFSET:.*]] = llvm.add %[[STRIDE]], %[[C0_1]] : i64 // CHECK: %{{.*}} = llvm.getelementptr %[[ARG0]][%[[OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i32 // Conversion for a dynamic length char. @@ -2067,10 +2067,10 @@ func.func @ext_array_coor2(%arg0: !fir.ref>>) { // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64 // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[C0_1:.*]] = llvm.mlir.constant(0 : i64) : i64 -// CHECK: %[[IDX:.*]] = llvm.sub %[[C0]], %[[C1]] overflow<nsw> : i64 -// CHECK: %[[DIFF0:.*]] = llvm.mul %[[IDX]], %[[C1]] overflow<nsw> : i64 -// CHECK: %[[SC:.*]] = llvm.mul %[[DIFF0]], %[[C1]] overflow<nsw> : i64 -// CHECK: %[[OFFSET:.*]] = llvm.add %[[SC]], %[[C0_1]] overflow<nsw> : i64 +// CHECK: %[[IDX:.*]] = llvm.sub %[[C0]], %[[C1]] : i64 +// CHECK: %[[DIFF0:.*]] = llvm.mul %[[IDX]], %[[C1]] : i64 +// CHECK: %[[SC:.*]] = llvm.mul %[[DIFF0]], %[[C1]] : i64 +// CHECK: %[[OFFSET:.*]] = llvm.add %[[SC]], %[[C0_1]] : i64 // CHECK: %{{.*}} = llvm.getelementptr %[[ARG0]][%[[OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 // Conversion for a `fir.box`.
@@ -2086,12 +2086,12 @@ func.func @ext_array_coor3(%arg0: !fir.box>) { // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : i64) : i64 // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[C0_1:.*]] = llvm.mlir.constant(0 : i64) : i64 -// CHECK: %[[IDX:.*]] = llvm.sub %[[C0]], %[[C1]] overflow<nsw> : i64 -// CHECK: %[[DIFF0:.*]] = llvm.mul %[[IDX]], %[[C1]] overflow<nsw> : i64 +// CHECK: %[[IDX:.*]] = llvm.sub %[[C0]], %[[C1]] : i64 +// CHECK: %[[DIFF0:.*]] = llvm.mul %[[IDX]], %[[C1]] : i64 // CHECK: %[[GEPSTRIDE:.*]] = llvm.getelementptr %[[ARG0]][0, 7, 0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)> // CHECK: %[[LOADEDSTRIDE:.*]] = llvm.load %[[GEPSTRIDE]] : !llvm.ptr -> i64 -// CHECK: %[[SC:.*]] = llvm.mul %[[DIFF0]], %[[LOADEDSTRIDE]] overflow<nsw> : i64 -// CHECK: %[[OFFSET:.*]] = llvm.add %[[SC]], %[[C0_1]] overflow<nsw> : i64 +// CHECK: %[[SC:.*]] = llvm.mul %[[DIFF0]], %[[LOADEDSTRIDE]] : i64 +// CHECK: %[[OFFSET:.*]] = llvm.add %[[SC]], %[[C0_1]] : i64 // CHECK: %[[GEPADDR:.*]] = llvm.getelementptr %[[ARG0]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)> // CHECK: %[[LOADEDADDR:.*]] = llvm.load %[[GEPADDR]] : !llvm.ptr -> !llvm.ptr // CHECK: %[[GEPADDROFFSET:.*]] = llvm.getelementptr %[[LOADEDADDR]][%[[OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 @@ -2115,12 +2115,12 @@ func.func @ext_array_coor4(%arg0: !fir.ref>) { // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[C1_1:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[C0_1:.*]] = llvm.mlir.constant(0 : i64) : i64 -// CHECK: %[[IDX:.*]] = llvm.sub %[[C1]], %[[C0]] overflow<nsw> : i64 -// CHECK: %[[DIFF0:.*]] = llvm.mul %[[IDX]], %[[C1]] overflow<nsw> : i64 -// CHECK: %[[ADJ:.*]] = llvm.sub %[[C10]], %[[C0]] overflow<nsw> : i64 -// CHECK: %[[DIFF1:.*]] = llvm.add %[[DIFF0]], %[[ADJ]] overflow<nsw> : i64 -// CHECK: %[[STRIDE:.*]] = llvm.mul %[[DIFF1]], %[[C1_1]] overflow<nsw> : i64 -// CHECK: %[[OFFSET:.*]] = llvm.add %[[STRIDE]], %[[C0_1]] overflow<nsw> : i64 +// CHECK: %[[IDX:.*]] = llvm.sub %[[C1]], %[[C0]] : i64 +// CHECK: %[[DIFF0:.*]] = llvm.mul %[[IDX]], %[[C1]] : i64 +// CHECK: %[[ADJ:.*]] = llvm.sub %[[C10]], %[[C0]] : i64 +// CHECK: %[[DIFF1:.*]] = llvm.add %[[DIFF0]], %[[ADJ]] : i64 +// CHECK: %[[STRIDE:.*]] = llvm.mul %[[DIFF1]], %[[C1_1]] : i64 +// CHECK: %[[OFFSET:.*]] = llvm.add %[[STRIDE]], %[[C0_1]] : i64 // CHECK: %{{.*}} = llvm.getelementptr %[[ARG0]][%[[OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i32 // Conversion with index type shape and slice @@ -2134,13 +2134,13 @@ func.func @ext_array_coor5(%arg0: !fir.ref>, %idx1 : index, %i // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr, %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64, %[[VAL_5:.*]]: i64) { // CHECK: %[[VAL_6:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[VAL_7:.*]] = llvm.mlir.constant(0 : i64) : i64 -// CHECK: %[[VAL_8:.*]] = llvm.sub %[[VAL_5]], %[[VAL_6]] overflow<nsw> : i64 -// CHECK: %[[VAL_9:.*]] = llvm.mul %[[VAL_8]], %[[VAL_4]] overflow<nsw> : i64 -// CHECK: %[[VAL_10:.*]] = llvm.sub %[[VAL_2]], %[[VAL_6]] overflow<nsw> : i64 -// CHECK: %[[VAL_11:.*]] = llvm.add %[[VAL_9]], %[[VAL_10]] overflow<nsw> : i64 -// CHECK: %[[VAL_12:.*]] = llvm.mul %[[VAL_11]], %[[VAL_6]] overflow<nsw> : i64 -// CHECK: %[[VAL_13:.*]] = llvm.add %[[VAL_12]], %[[VAL_7]] overflow<nsw> : i64 -// CHECK: %[[VAL_14:.*]] = llvm.mul %[[VAL_6]], %[[VAL_1]] overflow<nsw> : i64 +// CHECK: %[[VAL_8:.*]] = llvm.sub %[[VAL_5]],
%[[VAL_6]] : i64 +// CHECK: %[[VAL_9:.*]] = llvm.mul %[[VAL_8]], %[[VAL_4]] : i64 +// CHECK: %[[VAL_10:.*]] = llvm.sub %[[VAL_2]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_11:.*]] = llvm.add %[[VAL_9]], %[[VAL_10]] : i64 +// CHECK: %[[VAL_12:.*]] = llvm.mul %[[VAL_11]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_13:.*]] = llvm.add %[[VAL_12]], %[[VAL_7]] : i64 +// CHECK: %[[VAL_14:.*]] = llvm.mul %[[VAL_6]], %[[VAL_1]] : i64 // CHECK: %[[VAL_16:.*]] = llvm.getelementptr %[[VAL_0]][%[[VAL_13]]] : (!llvm.ptr, i64) -> !llvm.ptr, i32 // CHECK: } @@ -2155,27 +2155,27 @@ func.func @ext_array_coor6(%arg0: !fir.ref>, %idx1 : index // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr, %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64, %[[VAL_5:.*]]: i64) { // CHECK: %[[VAL_6:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[VAL_7:.*]] = llvm.mlir.constant(0 : i64) : i64 -// CHECK: %[[VAL_8:.*]] = llvm.sub %[[VAL_5]], %[[VAL_6]] overflow<nsw> : i64 -// CHECK: %[[VAL_9:.*]] = llvm.mul %[[VAL_8]], %[[VAL_4]] overflow<nsw> : i64 -// CHECK: %[[VAL_10:.*]] = llvm.sub %[[VAL_2]], %[[VAL_6]] overflow<nsw> : i64 -// CHECK: %[[VAL_11:.*]] = llvm.add %[[VAL_9]], %[[VAL_10]] overflow<nsw> : i64 -// CHECK: %[[VAL_12:.*]] = llvm.mul %[[VAL_11]], %[[VAL_6]] overflow<nsw> : i64 -// CHECK: %[[VAL_13:.*]] = llvm.add %[[VAL_12]], %[[VAL_7]] overflow<nsw> : i64 -// CHECK: %[[VAL_14:.*]] = llvm.mul %[[VAL_6]], %[[VAL_1]] overflow<nsw> : i64 -// CHECK: %[[VAL_15:.*]] = llvm.sub %[[VAL_5]], %[[VAL_6]] overflow<nsw> : i64 -// CHECK: %[[VAL_16:.*]] = llvm.mul %[[VAL_15]], %[[VAL_4]] overflow<nsw> : i64 -// CHECK: %[[VAL_17:.*]] = llvm.sub %[[VAL_2]], %[[VAL_6]] overflow<nsw> : i64 -// CHECK: %[[VAL_18:.*]] = llvm.add %[[VAL_16]], %[[VAL_17]] overflow<nsw> : i64 -// CHECK: %[[VAL_19:.*]] = llvm.mul %[[VAL_18]], %[[VAL_14]] overflow<nsw> : i64 -// CHECK: %[[VAL_20:.*]] = llvm.add %[[VAL_19]], %[[VAL_13]] overflow<nsw> : i64 -// CHECK: %[[VAL_21:.*]] = llvm.mul %[[VAL_14]], %[[VAL_1]] overflow<nsw> : i64 -// CHECK: %[[VAL_22:.*]] = llvm.sub %[[VAL_5]], %[[VAL_6]] overflow<nsw> : i64 -// CHECK: %[[VAL_23:.*]] = llvm.mul %[[VAL_22]], %[[VAL_4]] overflow<nsw> : i64 -// CHECK: %[[VAL_24:.*]] = llvm.sub %[[VAL_2]], %[[VAL_6]] overflow<nsw> : i64 -// CHECK: %[[VAL_25:.*]] = llvm.add %[[VAL_23]], %[[VAL_24]] overflow<nsw> : i64 -// CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_25]], %[[VAL_21]] overflow<nsw> : i64 -// CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[VAL_20]] overflow<nsw> : i64 -// CHECK: %[[VAL_28:.*]] = llvm.mul %[[VAL_21]], %[[VAL_1]] overflow<nsw> : i64 +// CHECK: %[[VAL_8:.*]] = llvm.sub %[[VAL_5]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_9:.*]] = llvm.mul %[[VAL_8]], %[[VAL_4]] : i64 +// CHECK: %[[VAL_10:.*]] = llvm.sub %[[VAL_2]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_11:.*]] = llvm.add %[[VAL_9]], %[[VAL_10]] : i64 +// CHECK: %[[VAL_12:.*]] = llvm.mul %[[VAL_11]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_13:.*]] = llvm.add %[[VAL_12]], %[[VAL_7]] : i64 +// CHECK: %[[VAL_14:.*]] = llvm.mul %[[VAL_6]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_15:.*]] = llvm.sub %[[VAL_5]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_16:.*]] = llvm.mul %[[VAL_15]], %[[VAL_4]] : i64 +// CHECK: %[[VAL_17:.*]] = llvm.sub %[[VAL_2]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_18:.*]] = llvm.add %[[VAL_16]], %[[VAL_17]] : i64 +// CHECK: %[[VAL_19:.*]] = llvm.mul %[[VAL_18]], %[[VAL_14]] : i64 +// CHECK: %[[VAL_20:.*]] = llvm.add %[[VAL_19]], %[[VAL_13]] : i64 +// CHECK: %[[VAL_21:.*]] = llvm.mul %[[VAL_14]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_22:.*]] = llvm.sub %[[VAL_5]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_23:.*]] = llvm.mul %[[VAL_22]], %[[VAL_4]] : i64 +// CHECK:
%[[VAL_24:.*]] = llvm.sub %[[VAL_2]], %[[VAL_6]] : i64 +// CHECK: %[[VAL_25:.*]] = llvm.add %[[VAL_23]], %[[VAL_24]] : i64 +// CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_25]], %[[VAL_21]] : i64 +// CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[VAL_20]] : i64 +// CHECK: %[[VAL_28:.*]] = llvm.mul %[[VAL_21]], %[[VAL_1]] : i64 // CHECK: %[[VAL_30:.*]] = llvm.getelementptr %[[VAL_0]][%[[VAL_27]]] : (!llvm.ptr, i64) -> !llvm.ptr, i32 // CHECK: llvm.return // CHECK: } @@ -2193,13 +2193,13 @@ func.func @ext_array_coor_dt_slice(%arg0: !fir.ref -// CHECK: %[[VAL_9:.*]] = llvm.sub %[[VAL_5]], %[[VAL_7]] overflow<nsw> : i64 -// CHECK: %[[VAL_10:.*]] = llvm.mul %[[VAL_9]], %[[VAL_4]] overflow<nsw> : i64 -// CHECK: %[[VAL_11:.*]] = llvm.sub %[[VAL_2]], %[[VAL_7]] overflow<nsw> : i64 -// CHECK: %[[VAL_12:.*]] = llvm.add %[[VAL_10]], %[[VAL_11]] overflow<nsw> : i64 -// CHECK: %[[VAL_13:.*]] = llvm.mul %[[VAL_12]], %[[VAL_7]] overflow<nsw> : i64 -// CHECK: %[[VAL_14:.*]] = llvm.add %[[VAL_13]], %[[VAL_8]] overflow<nsw> : i64 -// CHECK: %[[VAL_15:.*]] = llvm.mul %[[VAL_7]], %[[VAL_1]] overflow<nsw> : i64 +// CHECK: %[[VAL_9:.*]] = llvm.sub %[[VAL_5]], %[[VAL_7]] : i64 +// CHECK: %[[VAL_10:.*]] = llvm.mul %[[VAL_9]], %[[VAL_4]] : i64 +// CHECK: %[[VAL_11:.*]] = llvm.sub %[[VAL_2]], %[[VAL_7]] : i64 +// CHECK: %[[VAL_12:.*]] = llvm.add %[[VAL_10]], %[[VAL_11]] : i64 +// CHECK: %[[VAL_13:.*]] = llvm.mul %[[VAL_12]], %[[VAL_7]] : i64 +// CHECK: %[[VAL_14:.*]] = llvm.add %[[VAL_13]], %[[VAL_8]] : i64 +// CHECK: %[[VAL_15:.*]] = llvm.mul %[[VAL_7]], %[[VAL_1]] : i64 // CHECK: %[[VAL_17:.*]] = llvm.getelementptr %[[VAL_0]][%[[VAL_14]], 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.struct<"_QFtest_dt_sliceTt", (i32, i32)> // CHECK: llvm.return // CHECK: } @@ -2452,8 +2452,8 @@ func.func @coordinate_box_array_1d(%arg0: !fir.box>, %arg1: // Index of the 1st CFI_dim_t object (corresponds to the 1st dimension) // CHECK-NEXT: %[[DIM_1_MEM_STRIDE_ADDR:.*]] = llvm.getelementptr %[[BOX]][0, 7, 0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)> // CHECK-NEXT: %[[DIM_1_MEM_STRIDE_VAL:.*]] = llvm.load %[[DIM_1_MEM_STRIDE_ADDR]] : !llvm.ptr -> i64 -// CHECK-NEXT: %[[BYTE_OFFSET:.*]] = llvm.mul %[[COORDINATE]], %[[DIM_1_MEM_STRIDE_VAL]] overflow<nsw> : i64 -// CHECK-NEXT: %[[SUBOJECT_OFFSET:.*]] = llvm.add %[[BYTE_OFFSET]], %[[OFFSET_INIT]] overflow<nsw> : i64 +// CHECK-NEXT: %[[BYTE_OFFSET:.*]] = llvm.mul %[[COORDINATE]], %[[DIM_1_MEM_STRIDE_VAL]] : i64 +// CHECK-NEXT: %[[SUBOJECT_OFFSET:.*]] = llvm.add %[[BYTE_OFFSET]], %[[OFFSET_INIT]] : i64 // CHECK-NEXT: %[[SUBOBJECT_ADDR:.*]] = llvm.getelementptr %[[ARRAY_OBJECT]][%[[SUBOJECT_OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 // CHECK-NEXT: llvm.return @@ -2471,8 +2471,8 @@ func.func @coordinate_of_box_dynamic_array_1d(%arg0: !fir.box !llvm.ptr, !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<1 x array<3 x i64>>)> // CHECK-NEXT: %[[DIM_1_MEM_STRIDE_VAL:.*]] = llvm.load %[[DIM_1_MEM_STRIDE_ADDR]] : !llvm.ptr -> i64 -// CHECK-NEXT: %[[BYTE_OFFSET:.*]] = llvm.mul %[[COORDINATE]], %[[DIM_1_MEM_STRIDE_VAL]] overflow<nsw> : i64 -// CHECK-NEXT: %[[SUBOJECT_OFFSET:.*]] = llvm.add %[[BYTE_OFFSET]], %[[OFFSET_INIT]] overflow<nsw> : i64 +// CHECK-NEXT: %[[BYTE_OFFSET:.*]] = llvm.mul %[[COORDINATE]], %[[DIM_1_MEM_STRIDE_VAL]] : i64 +// CHECK-NEXT: %[[SUBOJECT_OFFSET:.*]] = llvm.add %[[BYTE_OFFSET]], %[[OFFSET_INIT]] : i64 // CHECK-NEXT: %[[SUBOBJECT_ADDR:.*]] = llvm.getelementptr %[[ARRAY_OBJECT]][%[[SUBOJECT_OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 // CHECK-NEXT: llvm.return @@ -2492,13
+2492,13 @@ func.func @coordinate_box_array_2d(%arg0: !fir.box>, % // Index of the 1st CFI_dim_t object (corresponds to the 1st dimension) // CHECK-NEXT: %[[DIM_1_MEM_STRIDE_ADDR:.*]] = llvm.getelementptr %[[BOX]][0, 7, 0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<2 x array<3 x i64>>)> // CHECK-NEXT: %[[DIM_1_MEM_STRIDE_VAL:.*]] = llvm.load %[[DIM_1_MEM_STRIDE_ADDR]] : !llvm.ptr -> i64 -// CHECK-NEXT: %[[BYTE_OFFSET_1:.*]] = llvm.mul %[[COORDINATE_1]], %[[DIM_1_MEM_STRIDE_VAL]] overflow<nsw> : i64 -// CHECK-NEXT: %[[SUBOBJECT_OFFSET_1:.*]] = llvm.add %[[BYTE_OFFSET]], %[[OFFSET_INIT]] overflow<nsw> : i64 +// CHECK-NEXT: %[[BYTE_OFFSET_1:.*]] = llvm.mul %[[COORDINATE_1]], %[[DIM_1_MEM_STRIDE_VAL]] : i64 +// CHECK-NEXT: %[[SUBOBJECT_OFFSET_1:.*]] = llvm.add %[[BYTE_OFFSET]], %[[OFFSET_INIT]] : i64 // Index of the 1st CFI_dim_t object (corresponds to the 2nd dimension) // CHECK-NEXT: %[[DIM_2_MEM_STRIDE_ADDR:.*]] = llvm.getelementptr %[[BOX]][0, 7, 1, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, i{{.*}}, array<2 x array<3 x i64>>)> // CHECK-NEXT: %[[DIM_2_MEM_STRIDE_VAL:.*]] = llvm.load %[[DIM_2_MEM_STRIDE_ADDR]] : !llvm.ptr -> i64 -// CHECK-NEXT: %[[BYTE_OFFSET_2:.*]] = llvm.mul %[[COORDINATE_2]], %[[DIM_2_MEM_STRIDE_VAL]] overflow<nsw> : i64 -// CHECK-NEXT: %[[SUBOBJECT_OFFSET_2:.*]] = llvm.add %[[BYTE_OFFSET_2]], %[[SUBOBJECT_OFFSET_1]] overflow<nsw> : i64 +// CHECK-NEXT: %[[BYTE_OFFSET_2:.*]] = llvm.mul %[[COORDINATE_2]], %[[DIM_2_MEM_STRIDE_VAL]] : i64 +// CHECK-NEXT: %[[SUBOBJECT_OFFSET_2:.*]] = llvm.add %[[BYTE_OFFSET_2]], %[[SUBOBJECT_OFFSET_1]] : i64 // CHECK-NEXT: %[[SUBOBJECT_ADDR:.*]] = llvm.getelementptr %[[ARRAY_OBJECT]][%[[SUBOBJECT_OFFSET_2]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 // CHECK-NEXT: llvm.return @@ -2520,8 +2520,8 @@ func.func @coordinate_box_derived_inside_array(%arg0: !fir.box !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> // CHECK: %[[VAL_14:.*]] = llvm.load %[[VAL_13]] : !llvm.ptr -> i64 -// CHECK: %[[VAL_15:.*]] = llvm.mul %[[COORDINATE_1]], %[[VAL_14]] overflow<nsw> : i64 -// CHECK: %[[OFFSET:.*]] = llvm.add %[[VAL_15]], %[[VAL_8]] overflow<nsw> : i64 +// CHECK: %[[VAL_15:.*]] = llvm.mul %[[COORDINATE_1]], %[[VAL_14]] : i64 +// CHECK: %[[OFFSET:.*]] = llvm.add %[[VAL_15]], %[[VAL_8]] : i64 // CHECK: %[[DERIVED:.*]] = llvm.getelementptr %[[ARRAY]][%[[OFFSET]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 // CHECK: %[[VAL_20:.*]] = llvm.getelementptr %[[DERIVED]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"derived_3", (f32, f32)> // CHECK: llvm.return diff --git a/flang/test/Fir/coordinateof.fir b/flang/test/Fir/coordinateof.fir index acb9fd57c0955..c605f43a15437 100644 --- a/flang/test/Fir/coordinateof.fir +++ b/flang/test/Fir/coordinateof.fir @@ -30,8 +30,8 @@ func.func @foo3(%box : !fir.box>, %i : i32) -> i32 { // CHECK: %[[boxptr:.*]] = load ptr, ptr %[[gep0]] // CHECK: %[[gep1:.*]] = getelementptr { ptr, i64, {{.*}} i32 7 // CHECK: %[[stride:.*]] = load i64, ptr %[[gep1]] - // CHECK: %[[dimoffset:.*]] = mul nsw i64 %[[cvt]], %[[stride]] - // CHECK: %[[offset:.*]] = add nsw i64 %[[dimoffset]], 0 + // CHECK: %[[dimoffset:.*]] = mul i64 %[[cvt]], %[[stride]] + // CHECK: %[[offset:.*]] = add i64 %[[dimoffset]], 0 // CHECK: %[[gep2:.*]] = getelementptr i8, ptr %[[boxptr]], i64 %[[offset]] %1 = fir.coordinate_of %box, %ii : (!fir.box>, index) -> !fir.ref // CHECK: load i32, ptr %[[gep2]] @@ -68,8 +68,8 @@ func.func @foo6(%box
: !fir.box>>>, %i : i64 // CHECK: %[[addr:.*]] = load ptr, ptr %[[addr_gep]] // CHECK: %[[stride_gep:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[box]], i32 0, i32 7, i32 0, i32 2 // CHECK: %[[stride:.*]] = load i64, ptr %[[stride_gep]] - // CHECK: %[[mul:.*]] = mul nsw i64 %{{.*}}, %[[stride]] - // CHECK: %[[offset:.*]] = add nsw i64 %[[mul]], 0 + // CHECK: %[[mul:.*]] = mul i64 %{{.*}}, %[[stride]] + // CHECK: %[[offset:.*]] = add i64 %[[mul]], 0 // CHECK: %[[gep:.*]] = getelementptr i8, ptr %[[addr]], i64 %[[offset]] %coor = fir.coordinate_of %box, %i : (!fir.box>>>, i64) -> !fir.ref> diff --git a/flang/test/Fir/tbaa.fir b/flang/test/Fir/tbaa.fir index 4474bbbe3dd74..bbc8a81bdf683 100644 --- a/flang/test/Fir/tbaa.fir +++ b/flang/test/Fir/tbaa.fir @@ -41,8 +41,8 @@ module { // CHECK: %[[VAL_8:.*]] = llvm.mlir.constant(0 : i64) : i64 // CHECK: %[[VAL_9:.*]] = llvm.getelementptr %[[VAL_0]][0, 7, 0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>, ptr, array<1 x i64>)> // CHECK: %[[VAL_10:.*]] = llvm.load %[[VAL_9]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i64 -// CHECK: %[[VAL_11:.*]] = llvm.mul %[[VAL_4]], %[[VAL_10]] overflow<nsw> : i64 -// CHECK: %[[VAL_12:.*]] = llvm.add %[[VAL_11]], %[[VAL_8]] overflow<nsw> : i64 +// CHECK: %[[VAL_11:.*]] = llvm.mul %[[VAL_4]], %[[VAL_10]] : i64 +// CHECK: %[[VAL_12:.*]] = llvm.add %[[VAL_11]], %[[VAL_8]] : i64 // CHECK: %[[VAL_14:.*]] = llvm.getelementptr %[[VAL_7]]{{\[}}%[[VAL_12]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 // CHECK: %[[VAL_16:.*]] = llvm.mlir.constant(0 : i64) : i64 // CHECK: %[[VAL_17:.*]] = llvm.mlir.constant(-1 : i32) : i32 @@ -330,12 +330,12 @@ func.func @tbaa(%arg0: !fir.box>) { // CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(0 : i64) : i64 // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(0 : i64) : i64 -// CHECK: %[[VAL_4:.*]] = llvm.sub %[[VAL_1]], %[[VAL_2]] overflow<nsw> : i64 -// CHECK: %[[VAL_5:.*]] = llvm.mul %[[VAL_4]], %[[VAL_2]] overflow<nsw> : i64 +// CHECK: %[[VAL_4:.*]] = llvm.sub %[[VAL_1]], %[[VAL_2]] : i64 +// CHECK: %[[VAL_5:.*]] = llvm.mul %[[VAL_4]], %[[VAL_2]] : i64 // CHECK: %[[VAL_6:.*]] = llvm.getelementptr %[[VAL_0]][0, 7, 0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> // CHECK: %[[VAL_7:.*]] = llvm.load %[[VAL_6]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> i64 -// CHECK: %[[VAL_8:.*]] = llvm.mul %[[VAL_5]], %[[VAL_7]] overflow<nsw> : i64 -// CHECK: %[[VAL_9:.*]] = llvm.add %[[VAL_8]], %[[VAL_3]] overflow<nsw> : i64 +// CHECK: %[[VAL_8:.*]] = llvm.mul %[[VAL_5]], %[[VAL_7]] : i64 +// CHECK: %[[VAL_9:.*]] = llvm.add %[[VAL_8]], %[[VAL_3]] : i64 // CHECK: %[[VAL_10:.*]] = llvm.getelementptr %[[VAL_0]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> // CHECK: %[[VAL_11:.*]] = llvm.load %[[VAL_10]] {tbaa = [#[[$BOXT]]]} : !llvm.ptr -> !llvm.ptr // CHECK: %[[VAL_13:.*]] = llvm.getelementptr %[[VAL_11]]{{\[}}%[[VAL_9]]] : (!llvm.ptr, i64) -> !llvm.ptr, i8 diff --git a/flang/test/Lower/HLFIR/implicit-type-conversion-allocatable.f90 b/flang/test/Lower/HLFIR/implicit-type-conversion-allocatable.f90 index 361cd61adea2d..7083a825dfd3b 100644 --- a/flang/test/Lower/HLFIR/implicit-type-conversion-allocatable.f90 +++ b/flang/test/Lower/HLFIR/implicit-type-conversion-allocatable.f90 @@ -38,29 +38,3 @@ subroutine preserve_lbounds(x, y) ! CHECK: hlfir.destroy %[[VAL_8]] : !hlfir.expr ! CHECK: return !
CHECK: } - -! Test that RHS character length is preserved in a character KIND -! conversion before the assignment. -subroutine kind_and_length(a, b) - character(len=4,kind=4), allocatable :: a(:) - character(len=2,kind=1) :: b(:) - a = b -end subroutine -! CHECK-LABEL: func.func @_QPkind_and_length( -! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index -! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0:[a-z0-9]*]] typeparams %[[VAL_2:[a-z0-9]*]] {{.*}}Ea -! CHECK: %[[VAL_4:.*]] = arith.constant 2 : index -! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1:[a-z0-9]*]] typeparams %[[VAL_4:[a-z0-9]*]] {{.*}}Eb -! CHECK: %[[VAL_6:.*]] = arith.constant 0 : index -! CHECK: %[[VAL_7:.*]]:3 = fir.box_dims %[[VAL_5]]#0, %[[VAL_6]] : (!fir.box>>, index) -> (index, index, index) -! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_7]]#1 : (index) -> !fir.shape<1> -! CHECK: %[[VAL_9:.*]] = hlfir.elemental %[[VAL_8]] typeparams %[[VAL_4]] unordered : (!fir.shape<1>, index) -> !hlfir.expr> { -! CHECK: ^bb0(%[[VAL_10:.*]]: index): -! CHECK: %[[VAL_11:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_10]]) typeparams %[[VAL_4]] : (!fir.box>>, index, index) -> !fir.ref> -! CHECK: %[[VAL_12:.*]] = fir.alloca !fir.char<4,?>(%[[VAL_4]] : index) -! CHECK: fir.char_convert %[[VAL_11]] for %[[VAL_4]] to %[[VAL_12]] : !fir.ref>, index, !fir.ref> -! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12:[a-z0-9]*]] typeparams %[[VAL_4:[a-z0-9]*]] {uniq_name = ".temp.kindconvert"} : (!fir.ref>, index) -> (!fir.boxchar<4>, !fir.ref>) -! CHECK: hlfir.yield_element %[[VAL_13]]#0 : !fir.boxchar<4> -! CHECK: } -! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_3]]#0 realloc keep_lhs_len : !hlfir.expr>, !fir.ref>>>> - diff --git a/libc/src/__support/FPUtil/Hypot.h b/libc/src/__support/FPUtil/Hypot.h index fb04d8a7775df..8a6eb4b920acd 100644 --- a/libc/src/__support/FPUtil/Hypot.h +++ b/libc/src/__support/FPUtil/Hypot.h @@ -178,7 +178,7 @@ LIBC_INLINE T hypot(T x, T y) { // But before that, remember to store the losing bits to sticky. // The shift length is for a^2 and b^2, so it's double of the exponent // difference between a and b. - uint16_t shift_length = static_cast<uint16_t>(2 * (a_exp - b_exp)); + uint16_t shift_length = 2 * (a_exp - b_exp); sticky_bits = ((b_mant_sq & ((DUIntType(1) << shift_length) - DUIntType(1))) != DUIntType(0)); diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h index 5f0d8f49ccf64..b7920943804e6 100644 --- a/libc/src/__support/FPUtil/dyadic_float.h +++ b/libc/src/__support/FPUtil/dyadic_float.h @@ -94,7 +94,7 @@ template <size_t Bits> struct DyadicFloat { return 0.0; // Assume that it is normalized, and output is also normal.
- constexpr uint32_t PRECISION = FloatProperties<T>::MANTISSA_PRECISION; + constexpr size_t PRECISION = FloatProperties<T>::MANTISSA_WIDTH + 1; using output_bits_t = typename FPBits<T>::UIntType; int exp_hi = exponent + static_cast<int>((Bits - 1) + @@ -110,12 +110,12 @@ template <size_t Bits> struct DyadicFloat { exp_hi = FloatProperties<T>::EXPONENT_BIAS; } - int exp_lo = exp_hi - static_cast<int>(PRECISION) - 1; + int exp_lo = exp_hi - PRECISION - 1; MantissaType m_hi(mantissa >> shift); T d_hi = FPBits<T>::create_value(sign, exp_hi, - static_cast<output_bits_t>(m_hi) & + output_bits_t(m_hi) & FloatProperties<T>::MANTISSA_MASK) .get_val(); diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h index bdc713e6a8db2..f72b995f8788d 100644 --- a/libc/src/__support/UInt.h +++ b/libc/src/__support/UInt.h @@ -25,13 +25,12 @@ namespace LIBC_NAMESPACE::cpp { -// BigInt has the same semantics as the 'standard integer types' and can be -// safely 'bit_cast'ed to compatible types. template <size_t Bits> struct BigInt { + static_assert(Bits > 0 && Bits % 64 == 0, "Number of bits in BigInt should be a multiple of 64."); LIBC_INLINE_VAR static constexpr size_t WORDCOUNT = Bits / 64; - uint64_t val[WORDCOUNT]; + uint64_t val[WORDCOUNT]{}; LIBC_INLINE_VAR static constexpr uint64_t MASK32 = 0xFFFFFFFFu; @@ -159,7 +158,7 @@ template <size_t Bits> struct BigInt { LIBC_INLINE constexpr BigInt operator+(const BigInt &other) const { - BigInt result(0); + BigInt result; SumCarry s{0, 0}; for (size_t i = 0; i < WORDCOUNT; ++i) { s = add_with_carry(val[i], other.val[i], s.carry); @@ -172,7 +171,7 @@ template <size_t Bits> struct BigInt { // it will always use the constexpr version of add_with_carry. LIBC_INLINE constexpr BigInt operator+(BigInt &&other) const { - BigInt result(0); + BigInt result; SumCarry s{0, 0}; for (size_t i = 0; i < WORDCOUNT; ++i) { s = add_with_carry_const(val[i], other.val[i], s.carry); @@ -200,7 +199,7 @@ template <size_t Bits> struct BigInt { LIBC_INLINE BigInt operator-(const BigInt &other) const { - BigInt result(0); + BigInt result; DiffBorrow d{0, 0}; for (size_t i = 0; i < WORDCOUNT; ++i) { d = sub_with_borrow(val[i], other.val[i], d.borrow); @@ -211,7 +210,7 @@ template <size_t Bits> struct BigInt { LIBC_INLINE constexpr BigInt operator-(BigInt &&other) const { - BigInt result(0); + BigInt result; DiffBorrow d{0, 0}; for (size_t i = 0; i < WORDCOUNT; ++i) { d = sub_with_borrow_const(val[i], other.val[i], d.borrow); @@ -691,7 +690,7 @@ template <size_t Bits> struct BigInt { LIBC_INLINE constexpr BigInt operator&(const BigInt &other) const { - BigInt result(0); + BigInt result; for (size_t i = 0; i < WORDCOUNT; ++i) result.val[i] = val[i] & other.val[i]; return result; @@ -706,7 +705,7 @@ template <size_t Bits> struct BigInt { LIBC_INLINE constexpr BigInt operator|(const BigInt &other) const { - BigInt result(0); + BigInt result; for (size_t i = 0; i < WORDCOUNT; ++i) result.val[i] = val[i] | other.val[i]; return result; @@ -721,7 +720,7 @@ template <size_t Bits> struct BigInt { LIBC_INLINE constexpr BigInt operator^(const BigInt &other) const { - BigInt result(0); + BigInt result; for (size_t i = 0; i < WORDCOUNT; ++i) result.val[i] = val[i] ^ other.val[i]; return result; @@ -735,7 +734,7 @@ template <size_t Bits> struct BigInt { } LIBC_INLINE constexpr BigInt operator~() const { - BigInt result(0); + BigInt result; for (size_t i = 0; i < WORDCOUNT; ++i) result.val[i] = ~val[i]; return result; diff --git a/libc/src/__support/float_to_string.h b/libc/src/__support/float_to_string.h index 92dd2e8f7e903..34c0c0ceef286 100644 --- a/libc/src/__support/float_to_string.h +++ b/libc/src/__support/float_to_string.h @@ -653,7 +653,7 @@
FloatToString<T>::get_positive_block(int block_index) { // shift_amount = -(c0 - exponent) = c_0 + 16 * ceil(exponent/16) - exponent - cpp::UInt<MID_INT_SIZE> val(0); + cpp::UInt<MID_INT_SIZE> val; #ifdef LIBC_COPT_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE // ------------------------------ TABLE MODE ------------------------------- const int32_t SHIFT_CONST = TABLE_SHIFT_CONST; @@ -704,7 +704,7 @@ FloatToString<T>::get_negative_block(int block_index) { if (exponent < 0) { const int32_t idx = -exponent / IDX_SIZE; - cpp::UInt<MID_INT_SIZE> val(0); + cpp::UInt<MID_INT_SIZE> val; #ifdef LIBC_COPT_FLOAT_TO_STR_USE_MEGA_LONG_DOUBLE_TABLE // ------------------------------ TABLE MODE ------------------------------- const int32_t SHIFT_CONST = TABLE_SHIFT_CONST; diff --git a/libc/src/math/generic/explogxf.h b/libc/src/math/generic/explogxf.h index 156c24c21e231..512785be2cb85 100644 --- a/libc/src/math/generic/explogxf.h +++ b/libc/src/math/generic/explogxf.h @@ -311,8 +311,7 @@ LIBC_INLINE static double log_eval(double x) { // p1 is the leading 7 bits of mx, i.e. // p1 * 2^(-7) <= m_x < (p1 + 1) * 2^(-7). - int p1 = static_cast<int>(bs.get_mantissa() >> - (FPB::FloatProp::MANTISSA_WIDTH - 7)); + int p1 = (bs.get_mantissa() >> (FPB::FloatProp::MANTISSA_WIDTH - 7)); // Set bs to (1 + (mx - p1*2^(-7)) bs.bits &= FPB::FloatProp::MANTISSA_MASK >> 7; diff --git a/libc/src/math/generic/log.cpp b/libc/src/math/generic/log.cpp index dfa41ad64578d..46b64df689086 100644 --- a/libc/src/math/generic/log.cpp +++ b/libc/src/math/generic/log.cpp @@ -769,7 +769,7 @@ LLVM_LIBC_FUNCTION(double, log, (double x)) { // Range reduction for log(x_m): // For each x_m, we would like to find r such that: // -2^-8 <= r * x_m - 1 < 2^-7 - int shifted = static_cast<int>(x_u >> 45); + int shifted = x_u >> 45; int index = shifted & 0x7F; double r = RD[index]; diff --git a/libc/src/math/generic/log10.cpp b/libc/src/math/generic/log10.cpp index 2a801c6e98429..38789acc441e5 100644 --- a/libc/src/math/generic/log10.cpp +++ b/libc/src/math/generic/log10.cpp @@ -770,7 +770,7 @@ LLVM_LIBC_FUNCTION(double, log10, (double x)) { // Range reduction for log10(x_m): // For each x_m, we would like to find r such that: // -2^-8 <= r * x_m - 1 < 2^-7 - int shifted = static_cast<int>(x_u >> 45); + int shifted = x_u >> 45; int index = shifted & 0x7F; double r = RD[index]; diff --git a/libc/src/math/generic/log1p.cpp b/libc/src/math/generic/log1p.cpp index 02299e271770a..c6ee8d8f9bbfb 100644 --- a/libc/src/math/generic/log1p.cpp +++ b/libc/src/math/generic/log1p.cpp @@ -949,9 +949,8 @@ LLVM_LIBC_FUNCTION(double, log1p, (double x)) { x_u = xhi_bits.uintval(); // Range reduction: // Find k such that |x_hi - k * 2^-7| <= 2^-8. - int idx = static_cast<int>( - ((x_u & MANTISSA_MASK) + (1ULL << (MANTISSA_WIDTH - 8))) >> - (MANTISSA_WIDTH - 7)); + int idx = ((x_u & MANTISSA_MASK) + (1ULL << (MANTISSA_WIDTH - 8))) >> + (MANTISSA_WIDTH - 7); int x_e = xhi_bits.get_exponent() + (idx >> 7); double e_x = static_cast<double>(x_e); diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp index 023387d8add00..5b4e7edcceb47 100644 --- a/libc/src/math/generic/log1pf.cpp +++ b/libc/src/math/generic/log1pf.cpp @@ -56,8 +56,8 @@ LIBC_INLINE float log(double x) { // Get the 8 highest bits, use 7 bits (excluding the implicit hidden bit) for // lookup tables.
- int f_index = static_cast<int>( - xbits.get_mantissa() >> 45); // fputil::MantissaWidth<double>::VALUE - 7 + int f_index = + xbits.get_mantissa() >> 45; // fputil::MantissaWidth<double>::VALUE - 7 // Set bits to 1.m xbits.set_unbiased_exponent(0x3FF); diff --git a/libc/src/math/generic/log2.cpp b/libc/src/math/generic/log2.cpp index 2ceddf87dfd56..d72b0931c14b8 100644 --- a/libc/src/math/generic/log2.cpp +++ b/libc/src/math/generic/log2.cpp @@ -890,7 +890,7 @@ LLVM_LIBC_FUNCTION(double, log2, (double x)) { // Range reduction for log2(x_m): // For each x_m, we would like to find r such that: // -2^-8 <= r * x_m - 1 < 2^-7 - int shifted = static_cast<int>(x_u >> 45); + int shifted = x_u >> 45; int index = shifted & 0x7F; double r = RD[index]; diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index 187b70781bdad..740209bc83d75 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -89,11 +89,8 @@ add_libc_test( SRCS uint_test.cpp DEPENDS - libc.src.__support.CPP.bit - libc.src.__support.CPP.optional - libc.src.__support.CPP.type_traits - libc.src.__support.macros.properties.float libc.src.__support.uint + libc.src.__support.CPP.optional ) add_libc_test( diff --git a/libc/test/src/__support/uint_test.cpp b/libc/test/src/__support/uint_test.cpp index e4d24b08da867..971bac55bd9d3 100644 --- a/libc/test/src/__support/uint_test.cpp +++ b/libc/test/src/__support/uint_test.cpp @@ -6,73 +6,23 @@ // //===----------------------------------------------------------------------===// -#include "src/__support/CPP/bit.h" // bit_cast #include "src/__support/CPP/optional.h" -#include "src/__support/CPP/type_traits.h" // is_trivially_constructible #include "src/__support/UInt.h" -#include "src/__support/macros/properties/float.h" // LIBC_COMPILER_HAS_FLOAT128 #include "test/UnitTest/Test.h" -#include <math.h> // HUGE_VALF, HUGE_VALF - -namespace LIBC_NAMESPACE { - -using LL_UInt64 = cpp::UInt<64>; -// We want to test cpp::UInt<128> explicitly. So, for +// We want to test LIBC_NAMESPACE::cpp::UInt<128> explicitly. So, for // convenience, we use a sugar which does not conflict with the UInt128 type // which can resolve to __uint128_t if the platform has it.
-using LL_UInt128 = cpp::UInt<128>;
-using LL_UInt192 = cpp::UInt<192>;
-using LL_UInt256 = cpp::UInt<256>;
-using LL_UInt320 = cpp::UInt<320>;
-using LL_UInt512 = cpp::UInt<512>;
-using LL_UInt1024 = cpp::UInt<1024>;
-
-using LL_Int128 = cpp::Int<128>;
-using LL_Int192 = cpp::Int<192>;
-
-TEST(LlvmLibcUIntClassTest, BitCastToFromDouble) {
-  static_assert(cpp::is_trivially_constructible<LL_UInt64>::value);
-  static_assert(cpp::is_trivially_copyable<LL_UInt64>::value);
-  static_assert(sizeof(LL_UInt64) == sizeof(double));
-  const double inf = HUGE_VAL;
-  const double max = DBL_MAX;
-  const double array[] = {0.0, 0.1, 1.0, max, inf};
-  for (double value : array) {
-    LL_UInt64 back = cpp::bit_cast<LL_UInt64>(value);
-    double forth = cpp::bit_cast<double>(back);
-    EXPECT_TRUE(value == forth);
-  }
-}
+using LL_UInt128 = LIBC_NAMESPACE::cpp::UInt<128>;
+using LL_UInt192 = LIBC_NAMESPACE::cpp::UInt<192>;
+using LL_UInt256 = LIBC_NAMESPACE::cpp::UInt<256>;
+using LL_UInt320 = LIBC_NAMESPACE::cpp::UInt<320>;
+using LL_UInt512 = LIBC_NAMESPACE::cpp::UInt<512>;
+using LL_UInt1024 = LIBC_NAMESPACE::cpp::UInt<1024>;
 
-#ifdef __SIZEOF_INT128__
-TEST(LlvmLibcUIntClassTest, BitCastToFromNativeUint128) {
-  static_assert(cpp::is_trivially_constructible<LL_UInt128>::value);
-  static_assert(cpp::is_trivially_copyable<LL_UInt128>::value);
-  static_assert(sizeof(LL_UInt128) == sizeof(__uint128_t));
-  const __uint128_t array[] = {0, 1, ~__uint128_t(0)};
-  for (__uint128_t value : array) {
-    LL_UInt128 back = cpp::bit_cast<LL_UInt128>(value);
-    __uint128_t forth = cpp::bit_cast<__uint128_t>(back);
-    EXPECT_TRUE(value == forth);
-  }
-}
-#endif
-
-#ifdef LIBC_COMPILER_HAS_FLOAT128
-TEST(LlvmLibcUIntClassTest, BitCastToFromNativeFloat128) {
-  static_assert(cpp::is_trivially_constructible<LL_UInt128>::value);
-  static_assert(cpp::is_trivially_copyable<LL_UInt128>::value);
-  static_assert(sizeof(LL_UInt128) == sizeof(float128));
-  const float128 array[] = {0, 0.1, 1};
-  for (float128 value : array) {
-    LL_UInt128 back = cpp::bit_cast<LL_UInt128>(value);
-    float128 forth = cpp::bit_cast<float128>(back);
-    EXPECT_TRUE(value == forth);
-  }
-}
-#endif
+using LL_Int128 = LIBC_NAMESPACE::cpp::Int<128>;
+using LL_Int192 = LIBC_NAMESPACE::cpp::Int<192>;
 
 TEST(LlvmLibcUIntClassTest, BasicInit) {
   LL_UInt128 half_val(12345);
@@ -684,5 +634,3 @@ TEST(LlvmLibcUIntClassTest, ConstructorFromUInt128Tests) {
 }
 
 #endif // __SIZEOF_INT128__
-
-} // namespace LIBC_NAMESPACE
diff --git a/libcxx/test/std/time/time.syn/formatter_tests.h b/libcxx/test/std/time/time.syn/formatter_tests.h
index 1b343b5c8711b..42c176b3b47e5 100644
--- a/libcxx/test/std/time/time.syn/formatter_tests.h
+++ b/libcxx/test/std/time/time.syn/formatter_tests.h
@@ -8,8 +8,6 @@
 #ifndef TEST_STD_TIME_TIME_SYN_FORMATTER_TESTS_H
 #define TEST_STD_TIME_TIME_SYN_FORMATTER_TESTS_H
 
-#include "assert_macros.h"
-#include "concat_macros.h"
 #include "make_string.h"
 #include "string_literal.h"
 #include "test_format_string.h"
@@ -36,9 +34,11 @@ using format_context = std::format_context;
 
 template <class CharT, class... Args>
 void check(std::basic_string_view<CharT> expected, test_format_string<CharT, Args...> fmt, Args&&... args) {
   std::basic_string<CharT> out = std::format(fmt, std::forward<Args>(args)...);
-  TEST_REQUIRE(out == expected,
-               TEST_WRITE_CONCATENATED(
-                   "\nFormat string ", fmt.get(), "\nExpected output ", expected, "\nActual output ", out, '\n'));
+  if constexpr (std::same_as<CharT, char>)
+    if (out != expected)
+      std::cerr << "\nFormat string " << fmt.get() << "\nExpected output " << expected << "\nActual output " << out
+                << '\n';
+  assert(out == expected);
 }
 
 template <class CharT, class... Args>
@@ -47,24 +47,38 @@ void check(const std::locale& loc, test_format_string fmt, Args&&...
args) { std::basic_string out = std::format(loc, fmt, std::forward(args)...); - TEST_REQUIRE(out == expected, - TEST_WRITE_CONCATENATED( - "\nFormat string ", fmt.get(), "\nExpected output ", expected, "\nActual output ", out, '\n')); + if constexpr (std::same_as) + if (out != expected) + std::cerr << "\nFormat string " << fmt.get() << "\nExpected output " << expected << "\nActual output " << out + << '\n'; + assert(out == expected); } template void check_exception([[maybe_unused]] std::string_view what, [[maybe_unused]] std::basic_string_view fmt, [[maybe_unused]] const Args&... args) { - TEST_VALIDATE_EXCEPTION( - std::format_error, - [&]([[maybe_unused]] const std::format_error& e) { - TEST_LIBCPP_REQUIRE( - e.what() == what, - TEST_WRITE_CONCATENATED( - "\nFormat string ", fmt, "\nExpected exception ", what, "\nActual exception ", e.what(), '\n')); - }, - TEST_IGNORE_NODISCARD std::vformat(fmt, std::make_format_args>(args...))); +#ifndef TEST_HAS_NO_EXCEPTIONS + try { + TEST_IGNORE_NODISCARD std::vformat(fmt, std::make_format_args>(args...)); + if constexpr (std::same_as) + std::cerr << "\nFormat string " << fmt << "\nDidn't throw an exception.\n"; + assert(false); + } catch (const std::format_error& e) { +# if defined(_LIBCPP_VERSION) + if constexpr (std::same_as) + if (e.what() != what) + std::cerr << "\nFormat string " << fmt << "\nExpected exception " << what << "\nActual exception " + << e.what() << '\n'; + assert(e.what() == what); +# else + (void)what; + (void)e; +# endif + return; + } + assert(false); +#endif } template diff --git a/libcxx/test/support/concat_macros.h b/libcxx/test/support/concat_macros.h index bc1306fbdb533..8d80a8c9f69ca 100644 --- a/libcxx/test/support/concat_macros.h +++ b/libcxx/test/support/concat_macros.h @@ -16,154 +16,15 @@ #include "test_macros.h" #ifndef TEST_HAS_NO_LOCALIZATION -# include -# include # include #endif #if TEST_STD_VER > 17 # ifndef TEST_HAS_NO_LOCALIZATION - -[[nodiscard]] constexpr bool test_is_high_surrogate(char32_t value) { return value >= 0xd800 && value <= 0xdbff; } - -[[nodiscard]] constexpr bool test_is_low_surrogate(char32_t value) { return value >= 0xdc00 && value <= 0xdfff; } - -[[nodiscard]] constexpr bool test_is_surrogate(char32_t value) { return value >= 0xd800 && value <= 0xdfff; } - -[[nodiscard]] constexpr bool test_is_code_point(char32_t value) { return value <= 0x10ffff; } - -[[nodiscard]] constexpr bool test_is_scalar_value(char32_t value) { - return test_is_code_point(value) && !test_is_surrogate(value); -} - -inline constexpr char32_t test_replacement_character = U'\ufffd'; - -template -OutIt test_transcode() = delete; - -template - requires(std::output_iterator && std::same_as, char8_t>) -OutIt test_transcode(InIt first, InIt last, OutIt out_it) { - return std::copy(first, last, out_it); -} - -template - requires std::output_iterator -void test_encode(OutIt& out_it, char16_t value) { - if (value < 0x80) - *out_it++ = value; - else if (value < 0x800) { - *out_it++ = 0b11000000 | (value >> 6); - *out_it++ = 0b10000000 | (value & 0b00111111); - } else { - *out_it++ = 0b11100000 | (value >> 12); - *out_it++ = 0b10000000 | ((value) >> 6 & 0b00111111); - *out_it++ = 0b10000000 | (value & 0b00111111); - } -} - -template - requires std::output_iterator -void test_encode(OutIt& out_it, char32_t value) { - if ((value & 0xffff0000) == 0) - test_encode(out_it, static_cast(value)); - else { - *out_it++ = 0b11100000 | (value >> 18); - *out_it++ = 0b10000000 | ((value) >> 12 & 0b00111111); - *out_it++ = 0b10000000 | ((value) 
>> 6 & 0b00111111); - *out_it++ = 0b10000000 | (value & 0b00111111); - } -} - -template - requires(std::output_iterator && - (std::same_as, char16_t> -# ifndef TEST_HAS_NO_WIDE_CHARACTERS - || (std::same_as, wchar_t> && sizeof(wchar_t) == 2)) -# endif - ) -OutIt test_transcode(InIt first, InIt last, OutIt out_it) { - while (first != last) { - char32_t value = *first++; - - if (test_is_low_surrogate(value)) [[unlikely]] { - test_encode(out_it, static_cast(test_replacement_character)); - continue; - } - - if (!test_is_high_surrogate(value)) { - test_encode(out_it, static_cast(value)); - continue; - } - - if (first == last || !test_is_low_surrogate(static_cast(*first))) [[unlikely]] { - test_encode(out_it, static_cast(test_replacement_character)); - continue; - } - - value -= 0xd800; - value <<= 10; - value += static_cast(*first++) - 0xdc00; - value += 0x10000; - - if (test_is_code_point(value)) [[likely]] - test_encode(out_it, value); - else - test_encode(out_it, static_cast(test_replacement_character)); - } - - return out_it; -} - -template - requires(std::output_iterator && - (std::same_as, char32_t> || -# ifndef TEST_HAS_NO_WIDE_CHARACTERS - (std::same_as, wchar_t> && sizeof(wchar_t) == 4)) -# endif - ) -OutIt test_transcode(InIt first, InIt last, OutIt out_it) { - while (first != last) { - char32_t value = *first++; - if (test_is_code_point(value)) [[likely]] - test_encode(out_it, value); - else - test_encode(out_it, static_cast(test_replacement_character)); - } - return out_it; -} - -template -concept test_streamable = requires(std::stringstream& stream, T&& value) { stream << value; }; - -template -concept test_convertable_range = (!test_streamable && requires(R&& value) { - std::basic_string_view{std::begin(value), std::end(value)}; -}); - template -concept test_can_concat = test_streamable || test_convertable_range; - -template -std::ostream& test_concat(std::ostream& stream, T&& value) { - return stream << value; -} - -template -std::ostream& test_concat(std::ostream& stream, T&& value) { - auto b = std::begin(value); - auto e = std::end(value); - if (b != e) { - // When T is an array it's string-literal, remove the NUL terminator. - if constexpr (std::is_array_v>) { - --e; - } - test_transcode(b, e, std::ostream_iterator{stream}); - } - return stream; -} -# endif // TEST_HAS_NO_LOCALIZATION +concept test_char_streamable = requires(T&& value) { std::stringstream{} << std::forward(value); }; +# endif // If possible concatenates message for the assertion function, else returns a // default message. Not being able to stream is not considered an error. For @@ -176,12 +37,12 @@ std::ostream& test_concat(std::ostream& stream, T&& value) { template std::string test_concat_message([[maybe_unused]] Args&&... 
args) { # ifndef TEST_HAS_NO_LOCALIZATION - if constexpr ((test_can_concat && ...)) { + if constexpr ((test_char_streamable && ...)) { std::stringstream sstr; - ((test_concat(sstr, std::forward(args))), ...); + ((sstr << std::forward(args)), ...); return sstr.str(); } else -# endif // TEST_HAS_NO_LOCALIZATION +# endif return "Message discarded since it can't be streamed to std::cerr.\n"; } diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py index 5e854917e6ef4..ccabb48833f10 100644 --- a/libcxx/utils/libcxx/test/features.py +++ b/libcxx/utils/libcxx/test/features.py @@ -198,18 +198,6 @@ def _getAndroidDeviceApi(cfg): """, ), ), - # Tests that require 64-bit architecture - Feature( - name="32-bit-pointer", - when=lambda cfg: sourceBuilds( - cfg, - """ - int main(int, char**) { - static_assert(sizeof(void *) == 4); - } - """, - ), - ), # TODO: Remove this feature once compiler-rt includes __atomic_is_lockfree() # on all supported platforms. Feature( @@ -579,4 +567,16 @@ def check_gdb(cfg): cfg.available_features, ), ), + # Tests that require 64-bit architecture + Feature( + name="32-bit-pointer", + when=lambda cfg: sourceBuilds( + cfg, + """ + int main(int, char**) { + static_assert(sizeof(void *) == 4); + } + """, + ), + ), ] diff --git a/lld/test/ELF/emulation-amdgpu.s b/lld/test/ELF/emulation-amdgpu.s index 2d9ae52548b41..707f0aeb909ef 100644 --- a/lld/test/ELF/emulation-amdgpu.s +++ b/lld/test/ELF/emulation-amdgpu.s @@ -1,16 +1,10 @@ # REQUIRES: amdgpu -# RUN: llvm-mc -filetype=obj -triple=amdgcn-amd-amdhsa --amdhsa-code-object-version=4 %s -o %t.o +# RUN: llvm-mc -filetype=obj -triple=amdgcn-amd-amdhsa %s -o %t.o # RUN: ld.lld %t.o -o %t -# RUN: llvm-readobj --file-headers %t | FileCheck --check-prefixes=CHECK,HSA4 %s +# RUN: llvm-readobj --file-headers %t | FileCheck %s # RUN: ld.lld -m elf64_amdgpu %t.o -o %t -# RUN: llvm-readobj --file-headers %t | FileCheck --check-prefixes=CHECK,HSA4 %s - -# RUN: llvm-mc -filetype=obj -triple=amdgcn-amd-amdhsa --amdhsa-code-object-version=5 %s -o %t.o -# RUN: ld.lld %t.o -o %t -# RUN: llvm-readobj --file-headers %t | FileCheck --check-prefixes=CHECK,HSA5 %s -# RUN: ld.lld -m elf64_amdgpu %t.o -o %t -# RUN: llvm-readobj --file-headers %t | FileCheck --check-prefixes=CHECK,HSA5 %s +# RUN: llvm-readobj --file-headers %t | FileCheck %s # CHECK: ElfHeader { # CHECK-NEXT: Ident { @@ -19,9 +13,8 @@ # CHECK-NEXT: DataEncoding: LittleEndian (0x1) # CHECK-NEXT: FileVersion: 1 # CHECK-NEXT: OS/ABI: AMDGPU_HSA (0x40) -# HSA4: ABIVersion: 2 -# HSA5: ABIVersion: 3 -# CHECK: Unused: (00 00 00 00 00 00 00) +# CHECK-NEXT: ABIVersion: 2 +# CHECK-NEXT: Unused: (00 00 00 00 00 00 00) # CHECK-NEXT: } # CHECK-NEXT: Type: Executable (0x2) # CHECK-NEXT: Machine: EM_AMDGPU (0xE0) diff --git a/lldb/include/lldb/Core/Address.h b/lldb/include/lldb/Core/Address.h index 725b5d9f91d3d..b19e694427546 100644 --- a/lldb/include/lldb/Core/Address.h +++ b/lldb/include/lldb/Core/Address.h @@ -14,8 +14,6 @@ #include "lldb/lldb-private-enumerations.h" #include "lldb/lldb-types.h" -#include "llvm/ADT/StringRef.h" - #include #include @@ -239,12 +237,6 @@ class Address { /// contains the address, otherwise dumping the range that contains the /// address. /// - /// \param[in] pattern - /// An optional regex pattern to match against the description. If - /// specified, parts of the description matching this pattern may be - /// highlighted or processed differently. If this parameter is an empty - /// string or not provided, no highlighting is applied. 
- /// /// \return /// Returns \b true if the address was able to be displayed. /// File and load addresses may be unresolved and it may not be @@ -254,8 +246,8 @@ class Address { /// \see Address::DumpStyle bool Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, DumpStyle fallback_style = DumpStyleInvalid, - uint32_t addr_byte_size = UINT32_MAX, bool all_ranges = false, - llvm::StringRef pattern = "") const; + uint32_t addr_byte_size = UINT32_MAX, + bool all_ranges = false) const; AddressClass GetAddressClass() const; diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h index c6d603ca5dcde..e4ee94809cf1a 100644 --- a/lldb/include/lldb/Core/Debugger.h +++ b/lldb/include/lldb/Core/Debugger.h @@ -321,10 +321,6 @@ class Debugger : public std::enable_shared_from_this, llvm::StringRef GetAutosuggestionAnsiSuffix() const; - llvm::StringRef GetRegexMatchAnsiPrefix() const; - - llvm::StringRef GetRegexMatchAnsiSuffix() const; - bool GetShowDontUsePoHint() const; bool GetUseSourceCache() const; diff --git a/lldb/include/lldb/Symbol/Symbol.h b/lldb/include/lldb/Symbol/Symbol.h index e6c0b495bcf28..44a2d560010fe 100644 --- a/lldb/include/lldb/Symbol/Symbol.h +++ b/lldb/include/lldb/Symbol/Symbol.h @@ -174,8 +174,8 @@ class Symbol : public SymbolContextScope { void SetFlags(uint32_t flags) { m_flags = flags; } - void GetDescription(Stream *s, lldb::DescriptionLevel level, Target *target, - llvm::StringRef pattern = "") const; + void GetDescription(Stream *s, lldb::DescriptionLevel level, + Target *target) const; bool IsSynthetic() const { return m_is_synthetic; } diff --git a/lldb/include/lldb/Symbol/SymbolContext.h b/lldb/include/lldb/Symbol/SymbolContext.h index 26f3bac09a962..b0f5ffead2a16 100644 --- a/lldb/include/lldb/Symbol/SymbolContext.h +++ b/lldb/include/lldb/Symbol/SymbolContext.h @@ -145,19 +145,13 @@ class SymbolContext { /// is dumped if this flag is \b true, otherwise the line info /// of the actual inlined function is dumped. /// - /// \param[in] pattern - /// An optional regex pattern to match against the stop context - /// description. If specified, parts of the description matching this - /// pattern may be highlighted or processed differently. If this parameter - /// is an empty string or not provided, no highlighting is applied. - /// /// \return /// \b true if some text was dumped, \b false otherwise. bool DumpStopContext(Stream *s, ExecutionContextScope *exe_scope, const Address &so_addr, bool show_fullpaths, bool show_module, bool show_inlined_frames, - bool show_function_arguments, bool show_function_name, - llvm::StringRef pattern = "") const; + bool show_function_arguments, + bool show_function_name) const; /// Get the address range contained within a symbol context. /// @@ -223,8 +217,8 @@ class SymbolContext { /// The symbol that was found, or \b nullptr if none was found. const Symbol *FindBestGlobalDataSymbol(ConstString name, Status &error); - void GetDescription(Stream *s, lldb::DescriptionLevel level, Target *target, - llvm::StringRef pattern = "") const; + void GetDescription(Stream *s, lldb::DescriptionLevel level, + Target *target) const; uint32_t GetResolvedMask() const; diff --git a/lldb/include/lldb/Utility/Stream.h b/lldb/include/lldb/Utility/Stream.h index 20c55ac4597ae..1a5fd343e4df0 100644 --- a/lldb/include/lldb/Utility/Stream.h +++ b/lldb/include/lldb/Utility/Stream.h @@ -231,40 +231,6 @@ class Stream { /// The string to be output to the stream. 
size_t PutCString(llvm::StringRef cstr); - /// Output a C string to the stream with color highlighting. - /// - /// Print a C string \a text to the stream, applying color highlighting to - /// the portions of the string that match the regex pattern \a pattern. The - /// pattern is matched as many times as possible throughout the string. If \a - /// pattern is nullptr, then no highlighting is applied. - /// - /// The highlighting is applied by enclosing the matching text in ANSI color - /// codes. The \a prefix parameter specifies the ANSI code to start the color - /// (the standard value is assumed to be 'ansi.fg.red', representing red - /// foreground), and the \a suffix parameter specifies the ANSI code to end - /// the color (the standard value is assumed to be 'ansi.normal', resetting to - /// default text style). These constants should be defined appropriately in - /// your environment. - /// - /// \param[in] text - /// The string to be output to the stream. - /// - /// \param[in] pattern - /// The regex pattern to match against the \a text string. Portions of \a - /// text matching this pattern will be colorized. If this parameter is - /// nullptr, highlighting is not performed. - /// \param[in] prefix - /// The ANSI color code to start colorization. This is - /// environment-dependent. - /// \param[in] suffix - /// The ANSI color code to end colorization. This is - /// environment-dependent. - - void PutCStringColorHighlighted(llvm::StringRef text, - llvm::StringRef pattern = "", - llvm::StringRef prefix = "", - llvm::StringRef suffix = ""); - /// Output and End of Line character to the stream. size_t EOL(); diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index 63232c221ad1d..58785cde3ec7c 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -8,7 +8,6 @@ #include "CommandObjectTarget.h" -#include "lldb/Core/Address.h" #include "lldb/Core/Debugger.h" #include "lldb/Core/IOHandler.h" #include "lldb/Core/Module.h" @@ -1533,7 +1532,7 @@ static void DumpOsoFilesTable(Stream &strm, static void DumpAddress(ExecutionContextScope *exe_scope, const Address &so_addr, bool verbose, bool all_ranges, - Stream &strm, llvm::StringRef pattern = "") { + Stream &strm) { strm.IndentMore(); strm.Indent(" Address: "); so_addr.Dump(&strm, exe_scope, Address::DumpStyleModuleWithFileAddress); @@ -1543,14 +1542,13 @@ static void DumpAddress(ExecutionContextScope *exe_scope, strm.Indent(" Summary: "); const uint32_t save_indent = strm.GetIndentLevel(); strm.SetIndentLevel(save_indent + 13); - so_addr.Dump(&strm, exe_scope, Address::DumpStyleResolvedDescription, - Address::DumpStyleInvalid, UINT32_MAX, false, pattern); + so_addr.Dump(&strm, exe_scope, Address::DumpStyleResolvedDescription); strm.SetIndentLevel(save_indent); // Print out detailed address information when verbose is enabled if (verbose) { strm.EOL(); so_addr.Dump(&strm, exe_scope, Address::DumpStyleDetailedSymbolContext, - Address::DumpStyleInvalid, UINT32_MAX, all_ranges, pattern); + Address::DumpStyleInvalid, UINT32_MAX, all_ranges); } strm.IndentLess(); } @@ -1595,7 +1593,6 @@ static uint32_t LookupSymbolInModule(CommandInterpreter &interpreter, return 0; SymbolContext sc; - const bool use_color = interpreter.GetDebugger().GetUseColor(); std::vector match_indexes; ConstString symbol_name(name); uint32_t num_matches = 0; @@ -1621,19 +1618,12 @@ static uint32_t LookupSymbolInModule(CommandInterpreter &interpreter, if 
(symbol->ValueIsAddress()) { DumpAddress( interpreter.GetExecutionContext().GetBestExecutionContextScope(), - symbol->GetAddressRef(), verbose, all_ranges, strm, - use_color && name_is_regex ? name : nullptr); + symbol->GetAddressRef(), verbose, all_ranges, strm); strm.EOL(); } else { strm.IndentMore(); strm.Indent(" Name: "); - llvm::StringRef ansi_prefix = - interpreter.GetDebugger().GetRegexMatchAnsiPrefix(); - llvm::StringRef ansi_suffix = - interpreter.GetDebugger().GetRegexMatchAnsiSuffix(); - strm.PutCStringColorHighlighted( - symbol->GetDisplayName().GetStringRef(), - use_color ? name : nullptr, ansi_prefix, ansi_suffix); + strm.PutCString(symbol->GetDisplayName().GetStringRef()); strm.EOL(); strm.Indent(" Value: "); strm.Printf("0x%16.16" PRIx64 "\n", symbol->GetRawValue()); diff --git a/lldb/source/Core/Address.cpp b/lldb/source/Core/Address.cpp index 19d34db44ea55..189d50fe962a6 100644 --- a/lldb/source/Core/Address.cpp +++ b/lldb/source/Core/Address.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "lldb/Core/Address.h" -#include "lldb/Core/Debugger.h" #include "lldb/Core/Declaration.h" #include "lldb/Core/DumpDataExtractor.h" #include "lldb/Core/Module.h" @@ -29,7 +28,6 @@ #include "lldb/Target/Process.h" #include "lldb/Target/SectionLoadList.h" #include "lldb/Target/Target.h" -#include "lldb/Utility/AnsiTerminal.h" #include "lldb/Utility/ConstString.h" #include "lldb/Utility/DataExtractor.h" #include "lldb/Utility/Endian.h" @@ -407,7 +405,7 @@ bool Address::GetDescription(Stream &s, Target &target, bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, DumpStyle fallback_style, uint32_t addr_size, - bool all_ranges, llvm::StringRef pattern) const { + bool all_ranges) const { // If the section was nullptr, only load address is going to work unless we // are trying to deref a pointer SectionSP section_sp(GetSection()); @@ -503,6 +501,7 @@ bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, pointer_size = target->GetArchitecture().GetAddressByteSize(); else if (module_sp) pointer_size = module_sp->GetArchitecture().GetAddressByteSize(); + bool showed_info = false; if (section_sp) { SectionType sect_type = section_sp->GetType(); @@ -516,16 +515,7 @@ bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, if (symbol) { const char *symbol_name = symbol->GetName().AsCString(); if (symbol_name) { - llvm::StringRef ansi_prefix; - llvm::StringRef ansi_suffix; - if (target) { - ansi_prefix = - target->GetDebugger().GetRegexMatchAnsiPrefix(); - ansi_suffix = - target->GetDebugger().GetRegexMatchAnsiSuffix(); - } - s->PutCStringColorHighlighted(symbol_name, pattern, - ansi_prefix, ansi_suffix); + s->PutCString(symbol_name); addr_t delta = file_Addr - symbol->GetAddressRef().GetFileAddress(); if (delta) @@ -653,7 +643,7 @@ bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, pointer_sc.symbol != nullptr) { s->PutCString(": "); pointer_sc.DumpStopContext(s, exe_scope, so_addr, true, false, - false, true, true, pattern); + false, true, true); } } } @@ -692,22 +682,19 @@ bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, // address. 
sc.DumpStopContext(s, exe_scope, *this, show_fullpaths, show_module, show_inlined_frames, - show_function_arguments, show_function_name, - pattern); + show_function_arguments, show_function_name); } else { // We found a symbol but it was in a different section so it // isn't the symbol we should be showing, just show the section // name + offset - Dump(s, exe_scope, DumpStyleSectionNameOffset, DumpStyleInvalid, - UINT32_MAX, false, pattern); + Dump(s, exe_scope, DumpStyleSectionNameOffset); } } } } } else { if (fallback_style != DumpStyleInvalid) - return Dump(s, exe_scope, fallback_style, DumpStyleInvalid, addr_size, - false, pattern); + return Dump(s, exe_scope, fallback_style, DumpStyleInvalid, addr_size); return false; } break; @@ -728,7 +715,7 @@ bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, sc.symbol->GetAddressRef().GetSection() != GetSection()) sc.symbol = nullptr; } - sc.GetDescription(s, eDescriptionLevelBrief, target, pattern); + sc.GetDescription(s, eDescriptionLevelBrief, target); if (sc.block) { bool can_create = true; @@ -776,8 +763,7 @@ bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, } } else { if (fallback_style != DumpStyleInvalid) - return Dump(s, exe_scope, fallback_style, DumpStyleInvalid, addr_size, - false, pattern); + return Dump(s, exe_scope, fallback_style, DumpStyleInvalid, addr_size); return false; } break; diff --git a/lldb/source/Core/CoreProperties.td b/lldb/source/Core/CoreProperties.td index 8d81967bdb50a..0e0f468d3ecd7 100644 --- a/lldb/source/Core/CoreProperties.td +++ b/lldb/source/Core/CoreProperties.td @@ -203,14 +203,6 @@ let Definition = "debugger" in { Global, DefaultStringValue<"${ansi.normal}">, Desc<"When displaying suggestion in a color-enabled terminal, use the ANSI terminal code specified in this format immediately after the suggestion.">; - def ShowRegexMatchAnsiPrefix: Property<"show-regex-match-ansi-prefix", "String">, - Global, - DefaultStringValue<"${ansi.fg.red}">, - Desc<"When displaying a regex match in a color-enabled terminal, use the ANSI terminal code specified in this format immediately before the match.">; - def ShowRegexMatchAnsiSuffix: Property<"show-regex-match-ansi-suffix", "String">, - Global, - DefaultStringValue<"${ansi.normal}">, - Desc<"When displaying a regex match in a color-enabled terminal, use the ANSI terminal code specified in this format immediately after the match.">; def ShowDontUsePoHint: Property<"show-dont-use-po-hint", "Boolean">, Global, DefaultTrue, diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 97311b4716ac2..398265dca1cb7 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -453,18 +453,6 @@ llvm::StringRef Debugger::GetAutosuggestionAnsiSuffix() const { idx, g_debugger_properties[idx].default_cstr_value); } -llvm::StringRef Debugger::GetRegexMatchAnsiPrefix() const { - const uint32_t idx = ePropertyShowRegexMatchAnsiPrefix; - return GetPropertyAtIndexAs( - idx, g_debugger_properties[idx].default_cstr_value); -} - -llvm::StringRef Debugger::GetRegexMatchAnsiSuffix() const { - const uint32_t idx = ePropertyShowRegexMatchAnsiSuffix; - return GetPropertyAtIndexAs( - idx, g_debugger_properties[idx].default_cstr_value); -} - bool Debugger::GetShowDontUsePoHint() const { const uint32_t idx = ePropertyShowDontUsePoHint; return GetPropertyAtIndexAs( diff --git a/lldb/source/Symbol/Symbol.cpp b/lldb/source/Symbol/Symbol.cpp index fcc45f861c225..26b4c4d62ad9c 100644 --- 
a/lldb/source/Symbol/Symbol.cpp +++ b/lldb/source/Symbol/Symbol.cpp @@ -8,8 +8,6 @@ #include "lldb/Symbol/Symbol.h" -#include "lldb/Core/Address.h" -#include "lldb/Core/Debugger.h" #include "lldb/Core/Module.h" #include "lldb/Core/ModuleSpec.h" #include "lldb/Core/Section.h" @@ -227,7 +225,7 @@ bool Symbol::IsTrampoline() const { return m_type == eSymbolTypeTrampoline; } bool Symbol::IsIndirect() const { return m_type == eSymbolTypeResolver; } void Symbol::GetDescription(Stream *s, lldb::DescriptionLevel level, - Target *target, llvm::StringRef pattern) const { + Target *target) const { s->Printf("id = {0x%8.8x}", m_uid); if (m_addr_range.GetBaseAddress().GetSection()) { @@ -254,24 +252,11 @@ void Symbol::GetDescription(Stream *s, lldb::DescriptionLevel level, s->Printf(", value = 0x%16.16" PRIx64, m_addr_range.GetBaseAddress().GetOffset()); } - llvm::StringRef ansi_prefix; - llvm::StringRef ansi_suffix; - if (target) { - ansi_prefix = target->GetDebugger().GetRegexMatchAnsiPrefix(); - ansi_suffix = target->GetDebugger().GetRegexMatchAnsiSuffix(); - } - if (ConstString demangled = m_mangled.GetDemangledName()) { - s->PutCString(", name=\""); - s->PutCStringColorHighlighted(demangled.GetStringRef(), pattern, - ansi_prefix, ansi_suffix); - s->PutCString("\""); - } - if (ConstString mangled_name = m_mangled.GetMangledName()) { - s->PutCString(", mangled=\""); - s->PutCStringColorHighlighted(mangled_name.GetStringRef(), pattern, - ansi_prefix, ansi_suffix); - s->PutCString("\""); - } + ConstString demangled = GetMangled().GetDemangledName(); + if (demangled) + s->Printf(", name=\"%s\"", demangled.AsCString()); + if (m_mangled.GetMangledName()) + s->Printf(", mangled=\"%s\"", m_mangled.GetMangledName().AsCString()); } void Symbol::Dump(Stream *s, Target *target, uint32_t index, diff --git a/lldb/source/Symbol/SymbolContext.cpp b/lldb/source/Symbol/SymbolContext.cpp index 9fd40b5ca567f..63968ec2d1506 100644 --- a/lldb/source/Symbol/SymbolContext.cpp +++ b/lldb/source/Symbol/SymbolContext.cpp @@ -8,7 +8,6 @@ #include "lldb/Symbol/SymbolContext.h" -#include "lldb/Core/Address.h" #include "lldb/Core/Debugger.h" #include "lldb/Core/Module.h" #include "lldb/Core/ModuleSpec.h" @@ -72,8 +71,7 @@ bool SymbolContext::DumpStopContext(Stream *s, ExecutionContextScope *exe_scope, const Address &addr, bool show_fullpaths, bool show_module, bool show_inlined_frames, bool show_function_arguments, - bool show_function_name, - llvm::StringRef pattern) const { + bool show_function_name) const { bool dumped_something = false; if (show_module && module_sp) { if (show_fullpaths) @@ -83,6 +81,7 @@ bool SymbolContext::DumpStopContext(Stream *s, ExecutionContextScope *exe_scope, s->PutChar('`'); dumped_something = true; } + if (function != nullptr) { SymbolContext inline_parent_sc; Address inline_parent_addr; @@ -95,16 +94,8 @@ bool SymbolContext::DumpStopContext(Stream *s, ExecutionContextScope *exe_scope, name = function->GetNameNoArguments(); if (!name) name = function->GetName(); - if (name) { - llvm::StringRef ansi_prefix; - llvm::StringRef ansi_suffix; - if (target_sp) { - ansi_prefix = target_sp->GetDebugger().GetRegexMatchAnsiPrefix(); - ansi_suffix = target_sp->GetDebugger().GetRegexMatchAnsiSuffix(); - } - s->PutCStringColorHighlighted(name.GetStringRef(), pattern, ansi_prefix, - ansi_suffix); - } + if (name) + name.Dump(s); } if (addr.IsValid()) { @@ -172,14 +163,7 @@ bool SymbolContext::DumpStopContext(Stream *s, ExecutionContextScope *exe_scope, dumped_something = true; if (symbol->GetType() == 
eSymbolTypeTrampoline) s->PutCString("symbol stub for: "); - llvm::StringRef ansi_prefix; - llvm::StringRef ansi_suffix; - if (target_sp) { - ansi_prefix = target_sp->GetDebugger().GetRegexMatchAnsiPrefix(); - ansi_suffix = target_sp->GetDebugger().GetRegexMatchAnsiSuffix(); - } - s->PutCStringColorHighlighted(symbol->GetName().GetStringRef(), pattern, - ansi_prefix, ansi_suffix); + symbol->GetName().Dump(s); } if (addr.IsValid() && symbol->ValueIsAddress()) { @@ -202,8 +186,7 @@ bool SymbolContext::DumpStopContext(Stream *s, ExecutionContextScope *exe_scope, } void SymbolContext::GetDescription(Stream *s, lldb::DescriptionLevel level, - Target *target, - llvm::StringRef pattern) const { + Target *target) const { if (module_sp) { s->Indent(" Module: file = \""); module_sp->GetFileSpec().Dump(s->AsRawOstream()); @@ -263,7 +246,7 @@ void SymbolContext::GetDescription(Stream *s, lldb::DescriptionLevel level, if (symbol != nullptr) { s->Indent(" Symbol: "); - symbol->GetDescription(s, level, target, pattern); + symbol->GetDescription(s, level, target); s->EOL(); } diff --git a/lldb/source/Symbol/Variable.cpp b/lldb/source/Symbol/Variable.cpp index db740cb7cb6e4..85ceadd20c611 100644 --- a/lldb/source/Symbol/Variable.cpp +++ b/lldb/source/Symbol/Variable.cpp @@ -227,8 +227,7 @@ bool Variable::LocationIsValidForFrame(StackFrame *frame) { // contains the current address when converted to a load address return m_location_list.ContainsAddress( loclist_base_load_addr, - frame->GetFrameCodeAddressForSymbolication().GetLoadAddress( - target_sp.get())); + frame->GetFrameCodeAddress().GetLoadAddress(target_sp.get())); } } return false; diff --git a/lldb/source/Utility/Stream.cpp b/lldb/source/Utility/Stream.cpp index 62e061e9d09c0..af28a49a1f0c2 100644 --- a/lldb/source/Utility/Stream.cpp +++ b/lldb/source/Utility/Stream.cpp @@ -8,13 +8,11 @@ #include "lldb/Utility/Stream.h" -#include "lldb/Utility/AnsiTerminal.h" #include "lldb/Utility/Endian.h" #include "lldb/Utility/VASPrintf.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/Format.h" #include "llvm/Support/LEB128.h" -#include "llvm/Support/Regex.h" #include @@ -72,34 +70,6 @@ size_t Stream::PutCString(llvm::StringRef str) { return bytes_written; } -void Stream::PutCStringColorHighlighted(llvm::StringRef text, - llvm::StringRef pattern, - llvm::StringRef prefix, - llvm::StringRef suffix) { - // Only apply color formatting when a pattern is present and both prefix and - // suffix are specified. In the absence of these conditions, output the text - // without color formatting. - if (pattern.empty() || (prefix.empty() && suffix.empty())) { - PutCString(text); - return; - } - - llvm::Regex reg_pattern(pattern); - llvm::SmallVector matches; - llvm::StringRef remaining = text; - std::string format_str = lldb_private::ansi::FormatAnsiTerminalCodes( - prefix.str() + "%.*s" + suffix.str()); - while (reg_pattern.match(remaining, &matches)) { - llvm::StringRef match = matches[0]; - size_t match_start_pos = match.data() - remaining.data(); - PutCString(remaining.take_front(match_start_pos)); - Printf(format_str.c_str(), match.size(), match.data()); - remaining = remaining.drop_front(match_start_pos + match.size()); - } - if (remaining.size()) - PutCString(remaining); -} - // Print a double quoted NULL terminated C string to the stream using the // printf format in "format". 
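// [Editor's aside, not part of the patch] The removed
// PutCStringColorHighlighted above is an instance of a common highlighting
// pattern: repeatedly regex-match, print the text before the match, print the
// match wrapped in ANSI codes, then continue with the remainder. A minimal
// sketch with a hypothetical helper name (hard-coded red/reset instead of
// lldb's configurable prefix/suffix):
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Regex.h"
#include "llvm/Support/raw_ostream.h"
static void highlightMatches(llvm::raw_ostream &OS, llvm::StringRef Text,
                             llvm::StringRef Pattern) {
  llvm::Regex Re(Pattern);
  llvm::SmallVector<llvm::StringRef, 1> Matches;
  while (Re.match(Text, &Matches)) {
    size_t Pos = Matches[0].data() - Text.data(); // offset of this match
    OS << Text.take_front(Pos) << "\x1b[31m" << Matches[0] << "\x1b[0m";
    Text = Text.drop_front(Pos + Matches[0].size());
  }
  OS << Text; // trailing unmatched text
}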
void Stream::QuotedCString(const char *cstr, const char *format) { diff --git a/lldb/test/API/functionalities/location-list-lookup/Makefile b/lldb/test/API/functionalities/location-list-lookup/Makefile index 8e453681d7b39..78b0b11cb7484 100644 --- a/lldb/test/API/functionalities/location-list-lookup/Makefile +++ b/lldb/test/API/functionalities/location-list-lookup/Makefile @@ -1,3 +1,3 @@ -CXX_SOURCES := main.cpp +C_SOURCES := main.c CFLAGS_EXTRAS := -O1 include Makefile.rules diff --git a/lldb/test/API/functionalities/location-list-lookup/TestLocationListLookup.py b/lldb/test/API/functionalities/location-list-lookup/TestLocationListLookup.py index ccee3bfde3f5d..4793447c59413 100644 --- a/lldb/test/API/functionalities/location-list-lookup/TestLocationListLookup.py +++ b/lldb/test/API/functionalities/location-list-lookup/TestLocationListLookup.py @@ -7,8 +7,16 @@ class LocationListLookupTestCase(TestBase): - def launch(self) -> lldb.SBProcess: + def setUp(self): + # Call super's setUp(). + TestBase.setUp(self) + + @skipIf(oslist=["linux"], archs=["arm"]) + def test_loclist(self): + self.build() exe = self.getBuildArtifact("a.out") + + # Create a target by the debugger. target = self.dbg.CreateTarget(exe) self.assertTrue(target, VALID_TARGET) self.dbg.SetAsync(False) @@ -19,31 +27,12 @@ def launch(self) -> lldb.SBProcess: self.assertTrue(process.IsValid()) self.assertTrue(process.is_stopped) - return process - - def check_local_vars(self, process: lldb.SBProcess, check_expr: bool): - # Find `bar` on the stack, then - # make sure we can read out the local - # variables (with both `frame var` and `expr`) + # Find `main` on the stack, then + # find `argv` local variable, then + # check that we can read the c-string in argv[0] for f in process.GetSelectedThread().frames: - frame_name = f.GetDisplayFunctionName() - if frame_name is not None and frame_name.startswith("Foo::bar"): + if f.GetDisplayFunctionName() == "main": argv = f.GetValueForVariablePath("argv").GetChildAtIndex(0) strm = lldb.SBStream() argv.GetDescription(strm) self.assertNotEqual(strm.GetData().find("a.out"), -1) - - if check_expr: - process.GetSelectedThread().SetSelectedFrame(f.idx) - self.expect_expr("this", result_type="Foo *") - - @skipIf(oslist=["linux"], archs=["arm"]) - @skipIfDarwin - def test_loclist_frame_var(self): - self.build() - self.check_local_vars(self.launch(), check_expr=False) - - @skipUnlessDarwin - def test_loclist_expr(self): - self.build() - self.check_local_vars(self.launch(), check_expr=True) diff --git a/lldb/test/API/functionalities/location-list-lookup/main.c b/lldb/test/API/functionalities/location-list-lookup/main.c new file mode 100644 index 0000000000000..852772ee52ca2 --- /dev/null +++ b/lldb/test/API/functionalities/location-list-lookup/main.c @@ -0,0 +1,23 @@ +#include +#include + +// The goal with this test is: +// 1. Have main() followed by foo() +// 2. Have the no-return call to abort() in main be the last instruction +// 3. Have the next instruction be the start of foo() +// 4. The debug info for argv uses a location list. 
+// clang at -O1 on x86_64 or arm64 has debuginfo like
+//     DW_AT_location (0x00000049:
+//        [0x0000000100003f15, 0x0000000100003f25): DW_OP_reg4 RSI
+//        [0x0000000100003f25, 0x0000000100003f5b): DW_OP_reg15 R15)
+
+void foo(int);
+int main(int argc, char **argv) {
+  char *file = argv[0];
+  char f0 = file[0];
+  printf("%c\n", f0);
+  foo(f0);
+  printf("%s %d\n", argv[0], argc);
+  abort(); /// argv should still be accessible here
+}
+void foo(int in) { printf("%d\n", in); }
diff --git a/lldb/test/API/functionalities/location-list-lookup/main.cpp b/lldb/test/API/functionalities/location-list-lookup/main.cpp
deleted file mode 100644
index 4ccdadbddbb55..0000000000000
--- a/lldb/test/API/functionalities/location-list-lookup/main.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-#include <cstdio>
-#include <cstdlib>
-
-void func(int in);
-
-struct Foo {
-  int x;
-  [[clang::noinline]] void bar(char **argv);
-};
-
-int main(int argc, char **argv) {
-  Foo f{.x = 5};
-  std::printf("%p\n", &f.x);
-  f.bar(argv);
-  return f.x;
-}
-
-void Foo::bar(char **argv) {
-  std::printf("%p %p\n", argv, this);
-  std::abort(); /// 'this' should be still accessible
-}
-
-void func(int in) { printf("%d\n", in); }
diff --git a/lldb/test/Shell/Commands/command-image-lookup-color.test b/lldb/test/Shell/Commands/command-image-lookup-color.test
deleted file mode 100644
index 186526b7efaee..0000000000000
--- a/lldb/test/Shell/Commands/command-image-lookup-color.test
+++ /dev/null
@@ -1,60 +0,0 @@
-# RUN: %clang_host -g %S/Inputs/main.c -o %t
-
-# The file main.c is does not show up in search on Windows.
-# UNSUPPORTED: system-windows
-
-# Until we figure out the correct CHECK lines.
-# UNSUPPORTED: system-darwin
-
-# Checking simple regex search
-
-# RUN: %lldb %t -b -o 'settings set use-color true' -o 'image lookup -r -s ma' | FileCheck %s --check-prefix CHECK1
-# CHECK1: Name: {{.+}}31mma{{.+}}0min.c
-
-# Checking complex regex searches
-
-# RUN: %lldb %t -b -o 'settings set use-color true' -o 'image lookup -r -s main.c|foo' | FileCheck %s --check-prefix CHECK2
-# CHECK2: Name: {{.+}}31mmain.c{{.+}}0m
-
-# RUN: %lldb %t -b -o 'settings set use-color true' -o 'image lookup -r -s m[abc]' | FileCheck %s --check-prefix CHECK3
-# CHECK3: Name: {{.+}}31mma{{.+}}0min.c
-
-# Checking to ensure that no attempt is made to color anything when there are no matching symbols found
-
-# RUN: %lldb %t -o 'settings set use-color true' -o 'image lookup -r -s IMPPATTERN123456' | FileCheck %s --check-prefix CHECK4
-# CHECK4-NOT: {{[0-9]+}} symbols match the regular expression
-
-# Checking multiple matches on same symbol
-
-# RUN: %lldb %t -b -o 'settings set use-color true' -o 'image lookup -r -s (ma|n)' | FileCheck %s --check-prefix CHECK5
-# CHECK5: Name: {{.+}}31mma{{.+}}0mi{{.+}}31mn{{.+}}0m.c
-
-# Checking no colorization without regex search
-
-# RUN: %lldb %t -b -o 'settings set use-color true' -o 'image lookup -s main' | FileCheck %s --check-prefix CHECK6
-# CHECK6: Summary: {{.+}}`main at main.c:
-
-# Checking no colorization when use-color is false
-
-# RUN: %lldb %t -b -o 'settings set use-color false' -o 'image lookup -r -s ma' | FileCheck %s --check-prefix CHECK7
-# CHECK7: Name: main.c
-
-# Checking for custom colors
-
-# RUN: %lldb %t -b -o 'settings set use-color true' -o 'settings set show-regex-match-ansi-prefix ${ansi.fg.green}' -o 'image lookup -r -s ma' | FileCheck %s --check-prefix CHECK8
-# CHECK8: Name: {{.+}}32mma{{.+}}0min.c
-
-# Checking for functionality when there's prefix but no suffix
-
-# RUN: %lldb %t -b -o 'settings set use-color true' -o
'settings set show-regex-match-ansi-prefix ${ansi.fg.red}' -o 'settings set show-regex-match-ansi-suffix ""' -o 'image lookup -r -s ma' | FileCheck %s --check-prefix CHECK9 -# CHECK9: Name: {{.+}}31mmain.c - -# Checking for functionality when there's suffix but no prefix - -# RUN: %lldb %t -b -o 'settings set use-color true' -o 'settings set show-regex-match-ansi-prefix ""' -o 'settings set show-regex-match-ansi-suffix ${ansi.fg.red}' -o 'image lookup -r -s ma' | FileCheck %s --check-prefix CHECK10 -# CHECK10: Name: ma{{.+}}31min.c - -# Checking for no colorization when there's neither suffix nor prefix - -# RUN: %lldb %t -b -o 'settings set use-color true' -o 'settings set show-regex-match-ansi-prefix ""' -o 'settings set show-regex-match-ansi-suffix ""' -o 'image lookup -r -s ma' | FileCheck %s --check-prefix CHECK11 -# CHECK11: Name: main.c diff --git a/llvm/docs/MIRLangRef.rst b/llvm/docs/MIRLangRef.rst index 52ff24daa7fba..223701599ebb8 100644 --- a/llvm/docs/MIRLangRef.rst +++ b/llvm/docs/MIRLangRef.rst @@ -58,12 +58,12 @@ for the post register allocation pseudo instruction expansion pass, you can specify the machine copy propagation pass in the ``-stop-after`` option, as it runs just before the pass that we are trying to test: - ``llc -stop-after=machine-cp bug-trigger.ll -o test.mir`` + ``llc -stop-after=machine-cp bug-trigger.ll > test.mir`` If the same pass is run multiple times, a run index can be included after the name with a comma. - ``llc -stop-after=dead-mi-elimination,1 bug-trigger.ll -o test.mir`` + ``llc -stop-after=dead-mi-elimination,1 bug-trigger.ll > test.mir`` After generating the input MIR file, you'll have to add a run line that uses the ``-run-pass`` option to it. In order to test the post register allocation diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index fcb671cf1dfa2..f58ae03a6efcf 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -92,8 +92,6 @@ Changes to Interprocedural Optimizations Changes to the AArch64 Backend ------------------------------ -* Added support for Cortex-A520, Cortex-A720 and Cortex-X4 CPUs. - Changes to the AMDGPU Backend ----------------------------- @@ -104,11 +102,11 @@ Changes to the AMDGPU Backend * Implemented :ref:`llvm.get.rounding ` +* Added support for Cortex-A520, Cortex-A720 and Cortex-X4 CPUs. + Changes to the ARM Backend -------------------------- -* Added support for Cortex-M52 CPUs. - Changes to the AVR Backend -------------------------- diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h b/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h index fd884f2a2f55b..91848a91c17e6 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h +++ b/llvm/include/llvm/Analysis/ScalarEvolutionExpressions.h @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/Constants.h" #include "llvm/IR/ValueHandle.h" diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h index a1f330acb1c23..34bf249cc6d1e 100644 --- a/llvm/include/llvm/AsmParser/LLParser.h +++ b/llvm/include/llvm/AsmParser/LLParser.h @@ -51,22 +51,15 @@ namespace llvm { /// or a symbolic (%var) reference. This is just a discriminated union. struct ValID { enum { - t_LocalID, // ID in UIntVal. - t_GlobalID, // ID in UIntVal. - t_LocalName, // Name in StrVal. - t_GlobalName, // Name in StrVal. 
- t_APSInt, // Value in APSIntVal. - t_APFloat, // Value in APFloatVal. - t_Null, // No value. - t_Undef, // No value. - t_Zero, // No value. - t_None, // No value. - t_Poison, // No value. - t_EmptyArray, // No value: [] - t_Constant, // Value in ConstantVal. - t_InlineAsm, // Value in FTy/StrVal/StrVal2/UIntVal. - t_ConstantStruct, // Value in ConstantStructElts. - t_PackedConstantStruct // Value in ConstantStructElts. + t_LocalID, t_GlobalID, // ID in UIntVal. + t_LocalName, t_GlobalName, // Name in StrVal. + t_APSInt, t_APFloat, // Value in APSIntVal/APFloatVal. + t_Null, t_Undef, t_Zero, t_None, t_Poison, // No value. + t_EmptyArray, // No value: [] + t_Constant, // Value in ConstantVal. + t_InlineAsm, // Value in FTy/StrVal/StrVal2/UIntVal. + t_ConstantStruct, // Value in ConstantStructElts. + t_PackedConstantStruct // Value in ConstantStructElts. } Kind = t_LocalID; LLLexer::LocTy Loc; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index 3d36d06a7e9da..e0101a5ac1ca8 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -1194,7 +1194,7 @@ class MachineIRBuilder { const SrcOp &Op0, const SrcOp &Op1, std::optional Flags = std::nullopt); - /// Build and insert a \p Res = G_IS_FPCLASS \p Src, \p Mask + /// Build and insert a \p Res = G_IS_FPCLASS \p Pred, \p Src, \p Mask MachineInstrBuilder buildIsFPClass(const DstOp &Res, const SrcOp &Src, unsigned Mask) { return buildInstr(TargetOpcode::G_IS_FPCLASS, {Res}, diff --git a/llvm/include/llvm/IR/Dominators.h b/llvm/include/llvm/IR/Dominators.h index 8784a425d2841..b6b737f34b5e3 100644 --- a/llvm/include/llvm/IR/Dominators.h +++ b/llvm/include/llvm/IR/Dominators.h @@ -16,6 +16,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Hashing.h" @@ -23,6 +24,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist_iterator.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/PassManager.h" diff --git a/llvm/include/llvm/IR/GetElementPtrTypeIterator.h b/llvm/include/llvm/IR/GetElementPtrTypeIterator.h index f3272327c3f8b..8c6ede96c873b 100644 --- a/llvm/include/llvm/IR/GetElementPtrTypeIterator.h +++ b/llvm/include/llvm/IR/GetElementPtrTypeIterator.h @@ -20,6 +20,7 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/User.h" #include "llvm/Support/Casting.h" +#include #include #include #include diff --git a/llvm/include/llvm/TargetParser/ARMTargetParser.def b/llvm/include/llvm/TargetParser/ARMTargetParser.def index 558b6f127de3f..546f6bef34a41 100644 --- a/llvm/include/llvm/TargetParser/ARMTargetParser.def +++ b/llvm/include/llvm/TargetParser/ARMTargetParser.def @@ -308,9 +308,6 @@ ARM_CPU_NAME("cortex-m55", ARMV8_1MMainline, FK_FP_ARMV8_FULLFP16_D16, false, ARM_CPU_NAME("cortex-m85", ARMV8_1MMainline, FK_FP_ARMV8_FULLFP16_D16, false, (ARM::AEK_DSP | ARM::AEK_SIMD | ARM::AEK_FP | ARM::AEK_FP16 | ARM::AEK_RAS | ARM::AEK_PACBTI)) -ARM_CPU_NAME("cortex-m52", ARMV8_1MMainline, FK_FP_ARMV8_FULLFP16_D16, false, - (ARM::AEK_DSP | ARM::AEK_SIMD | ARM::AEK_FP | ARM::AEK_FP16 | - ARM::AEK_RAS | ARM::AEK_PACBTI)) ARM_CPU_NAME("cortex-a32", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) ARM_CPU_NAME("cortex-a35", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) 
ARM_CPU_NAME("cortex-a53", ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC) diff --git a/llvm/lib/Analysis/DomConditionCache.cpp b/llvm/lib/Analysis/DomConditionCache.cpp index c7f4cab415888..351881fe9e1f9 100644 --- a/llvm/lib/Analysis/DomConditionCache.cpp +++ b/llvm/lib/Analysis/DomConditionCache.cpp @@ -36,7 +36,8 @@ static void findAffectedValues(Value *Cond, ICmpInst::Predicate Pred; Value *A; - if (match(Cond, m_ICmp(Pred, m_Value(A), m_Constant()))) { + Constant *C; + if (match(Cond, m_ICmp(Pred, m_Value(A), m_Constant(C)))) { AddAffected(A); if (ICmpInst::isEquality(Pred)) { diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 1aa324c6b5f38..46629e381bc36 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -24,6 +24,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" +#include + using namespace llvm; using namespace llvm::PatternMatch; diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 5445746ab2a1b..dfd36862b05c1 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -616,7 +616,7 @@ static bool isKnownNonZeroFromAssume(const Value *V, const SimplifyQuery &Q) { static void computeKnownBitsFromCmp(const Value *V, CmpInst::Predicate Pred, Value *LHS, Value *RHS, KnownBits &Known, - const SimplifyQuery &Q) { + unsigned Depth, const SimplifyQuery &Q) { if (RHS->getType()->isPointerTy()) { // Handle comparison of pointer to null explicitly, as it will not be // covered by the m_APInt() logic below. @@ -720,13 +720,13 @@ void llvm::computeKnownBitsFromContext(const Value *V, KnownBits &Known, BasicBlockEdge Edge0(BI->getParent(), BI->getSuccessor(0)); if (Q.DT->dominates(Edge0, Q.CxtI->getParent())) computeKnownBitsFromCmp(V, Cmp->getPredicate(), Cmp->getOperand(0), - Cmp->getOperand(1), Known, Q); + Cmp->getOperand(1), Known, Depth, Q); BasicBlockEdge Edge1(BI->getParent(), BI->getSuccessor(1)); if (Q.DT->dominates(Edge1, Q.CxtI->getParent())) computeKnownBitsFromCmp(V, Cmp->getInversePredicate(), Cmp->getOperand(0), Cmp->getOperand(1), Known, - Q); + Depth, Q); } if (Known.hasConflict()) @@ -794,7 +794,7 @@ void llvm::computeKnownBitsFromContext(const Value *V, KnownBits &Known, continue; computeKnownBitsFromCmp(V, Cmp->getPredicate(), Cmp->getOperand(0), - Cmp->getOperand(1), Known, Q); + Cmp->getOperand(1), Known, Depth, Q); } // Conflicting assumption: Undefined behavior will occur on this execution @@ -1465,10 +1465,8 @@ static void computeKnownBitsFromOperator(const Operator *I, Q.IIQ.getMetadata(cast(I), LLVMContext::MD_range)) computeKnownBitsFromRangeMetadata(*MD, Known); if (const Value *RV = cast(I)->getReturnedArgOperand()) { - if (RV->getType() == I->getType()) { - computeKnownBits(RV, Known2, Depth + 1, Q); - Known = Known.unionWith(Known2); - } + computeKnownBits(RV, Known2, Depth + 1, Q); + Known = Known.unionWith(Known2); } if (const IntrinsicInst *II = dyn_cast(I)) { switch (II->getIntrinsicID()) { @@ -2714,7 +2712,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, if (const auto *RP = getArgumentAliasingToReturnedPointer(Call, true)) return isKnownNonZero(RP, Depth, Q); } else if (const Value *RV = cast(I)->getReturnedArgOperand()) { - if (RV->getType() == I->getType() && isKnownNonZero(RV, Depth, Q)) + if (isKnownNonZero(RV, Depth, Q)) return true; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 
5be1892a44f6d..51ae8b703e50f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2881,12 +2881,6 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
     }
   }
 
-  // Fallback - this is a splat if all demanded elts are the same constant.
-  if (computeKnownBits(V, DemandedElts, Depth).isConstant()) {
-    UndefElts = ~DemandedElts;
-    return true;
-  }
-
   return false;
 }
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 95cdec722062e..bff64e3a15a24 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -3520,15 +3520,15 @@ static void printMetadataIdentifier(StringRef Name,
   if (Name.empty()) {
     Out << "<empty name> ";
   } else {
-    unsigned char FirstC = static_cast<unsigned char>(Name[0]);
-    if (isalpha(FirstC) || FirstC == '-' || FirstC == '$' || FirstC == '.' ||
-        FirstC == '_')
-      Out << FirstC;
+    if (isalpha(static_cast<unsigned char>(Name[0])) || Name[0] == '-' ||
+        Name[0] == '$' || Name[0] == '.' || Name[0] == '_')
+      Out << Name[0];
     else
-      Out << '\\' << hexdigit(FirstC >> 4) << hexdigit(FirstC & 0x0F);
+      Out << '\\' << hexdigit(Name[0] >> 4) << hexdigit(Name[0] & 0x0F);
     for (unsigned i = 1, e = Name.size(); i != e; ++i) {
       unsigned char C = Name[i];
-      if (isalnum(C) || C == '-' || C == '$' || C == '.' || C == '_')
+      if (isalnum(static_cast<unsigned char>(C)) || C == '-' || C == '$' ||
+          C == '.' || C == '_')
         Out << C;
       else
         Out << '\\' << hexdigit(C >> 4) << hexdigit(C & 0x0F);
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 645691f441794..67ee7b7d97e9a 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1293,8 +1293,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
   }
 
   auto *ST = dyn_cast<StructType>(F->getReturnType());
-  if (ST && (!ST->isLiteral() || ST->isPacked()) &&
-      F->getIntrinsicID() != Intrinsic::not_intrinsic) {
+  if (ST && (!ST->isLiteral() || ST->isPacked())) {
     // Replace return type with literal non-packed struct. Only do this for
     // intrinsics declared to return a struct, not for intrinsics with
     // overloaded return type, in which case the exact struct type will be
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index a185ca3fb8dc1..3d2e203a20dac 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -141,9 +141,16 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const {
       Ty->getPrimitiveSizeInBits().getFixedValue() == 8192)
     return true;
 
-  // Conservatively assume we can't losslessly convert between pointers with
-  // different address spaces.
-  return false;
+  // At this point we have only various mismatches of the first class types
+  // remaining and ptr->ptr. Just select the lossless conversions. Everything
+  // else is not lossless. Conservatively assume we can't losslessly convert
+  // between pointers with different address spaces.
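+  // [Editor's aside, not part of the patch] Effect of the restored branch
+  // below, illustratively (assumes an LLVMContext named Ctx):
+  //   auto *P0 = llvm::PointerType::get(Ctx, /*AddressSpace=*/0);
+  //   auto *P1 = llvm::PointerType::get(Ctx, /*AddressSpace=*/1);
+  //   P0->canLosslesslyBitCastTo(P0); // true  - same address space
+  //   P0->canLosslesslyBitCastTo(P1); // false - address spaces differ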
+ if (auto *PTy = dyn_cast(this)) { + if (auto *OtherPTy = dyn_cast(Ty)) + return PTy->getAddressSpace() == OtherPTy->getAddressSpace(); + return false; + } + return false; // Other types have no identity values } bool Type::isEmptyTy() const { diff --git a/llvm/lib/TableGen/JSONBackend.cpp b/llvm/lib/TableGen/JSONBackend.cpp index 2a3f522a9c0ef..a9dc30d7f59b6 100644 --- a/llvm/lib/TableGen/JSONBackend.cpp +++ b/llvm/lib/TableGen/JSONBackend.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/ArrayRef.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp index ff569e3dce2e9..811635faaf1bf 100644 --- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -101,6 +101,7 @@ #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 757471d6a905e..3748f671f1237 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include using namespace llvm; diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index 7544786d9f6c8..32686b25f2118 100644 --- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -46,6 +46,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Support/raw_ostream.h" +#include #include #include diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 93b8295f4f3ef..50cbd3672fbd0 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -9532,9 +9532,9 @@ AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI, .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) .setMIFlags(Flags); - // STR XZR, [SP] - BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::STRXui)) - .addReg(AArch64::XZR) + // LDR XZR, [SP] + BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::LDRXui)) + .addReg(AArch64::XZR, RegState::Define) .addReg(AArch64::SP) .addImm(0) .setMIFlags(Flags); diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp index 3687492c3e3e4..a4165d0514dcc 100644 --- a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp +++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp @@ -14,6 +14,7 @@ #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" #include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/IndirectThunks.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index 3315171798d9f..c813d92ec85b7 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -15,6 +15,7 @@ #include "AArch64.h" #include "Utils/AArch64BaseInfo.h" 
#include "Utils/AArch64SMEAttributes.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h index 5ee888d9db001..48d030d64f821 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/CodeGen/Register.h" #include diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index a6d1da94b8907..fb8d685409e42 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2334,8 +2334,9 @@ static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) { // Note that ballot doesn't use SETEQ condition but its easy to support it // here for completeness, so in this case Negate is set true on return. auto VCMP_CC = cast(VCMP.getOperand(2))->get(); - if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) && - isNullConstant(VCMP.getOperand(1))) { + auto *VCMP_CRHS = dyn_cast(VCMP.getOperand(1)); + if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) && VCMP_CRHS && + VCMP_CRHS->isZero()) { auto Cond = VCMP.getOperand(0); if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension. @@ -2369,8 +2370,8 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) { SDValue VCMP = Cond->getOperand(0); auto CC = cast(Cond->getOperand(2))->get(); - if ((CC == ISD::SETEQ || CC == ISD::SETNE) && - isNullConstant(Cond->getOperand(1)) && + auto *CRHS = dyn_cast(Cond->getOperand(1)); + if ((CC == ISD::SETEQ || CC == ISD::SETNE) && CRHS && CRHS->isZero() && // TODO: make condition below an assert after fixing ballot bitwidth. VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) { // %VCMP = i(WaveSize) AMDGPUISD::SETCC ... diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp index fd8f0bebd3bec..5ebf834377f2c 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -274,42 +274,32 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { if (MI.isDebugInstr()) return; - // Kill all defs. - GCNRegPressure DefPressure, ECDefPressure; - bool HasECDefs = false; - for (const MachineOperand &MO : MI.all_defs()) { - if (!MO.getReg().isVirtual()) - continue; - + auto DecrementDef = [this](const MachineOperand &MO) { Register Reg = MO.getReg(); - LaneBitmask DefMask = getDefRegMask(MO, *MRI); - - // Treat a def as fully live at the moment of definition: keep a record. - if (MO.isEarlyClobber()) { - ECDefPressure.inc(Reg, LaneBitmask::getNone(), DefMask, *MRI); - HasECDefs = true; - } else - DefPressure.inc(Reg, LaneBitmask::getNone(), DefMask, *MRI); - auto I = LiveRegs.find(Reg); if (I == LiveRegs.end()) - continue; + return; LaneBitmask &LiveMask = I->second; LaneBitmask PrevMask = LiveMask; - LiveMask &= ~DefMask; + LiveMask &= ~getDefRegMask(MO, *MRI); CurPressure.inc(Reg, PrevMask, LiveMask, *MRI); if (LiveMask.none()) LiveRegs.erase(I); - } + }; - // Update MaxPressure with defs pressure. - DefPressure += CurPressure; - if (HasECDefs) - DefPressure += ECDefPressure; - MaxPressure = max(DefPressure, MaxPressure); + // Decrement non-early-clobber defs. 
+ SmallVector<const MachineOperand *> EarlyClobberDefs; + for (const MachineOperand &MO : MI.all_defs()) { + if (!MO.getReg().isVirtual()) + continue; + if (!MO.isEarlyClobber()) + DecrementDef(MO); + else + EarlyClobberDefs.push_back(&MO); + } - // Make uses alive. + // Increment uses. SmallVector<RegisterMaskPair, 8> RegUses; collectVirtualRegUses(RegUses, MI, LIS, *MRI); for (const RegisterMaskPair &U : RegUses) { @@ -319,9 +309,13 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { CurPressure.inc(U.RegUnit, PrevMask, LiveMask, *MRI); } - // Update MaxPressure with uses plus early-clobber defs pressure. - MaxPressure = HasECDefs ? max(CurPressure + ECDefPressure, MaxPressure) - : max(CurPressure, MaxPressure); + // Point of maximum pressure: non-early-clobber defs are decremented and uses + // are incremented. + MaxPressure = max(CurPressure, MaxPressure); + + // Now decrement early clobber defs. + for (const MachineOperand *MO : EarlyClobberDefs) + DecrementDef(*MO); assert(CurPressure == getRegPressure(*MRI, LiveRegs)); } diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h index 4100970fe1a96..e21bf10d795ba 100644 --- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h +++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h @@ -85,18 +85,6 @@ struct GCNRegPressure { return !(*this == O); } - GCNRegPressure &operator+=(const GCNRegPressure &RHS) { - for (unsigned I = 0; I < TOTAL_KINDS; ++I) - Value[I] += RHS.Value[I]; - return *this; - } - - GCNRegPressure &operator-=(const GCNRegPressure &RHS) { - for (unsigned I = 0; I < TOTAL_KINDS; ++I) - Value[I] -= RHS.Value[I]; - return *this; - } - void dump() const; private: @@ -117,20 +105,6 @@ inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) { return Res; } -inline GCNRegPressure operator+(const GCNRegPressure &P1, - const GCNRegPressure &P2) { - GCNRegPressure Sum = P1; - Sum += P2; - return Sum; -} - -inline GCNRegPressure operator-(const GCNRegPressure &P1, - const GCNRegPressure &P2) { - GCNRegPressure Diff = P1; - Diff -= P2; - return Diff; -} - class GCNRPTracker { public: using LiveRegSet = DenseMap<unsigned, LaneBitmask>; diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td index 3df428b9ce967..bf64ecf649644 100644 --- a/llvm/lib/Target/ARM/ARM.td +++ b/llvm/lib/Target/ARM/ARM.td @@ -1519,17 +1519,6 @@ def : ProcessorModel<"cortex-m85", CortexM85Model, [ARMv81mMainline, FeatureUseMISched, HasMVEFloatOps]>; -def : ProcessorModel<"cortex-m52", CortexM55Model, [ARMv81mMainline, - FeatureDSP, - FeatureFPARMv8_D16, - FeatureHasNoBranchPredictor, - FeaturePACBTI, - FeatureUseMISched, - FeaturePrefLoopAlign32, - FeatureHasSlowFPVMLx, - FeatureMVEVectorCostFactor1, - HasMVEFloatOps]>; - def : ProcNoItin<"cortex-a32", [ARMv8a, FeatureHWDivThumb, FeatureHWDivARM, diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 922fa93226f29..1505e92140508 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -298,7 +298,6 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { case CortexM3: case CortexM7: case CortexR52: - case CortexM52: case CortexX1: case CortexX1C: break; diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index aa8bea103dd6a..eb5bf25f2a382 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -71,7 +71,6 @@ class ARMSubtarget : public ARMGenSubtargetInfo { CortexA9, CortexM3, CortexM7, - CortexM52, CortexR4, CortexR4F, CortexR5, diff --git
a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp index c4427948d3b89..31a814900ca50 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCWinCOFFObjectWriter.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +#include using namespace llvm; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h index 42d91f559f51a..b83931eb88ac2 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h @@ -17,6 +17,7 @@ #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/SMLoc.h" #include #include diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 61ec63ff6c1ca..e9e29c2dc8929 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -269,8 +269,6 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); setOperationAction(ISD::FMA, VT, Legal); - setOperationAction(ISD::FSQRT, VT, Legal); - setOperationAction(ISD::FNEG, VT, Legal); setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); @@ -311,8 +309,6 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); setOperationAction(ISD::FMA, VT, Legal); - setOperationAction(ISD::FSQRT, VT, Legal); - setOperationAction(ISD::FNEG, VT, Legal); setCondCodeAction({ISD::SETGE, ISD::SETGT, ISD::SETOGE, ISD::SETOGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 8559baa0e525f..a9bf65c6840d5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1092,13 +1092,6 @@ multiclass PatXr<SDPatternOperator OpNode, string Inst> { (!cast<LAInst>(Inst#"_D") LASX256:$xj)>; } -multiclass PatXrF<SDPatternOperator OpNode, string Inst> { - def : Pat<(v8f32 (OpNode (v8f32 LASX256:$xj))), - (!cast<LAInst>(Inst#"_S") LASX256:$xj)>; - def : Pat<(v4f64 (OpNode (v4f64 LASX256:$xj))), - (!cast<LAInst>(Inst#"_D") LASX256:$xj)>; -} - multiclass PatXrXr<SDPatternOperator OpNode, string Inst> { def : Pat<(OpNode (v32i8 LASX256:$xj), (v32i8 LASX256:$xk)), (!cast<LAInst>(Inst#"_B") LASX256:$xj, LASX256:$xk)>; @@ -1455,21 +1448,6 @@ def : Pat<(fma v8f32:$xj, v8f32:$xk, v8f32:$xa), def : Pat<(fma v4f64:$xj, v4f64:$xk, v4f64:$xa), (XVFMADD_D v4f64:$xj, v4f64:$xk, v4f64:$xa)>; -// XVFSQRT_{S/D} -defm : PatXrF<fsqrt, "XVFSQRT">; - -// XVRECIP_{S/D} -def : Pat<(fdiv vsplatf32_fpimm_eq_1, v8f32:$xj), - (XVFRECIP_S v8f32:$xj)>; -def : Pat<(fdiv vsplatf64_fpimm_eq_1, v4f64:$xj), - (XVFRECIP_D v4f64:$xj)>; - -// XVFRSQRT_{S/D} -def : Pat<(fdiv vsplatf32_fpimm_eq_1, (fsqrt v8f32:$xj)), - (XVFRSQRT_S v8f32:$xj)>; -def : Pat<(fdiv vsplatf64_fpimm_eq_1, (fsqrt v4f64:$xj)), - (XVFRSQRT_D v4f64:$xj)>; - // XVSEQ[I]_{B/H/W/D} defm : PatCCXrSimm5<SETEQ, "XVSEQI">; defm : PatCCXrXr<SETEQ, "XVSEQ">; @@ -1605,10 +1583,6 @@ foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in def :
Pat<(vt (vselect LASX256:$xa, LASX256:$xk, LASX256:$xj)), (XVBITSEL_V LASX256:$xj, LASX256:$xk, LASX256:$xa)>; -// fneg -def : Pat<(fneg (v8f32 LASX256:$xj)), (XVBITREVI_W LASX256:$xj, 31)>; -def : Pat<(fneg (v4f64 LASX256:$xj)), (XVBITREVI_D LASX256:$xj, 63)>; - } // Predicates = [HasExtLASX] /// Intrinsic pattern diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 5947f241bb597..ff21c6681271e 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -95,29 +95,6 @@ def vsplati64_imm_eq_63 : PatFrags<(ops), [(build_vector), Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 63; }]>; -def vsplatf32_fpimm_eq_1 - : PatFrags<(ops), [(bitconvert (v4i32 (build_vector))), - (bitconvert (v8i32 (build_vector)))], [{ - APInt Imm; - EVT EltTy = N->getValueType(0).getVectorElementType(); - N = N->getOperand(0).getNode(); - - return selectVSplat(N, Imm, EltTy.getSizeInBits()) && - Imm.getBitWidth() == EltTy.getSizeInBits() && - Imm == APFloat(+1.0f).bitcastToAPInt(); -}]>; -def vsplatf64_fpimm_eq_1 - : PatFrags<(ops), [(bitconvert (v2i64 (build_vector))), - (bitconvert (v4i64 (build_vector)))], [{ - APInt Imm; - EVT EltTy = N->getValueType(0).getVectorElementType(); - N = N->getOperand(0).getNode(); - - return selectVSplat(N, Imm, EltTy.getSizeInBits()) && - Imm.getBitWidth() == EltTy.getSizeInBits() && - Imm == APFloat(+1.0).bitcastToAPInt(); -}]>; - def vsplati8imm7 : PatFrag<(ops node:$reg), (and node:$reg, vsplati8_imm_eq_7)>; def vsplati16imm15 : PatFrag<(ops node:$reg), @@ -1196,13 +1173,6 @@ multiclass PatVr<SDPatternOperator OpNode, string Inst> { (!cast<LAInst>(Inst#"_D") LSX128:$vj)>; } -multiclass PatVrF<SDPatternOperator OpNode, string Inst> { - def : Pat<(v4f32 (OpNode (v4f32 LSX128:$vj))), - (!cast<LAInst>(Inst#"_S") LSX128:$vj)>; - def : Pat<(v2f64 (OpNode (v2f64 LSX128:$vj))), - (!cast<LAInst>(Inst#"_D") LSX128:$vj)>; -} - multiclass PatVrVr<SDPatternOperator OpNode, string Inst> { def : Pat<(OpNode (v16i8 LSX128:$vj), (v16i8 LSX128:$vk)), (!cast<LAInst>(Inst#"_B") LSX128:$vj, LSX128:$vk)>; @@ -1555,21 +1525,6 @@ def : Pat<(fma v4f32:$vj, v4f32:$vk, v4f32:$va), def : Pat<(fma v2f64:$vj, v2f64:$vk, v2f64:$va), (VFMADD_D v2f64:$vj, v2f64:$vk, v2f64:$va)>; -// VFSQRT_{S/D} -defm : PatVrF<fsqrt, "VFSQRT">; - -// VFRECIP_{S/D} -def : Pat<(fdiv vsplatf32_fpimm_eq_1, v4f32:$vj), - (VFRECIP_S v4f32:$vj)>; -def : Pat<(fdiv vsplatf64_fpimm_eq_1, v2f64:$vj), - (VFRECIP_D v2f64:$vj)>; - -// VFRSQRT_{S/D} -def : Pat<(fdiv vsplatf32_fpimm_eq_1, (fsqrt v4f32:$vj)), - (VFRSQRT_S v4f32:$vj)>; -def : Pat<(fdiv vsplatf64_fpimm_eq_1, (fsqrt v2f64:$vj)), - (VFRSQRT_D v2f64:$vj)>; - // VSEQ[I]_{B/H/W/D} defm : PatCCVrSimm5<SETEQ, "VSEQI">; defm : PatCCVrVr<SETEQ, "VSEQ">; @@ -1712,10 +1667,6 @@ foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in def : Pat<(vt (vselect LSX128:$va, LSX128:$vk, LSX128:$vj)), (VBITSEL_V LSX128:$vj, LSX128:$vk, LSX128:$va)>; -// fneg -def : Pat<(fneg (v4f32 LSX128:$vj)), (VBITREVI_W LSX128:$vj, 31)>; -def : Pat<(fneg (v2f64 LSX128:$vj)), (VBITREVI_D LSX128:$vj, 63)>; - } // Predicates = [HasExtLSX] /// Intrinsic pattern diff --git a/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/llvm/lib/Target/Mips/Mips16FrameLowering.cpp index 10c953bb344a8..4f4e3f3f2ed74 100644 --- a/llvm/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/llvm/lib/Target/Mips/Mips16FrameLowering.cpp @@ -30,6 +30,7 @@ #include "llvm/MC/MachineLocation.h" #include "llvm/Support/MathExtras.h" #include "llvm/CodeGen/TargetFrameLowering.h" +#include #include #include diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h index
c17e51f446513..81d37ff2f065f 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/llvm/lib/Target/Mips/MipsISelLowering.h @@ -31,6 +31,7 @@ #include "llvm/IR/Type.h" #include "llvm/Target/TargetMachine.h" #include +#include #include #include #include diff --git a/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/llvm/lib/Target/Mips/MipsMCInstLower.cpp index b0642f3d1ff28..7b58cb90ab87a 100644 --- a/llvm/lib/Target/Mips/MipsMCInstLower.cpp +++ b/llvm/lib/Target/Mips/MipsMCInstLower.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/ErrorHandling.h" +#include using namespace llvm; diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp index 5c34067c88889..b84f304373f66 100644 --- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -15,6 +15,7 @@ #include "MipsRegisterInfo.h" #include "MipsSubtarget.h" #include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 13665985f52eb..58ff193f24371 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -561,7 +561,7 @@ multiclass F2_Support_Half { [(set Int16Regs:$dst, (OpNode (bf16 Int16Regs:$a)))]>, Requires<[hasSM<80>, hasPTX<70>]>; def bf16x2 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a), - !strconcat(OpcStr, ".bf16x2 \t$dst, $a;"), + !strconcat(OpcStr, ".v2bf16 \t$dst, $a;"), [(set Int32Regs:$dst, (OpNode (v2bf16 Int32Regs:$a)))]>, Requires<[hasSM<80>, hasPTX<70>]>; def f16_ftz : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a), diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h index 10204b184a49f..7d786ac13bb90 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h @@ -13,6 +13,7 @@ #ifndef LLVM_LIB_TARGET_PPC_MCELFSTREAMER_PPCELFSTREAMER_H #define LLVM_LIB_TARGET_PPC_MCELFSTREAMER_PPCELFSTREAMER_H +#include "llvm/ADT/SmallVector.h" #include "llvm/MC/MCELFStreamer.h" #include diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index 4716e37b34432..a5dc0b45b13cc 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -12,6 +12,7 @@ #include "PPCMCAsmInfo.h" #include "llvm/TargetParser/Triple.h" +#include using namespace llvm; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp index 80c37f82bf298..284e52c298a2d 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp @@ -12,6 +12,7 @@ #include "PPCPredicates.h" #include "llvm/Support/ErrorHandling.h" +#include using namespace llvm; PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) { diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 062b53e24a0d7..aa385d7c3b202 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/ProfDataUtils.h" #include "llvm/Support/CommandLine.h" #include 
"llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/Local.h" #include diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp index ae46d5554d350..097e12a2cd59f 100644 --- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp +++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp @@ -19,6 +19,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Target/TargetOptions.h" #include +#include using namespace llvm; #define DEBUG_TYPE "riscv-merge-base-offset" diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 3abdb6003659f..84239db1971c2 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -213,8 +213,13 @@ RISCVTargetMachine::getSubtargetImpl(const Function &F) const { llvm::bit_floor((RVVBitsMax < 64 || RVVBitsMax > 65536) ? 0 : RVVBitsMax); SmallString<512> Key; - raw_svector_ostream(Key) << "RVVMin" << RVVBitsMin << "RVVMax" << RVVBitsMax - << CPU << TuneCPU << FS; + Key += "RVVMin"; + Key += std::to_string(RVVBitsMin); + Key += "RVVMax"; + Key += std::to_string(RVVBitsMax); + Key += CPU; + Key += TuneCPU; + Key += FS; auto &I = SubtargetMap[Key]; if (!I) { // This needs to be done before we create a new subtarget since any diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index f79787d4baa4d..873994c2e333b 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -872,21 +872,13 @@ bool SystemZTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const TargetLowering::AtomicExpansionKind SystemZTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { - // Don't expand subword operations as they require special treatment. - if (RMW->getType()->isIntegerTy(8) || RMW->getType()->isIntegerTy(16)) - return AtomicExpansionKind::None; - - // Don't expand if there is a target instruction available. - if (Subtarget.hasInterlockedAccess1() && - (RMW->getType()->isIntegerTy(32) || RMW->getType()->isIntegerTy(64)) && - (RMW->getOperation() == AtomicRMWInst::BinOp::Add || - RMW->getOperation() == AtomicRMWInst::BinOp::Sub || - RMW->getOperation() == AtomicRMWInst::BinOp::And || - RMW->getOperation() == AtomicRMWInst::BinOp::Or || - RMW->getOperation() == AtomicRMWInst::BinOp::Xor)) - return AtomicExpansionKind::None; - - return AtomicExpansionKind::CmpXChg; + // TODO: expand them all here instead of in backend. + return (RMW->isFloatingPointOperation() || + RMW->getOperation() == AtomicRMWInst::UIncWrap || + RMW->getOperation() == AtomicRMWInst::UDecWrap || + RMW->getType()->isIntegerTy(128)) + ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; } bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const { @@ -4358,31 +4350,6 @@ SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op, return Chain; } -// Prepare for a Compare And Swap for a subword operation. This needs to be -// done in memory with 4 bytes at natural alignment. -static void getCSAddressAndShifts(SDValue Addr, SelectionDAG &DAG, SDLoc DL, - SDValue &AlignedAddr, SDValue &BitShift, - SDValue &NegBitShift) { - EVT PtrVT = Addr.getValueType(); - EVT WideVT = MVT::i32; - - // Get the address of the containing word. 
- AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr, - DAG.getConstant(-4, DL, PtrVT)); - - // Get the number of bits that the word must be rotated left in order - // to bring the field to the top bits of a GR32. - BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr, - DAG.getConstant(3, DL, PtrVT)); - BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift); - - // Get the complementing shift amount, for rotating a field in the top - // bits back to its proper position. - NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT, - DAG.getConstant(0, DL, WideVT), BitShift); - -} - // Op is an 8-, 16-bit or 32-bit ATOMIC_LOAD_* operation. Lower the first // two into the fullword ATOMIC_LOADW_* operation given by Opcode. SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op, @@ -4390,7 +4357,7 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op, unsigned Opcode) const { auto *Node = cast<AtomicSDNode>(Op.getNode()); - // 32-bit operations need no special handling. + // 32-bit operations need no code outside the main loop. EVT NarrowVT = Node->getMemoryVT(); EVT WideVT = MVT::i32; if (NarrowVT == WideVT) @@ -4402,6 +4369,7 @@ SDValue Src2 = Node->getVal(); MachineMemOperand *MMO = Node->getMemOperand(); SDLoc DL(Node); + EVT PtrVT = Addr.getValueType(); // Convert atomic subtracts of constants into additions. if (Opcode == SystemZISD::ATOMIC_LOADW_SUB) if (auto *Const = dyn_cast<ConstantSDNode>(Src2)) { Opcode = SystemZISD::ATOMIC_LOADW_ADD; Src2 = DAG.getConstant(-Const->getSExtValue(), DL, Src2.getValueType()); } - SDValue AlignedAddr, BitShift, NegBitShift; - getCSAddressAndShifts(Addr, DAG, DL, AlignedAddr, BitShift, NegBitShift); + // Get the address of the containing word. + SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr, + DAG.getConstant(-4, DL, PtrVT)); + + // Get the number of bits that the word must be rotated left in order + // to bring the field to the top bits of a GR32. + SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr, + DAG.getConstant(3, DL, PtrVT)); + BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift); + + // Get the complementing shift amount, for rotating a field in the top + // bits back to its proper position. + SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT, + DAG.getConstant(0, DL, WideVT), BitShift); // Extend the source operand to 32 bits and prepare it for the inner loop. // ATOMIC_SWAPW uses RISBG to rotate the field left, but all other @@ -4443,24 +4423,38 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op, return DAG.getMergeValues(RetOps, DL); } -// Op is an ATOMIC_LOAD_SUB operation. Lower 8- and 16-bit operations into -// ATOMIC_LOADW_SUBs and convert 32- and 64-bit operations into additions. +// Op is an ATOMIC_LOAD_SUB operation. Lower 8- and 16-bit operations +// into ATOMIC_LOADW_SUBs and decide whether to convert 32- and 64-bit +// operations into additions. SDValue SystemZTargetLowering::lowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const { auto *Node = cast<AtomicSDNode>(Op.getNode()); EVT MemVT = Node->getMemoryVT(); if (MemVT == MVT::i32 || MemVT == MVT::i64) { - // A full-width operation: negate and use LAA(G). + // A full-width operation.
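An illustrative aside on the subtract-to-add conversion above: in two's complement, x - C equals x + (-C), so an atomic fetch-and-subtract of a known constant can be emitted as a fetch-and-add of the negated constant when it fits the add-immediate form. A minimal standalone check, using std::atomic as a stand-in for the generated code (assumed host toolchain, not SystemZ-specific):

#include <atomic>
#include <cassert>
#include <cstdint>

int main() {
  std::atomic<int64_t> A{100}, B{100};
  const int64_t C = 7;
  A.fetch_sub(C);   // the original operation
  B.fetch_add(-C);  // the rewritten operation: same result
  assert(A.load() == B.load() && A.load() == 93);
  return 0;
}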
assert(Op.getValueType() == MemVT && "Mismatched VTs"); - assert(Subtarget.hasInterlockedAccess1() && - "Should have been expanded by AtomicExpand pass."); SDValue Src2 = Node->getVal(); + SDValue NegSrc2; SDLoc DL(Src2); - SDValue NegSrc2 = - DAG.getNode(ISD::SUB, DL, MemVT, DAG.getConstant(0, DL, MemVT), Src2); - return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, MemVT, - Node->getChain(), Node->getBasePtr(), NegSrc2, - Node->getMemOperand()); + + if (auto *Op2 = dyn_cast<ConstantSDNode>(Src2)) { + // Use an addition if the operand is constant and either LAA(G) is + // available or the negative value is in the range of A(G)FHI. + int64_t Value = (-Op2->getAPIntValue()).getSExtValue(); + if (isInt<32>(Value) || Subtarget.hasInterlockedAccess1()) + NegSrc2 = DAG.getConstant(Value, DL, MemVT); + } else if (Subtarget.hasInterlockedAccess1()) + // Use LAA(G) if available. + NegSrc2 = DAG.getNode(ISD::SUB, DL, MemVT, DAG.getConstant(0, DL, MemVT), + Src2); + + if (NegSrc2.getNode()) + return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, MemVT, + Node->getChain(), Node->getBasePtr(), NegSrc2, + Node->getMemOperand()); + + // Use the node as-is. + return Op; } return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_SUB); @@ -4498,9 +4492,22 @@ SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op, // Convert 8-bit and 16-bit compare and swap to a loop, implemented // via a fullword ATOMIC_CMP_SWAPW operation. int64_t BitSize = NarrowVT.getSizeInBits(); + EVT PtrVT = Addr.getValueType(); + + // Get the address of the containing word. + SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr, + DAG.getConstant(-4, DL, PtrVT)); - SDValue AlignedAddr, BitShift, NegBitShift; - getCSAddressAndShifts(Addr, DAG, DL, AlignedAddr, BitShift, NegBitShift); + // Get the number of bits that the word must be rotated left in order + // to bring the field to the top bits of a GR32. + SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr, + DAG.getConstant(3, DL, PtrVT)); + BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift); + + // Get the complementing shift amount, for rotating a field in the top + // bits back to its proper position. + SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT, + DAG.getConstant(0, DL, WideVT), BitShift); // Construct the ATOMIC_CMP_SWAPW node. SDVTList VTList = DAG.getVTList(WideVT, MVT::i32, MVT::Other); @@ -7944,17 +7951,20 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI, return JoinMBB; } -// Implement EmitInstrWithCustomInserter for subword pseudo ATOMIC_LOADW_* or -// ATOMIC_SWAPW instruction MI. BinOpcode is the instruction that performs -// the binary operation elided by "*", or 0 for ATOMIC_SWAPW. Invert says -// whether the field should be inverted after performing BinOpcode (e.g. for -// NAND). +// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_LOAD{,W}_* +// or ATOMIC_SWAP{,W} instruction MI. BinOpcode is the instruction that +// performs the binary operation elided by "*", or 0 for ATOMIC_SWAP{,W}. +// BitSize is the width of the field in bits, or 0 if this is a partword +// ATOMIC_LOADW_* or ATOMIC_SWAPW instruction, in which case the bitsize +// is one of the operands. Invert says whether the field should be +// inverted after performing BinOpcode (e.g. for NAND).
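The emitAtomicLoadBinary routine documented above implements a subword atomic read-modify-write by retrying a full-word compare-and-swap on the containing aligned word. A standalone sketch of that strategy (illustrative only: plain shifts stand in for the RLL rotates and CS loop the SystemZ expansion emits, and the byte-indexing convention here is an assumption):

#include <atomic>
#include <cstdint>

// Atomically OR `Val` into byte `ByteIndex` of the 32-bit word `*Word`,
// returning the old byte value, via a full-word CAS retry loop.
uint8_t atomic_or_u8(std::atomic<uint32_t> *Word, unsigned ByteIndex,
                     uint8_t Val) {
  const unsigned Shift = ByteIndex * 8;      // position of the subword field
  const uint32_t Mask = 0xFFu << Shift;
  uint32_t Old = Word->load();
  uint32_t New;
  do {
    uint32_t Field = ((Old & Mask) >> Shift) | Val;   // the elided binary op
    New = (Old & ~Mask) | ((Field << Shift) & Mask);  // splice field back in
  } while (!Word->compare_exchange_weak(Old, New));   // retry, like CS
  return uint8_t((Old & Mask) >> Shift);
}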
MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( MachineInstr &MI, MachineBasicBlock *MBB, unsigned BinOpcode, - bool Invert) const { + unsigned BitSize, bool Invert) const { MachineFunction &MF = *MBB->getParent(); const SystemZInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + bool IsSubWord = (BitSize < 32); // Extract the operands. Base can be a register or a frame index. // Src2 can be a register or immediate. @@ -7962,22 +7972,31 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( MachineOperand Base = earlyUseOperand(MI.getOperand(1)); int64_t Disp = MI.getOperand(2).getImm(); MachineOperand Src2 = earlyUseOperand(MI.getOperand(3)); - Register BitShift = MI.getOperand(4).getReg(); - Register NegBitShift = MI.getOperand(5).getReg(); - unsigned BitSize = MI.getOperand(6).getImm(); + Register BitShift = IsSubWord ? MI.getOperand(4).getReg() : Register(); + Register NegBitShift = IsSubWord ? MI.getOperand(5).getReg() : Register(); DebugLoc DL = MI.getDebugLoc(); + if (IsSubWord) + BitSize = MI.getOperand(6).getImm(); + + // Subword operations use 32-bit registers. + const TargetRegisterClass *RC = (BitSize <= 32 ? + &SystemZ::GR32BitRegClass : + &SystemZ::GR64BitRegClass); + unsigned LOpcode = BitSize <= 32 ? SystemZ::L : SystemZ::LG; + unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG; // Get the right opcodes for the displacement. - unsigned LOpcode = TII->getOpcodeForOffset(SystemZ::L, Disp); - unsigned CSOpcode = TII->getOpcodeForOffset(SystemZ::CS, Disp); + LOpcode = TII->getOpcodeForOffset(LOpcode, Disp); + CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp); assert(LOpcode && CSOpcode && "Displacement out of range"); // Create virtual registers for temporary results. - Register OrigVal = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); - Register OldVal = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); - Register NewVal = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); - Register RotatedOldVal = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); - Register RotatedNewVal = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); + Register OrigVal = MRI.createVirtualRegister(RC); + Register OldVal = MRI.createVirtualRegister(RC); + Register NewVal = (BinOpcode || IsSubWord ? + MRI.createVirtualRegister(RC) : Src2.getReg()); + Register RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal); + Register RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal); // Insert a basic block for the main loop. MachineBasicBlock *StartMBB = MBB; @@ -8004,28 +8023,39 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal) .addReg(OrigVal).addMBB(StartMBB) .addReg(Dest).addMBB(LoopMBB); - BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal) - .addReg(OldVal).addReg(BitShift).addImm(0); + if (IsSubWord) + BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal) + .addReg(OldVal).addReg(BitShift).addImm(0); if (Invert) { // Perform the operation normally and then invert every bit of the field. - Register Tmp = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); + Register Tmp = MRI.createVirtualRegister(RC); BuildMI(MBB, DL, TII->get(BinOpcode), Tmp).addReg(RotatedOldVal).add(Src2); - // XILF with the upper BitSize bits set. - BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal) - .addReg(Tmp).addImm(-1U << (32 - BitSize)); + if (BitSize <= 32) + // XILF with the upper BitSize bits set. 
+ BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal) + .addReg(Tmp).addImm(-1U << (32 - BitSize)); + else { + // Use LCGR and add -1 to the result, which is more compact than + // an XILF, XILH pair. + Register Tmp2 = MRI.createVirtualRegister(RC); + BuildMI(MBB, DL, TII->get(SystemZ::LCGR), Tmp2).addReg(Tmp); + BuildMI(MBB, DL, TII->get(SystemZ::AGHI), RotatedNewVal) + .addReg(Tmp2).addImm(-1); + } } else if (BinOpcode) // A simply binary operation. BuildMI(MBB, DL, TII->get(BinOpcode), RotatedNewVal) .addReg(RotatedOldVal) .add(Src2); - else + else if (IsSubWord) // Use RISBG to rotate Src2 into position and use it to replace the // field in RotatedOldVal. BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedNewVal) .addReg(RotatedOldVal).addReg(Src2.getReg()) .addImm(32).addImm(31 + BitSize).addImm(32 - BitSize); - BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal) - .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0); + if (IsSubWord) + BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal) + .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0); BuildMI(MBB, DL, TII->get(CSOpcode), Dest) .addReg(OldVal) .addReg(NewVal) @@ -8040,40 +8070,50 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary( return DoneMBB; } -// Implement EmitInstrWithCustomInserter for subword pseudo -// ATOMIC_LOADW_{,U}{MIN,MAX} instruction MI. CompareOpcode is the +// Implement EmitInstrWithCustomInserter for pseudo +// ATOMIC_LOAD{,W}_{,U}{MIN,MAX} instruction MI. CompareOpcode is the // instruction that should be used to compare the current field with the // minimum or maximum value. KeepOldMask is the BRC condition-code mask -// for when the current field should be kept. +// for when the current field should be kept. BitSize is the width of +// the field in bits, or 0 if this is a partword ATOMIC_LOADW_* instruction. MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax( MachineInstr &MI, MachineBasicBlock *MBB, unsigned CompareOpcode, - unsigned KeepOldMask) const { + unsigned KeepOldMask, unsigned BitSize) const { MachineFunction &MF = *MBB->getParent(); const SystemZInstrInfo *TII = Subtarget.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + bool IsSubWord = (BitSize < 32); // Extract the operands. Base can be a register or a frame index. Register Dest = MI.getOperand(0).getReg(); MachineOperand Base = earlyUseOperand(MI.getOperand(1)); int64_t Disp = MI.getOperand(2).getImm(); Register Src2 = MI.getOperand(3).getReg(); - Register BitShift = MI.getOperand(4).getReg(); - Register NegBitShift = MI.getOperand(5).getReg(); - unsigned BitSize = MI.getOperand(6).getImm(); + Register BitShift = (IsSubWord ? MI.getOperand(4).getReg() : Register()); + Register NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : Register()); DebugLoc DL = MI.getDebugLoc(); + if (IsSubWord) + BitSize = MI.getOperand(6).getImm(); + + // Subword operations use 32-bit registers. + const TargetRegisterClass *RC = (BitSize <= 32 ? + &SystemZ::GR32BitRegClass : + &SystemZ::GR64BitRegClass); + unsigned LOpcode = BitSize <= 32 ? SystemZ::L : SystemZ::LG; + unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG; // Get the right opcodes for the displacement. - unsigned LOpcode = TII->getOpcodeForOffset(SystemZ::L, Disp); - unsigned CSOpcode = TII->getOpcodeForOffset(SystemZ::CS, Disp); + LOpcode = TII->getOpcodeForOffset(LOpcode, Disp); + CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp); assert(LOpcode && CSOpcode && "Displacement out of range"); // Create virtual registers for temporary results. 
- Register OrigVal = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); - Register OldVal = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); - Register NewVal = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); - Register RotatedOldVal = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); - Register RotatedAltVal = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); - Register RotatedNewVal = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass); + Register OrigVal = MRI.createVirtualRegister(RC); + Register OldVal = MRI.createVirtualRegister(RC); + Register NewVal = MRI.createVirtualRegister(RC); + Register RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal); + Register RotatedAltVal = (IsSubWord ? MRI.createVirtualRegister(RC) : Src2); + Register RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal); // Insert 3 basic blocks for the loop. MachineBasicBlock *StartMBB = MBB; @@ -8099,8 +8139,9 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax( BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal) .addReg(OrigVal).addMBB(StartMBB) .addReg(Dest).addMBB(UpdateMBB); - BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal) - .addReg(OldVal).addReg(BitShift).addImm(0); + if (IsSubWord) + BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal) + .addReg(OldVal).addReg(BitShift).addImm(0); BuildMI(MBB, DL, TII->get(CompareOpcode)) .addReg(RotatedOldVal).addReg(Src2); BuildMI(MBB, DL, TII->get(SystemZ::BRC)) @@ -8112,9 +8153,10 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax( // %RotatedAltVal = RISBG %RotatedOldVal, %Src2, 32, 31 + BitSize, 0 // # fall through to UpdateMBB MBB = UseAltMBB; - BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedAltVal) - .addReg(RotatedOldVal).addReg(Src2) - .addImm(32).addImm(31 + BitSize).addImm(0); + if (IsSubWord) + BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedAltVal) + .addReg(RotatedOldVal).addReg(Src2) + .addImm(32).addImm(31 + BitSize).addImm(0); MBB->addSuccessor(UpdateMBB); // UpdateMBB: @@ -8128,8 +8170,9 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax( BuildMI(MBB, DL, TII->get(SystemZ::PHI), RotatedNewVal) .addReg(RotatedOldVal).addMBB(LoopMBB) .addReg(RotatedAltVal).addMBB(UseAltMBB); - BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal) - .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0); + if (IsSubWord) + BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal) + .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0); BuildMI(MBB, DL, TII->get(CSOpcode), Dest) .addReg(OldVal) .addReg(NewVal) @@ -8144,7 +8187,7 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax( return DoneMBB; } -// Implement EmitInstrWithCustomInserter for subword pseudo ATOMIC_CMP_SWAPW +// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_CMP_SWAPW // instruction MI. 
MachineBasicBlock * SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI, @@ -8961,44 +9004,171 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( return emitExt128(MI, MBB, true); case SystemZ::ATOMIC_SWAPW: - return emitAtomicLoadBinary(MI, MBB, 0); + return emitAtomicLoadBinary(MI, MBB, 0, 0); + case SystemZ::ATOMIC_SWAP_32: + return emitAtomicLoadBinary(MI, MBB, 0, 32); + case SystemZ::ATOMIC_SWAP_64: + return emitAtomicLoadBinary(MI, MBB, 0, 64); case SystemZ::ATOMIC_LOADW_AR: - return emitAtomicLoadBinary(MI, MBB, SystemZ::AR); + return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 0); case SystemZ::ATOMIC_LOADW_AFI: - return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI); + return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 0); + case SystemZ::ATOMIC_LOAD_AR: + return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 32); + case SystemZ::ATOMIC_LOAD_AHI: + return emitAtomicLoadBinary(MI, MBB, SystemZ::AHI, 32); + case SystemZ::ATOMIC_LOAD_AFI: + return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 32); + case SystemZ::ATOMIC_LOAD_AGR: + return emitAtomicLoadBinary(MI, MBB, SystemZ::AGR, 64); + case SystemZ::ATOMIC_LOAD_AGHI: + return emitAtomicLoadBinary(MI, MBB, SystemZ::AGHI, 64); + case SystemZ::ATOMIC_LOAD_AGFI: + return emitAtomicLoadBinary(MI, MBB, SystemZ::AGFI, 64); case SystemZ::ATOMIC_LOADW_SR: - return emitAtomicLoadBinary(MI, MBB, SystemZ::SR); + return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 0); + case SystemZ::ATOMIC_LOAD_SR: + return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 32); + case SystemZ::ATOMIC_LOAD_SGR: + return emitAtomicLoadBinary(MI, MBB, SystemZ::SGR, 64); case SystemZ::ATOMIC_LOADW_NR: - return emitAtomicLoadBinary(MI, MBB, SystemZ::NR); + return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0); case SystemZ::ATOMIC_LOADW_NILH: - return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH); + return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0); + case SystemZ::ATOMIC_LOAD_NR: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32); + case SystemZ::ATOMIC_LOAD_NILL: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32); + case SystemZ::ATOMIC_LOAD_NILH: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32); + case SystemZ::ATOMIC_LOAD_NILF: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32); + case SystemZ::ATOMIC_LOAD_NGR: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64); + case SystemZ::ATOMIC_LOAD_NILL64: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64); + case SystemZ::ATOMIC_LOAD_NILH64: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64); + case SystemZ::ATOMIC_LOAD_NIHL64: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64); + case SystemZ::ATOMIC_LOAD_NIHH64: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64); + case SystemZ::ATOMIC_LOAD_NILF64: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64); + case SystemZ::ATOMIC_LOAD_NIHF64: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64); case SystemZ::ATOMIC_LOADW_OR: - return emitAtomicLoadBinary(MI, MBB, SystemZ::OR); + return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 0); case SystemZ::ATOMIC_LOADW_OILH: - return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH); + return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 0); + case SystemZ::ATOMIC_LOAD_OR: + return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 32); + case SystemZ::ATOMIC_LOAD_OILL: + return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL, 32); + case SystemZ::ATOMIC_LOAD_OILH: + return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 32); + case 
SystemZ::ATOMIC_LOAD_OILF: + return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF, 32); + case SystemZ::ATOMIC_LOAD_OGR: + return emitAtomicLoadBinary(MI, MBB, SystemZ::OGR, 64); + case SystemZ::ATOMIC_LOAD_OILL64: + return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL64, 64); + case SystemZ::ATOMIC_LOAD_OILH64: + return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH64, 64); + case SystemZ::ATOMIC_LOAD_OIHL64: + return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHL64, 64); + case SystemZ::ATOMIC_LOAD_OIHH64: + return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHH64, 64); + case SystemZ::ATOMIC_LOAD_OILF64: + return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF64, 64); + case SystemZ::ATOMIC_LOAD_OIHF64: + return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHF64, 64); case SystemZ::ATOMIC_LOADW_XR: - return emitAtomicLoadBinary(MI, MBB, SystemZ::XR); + return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 0); case SystemZ::ATOMIC_LOADW_XILF: - return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF); + return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 0); + case SystemZ::ATOMIC_LOAD_XR: + return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 32); + case SystemZ::ATOMIC_LOAD_XILF: + return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 32); + case SystemZ::ATOMIC_LOAD_XGR: + return emitAtomicLoadBinary(MI, MBB, SystemZ::XGR, 64); + case SystemZ::ATOMIC_LOAD_XILF64: + return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF64, 64); + case SystemZ::ATOMIC_LOAD_XIHF64: + return emitAtomicLoadBinary(MI, MBB, SystemZ::XIHF64, 64); case SystemZ::ATOMIC_LOADW_NRi: - return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, true); + return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0, true); case SystemZ::ATOMIC_LOADW_NILHi: - return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, true); + return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0, true); + case SystemZ::ATOMIC_LOAD_NRi: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32, true); + case SystemZ::ATOMIC_LOAD_NILLi: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32, true); + case SystemZ::ATOMIC_LOAD_NILHi: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32, true); + case SystemZ::ATOMIC_LOAD_NILFi: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32, true); + case SystemZ::ATOMIC_LOAD_NGRi: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64, true); + case SystemZ::ATOMIC_LOAD_NILL64i: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64, true); + case SystemZ::ATOMIC_LOAD_NILH64i: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64, true); + case SystemZ::ATOMIC_LOAD_NIHL64i: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64, true); + case SystemZ::ATOMIC_LOAD_NIHH64i: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64, true); + case SystemZ::ATOMIC_LOAD_NILF64i: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64, true); + case SystemZ::ATOMIC_LOAD_NIHF64i: + return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64, true); case SystemZ::ATOMIC_LOADW_MIN: - return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR, SystemZ::CCMASK_CMP_LE); + return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR, + SystemZ::CCMASK_CMP_LE, 0); + case SystemZ::ATOMIC_LOAD_MIN_32: + return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR, + SystemZ::CCMASK_CMP_LE, 32); + case SystemZ::ATOMIC_LOAD_MIN_64: + return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR, + SystemZ::CCMASK_CMP_LE, 64); + case SystemZ::ATOMIC_LOADW_MAX: - return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR, SystemZ::CCMASK_CMP_GE); + return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR, + 
SystemZ::CCMASK_CMP_GE, 0); + case SystemZ::ATOMIC_LOAD_MAX_32: + return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR, + SystemZ::CCMASK_CMP_GE, 32); + case SystemZ::ATOMIC_LOAD_MAX_64: + return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR, + SystemZ::CCMASK_CMP_GE, 64); + case SystemZ::ATOMIC_LOADW_UMIN: - return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR, SystemZ::CCMASK_CMP_LE); + return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR, + SystemZ::CCMASK_CMP_LE, 0); + case SystemZ::ATOMIC_LOAD_UMIN_32: + return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR, + SystemZ::CCMASK_CMP_LE, 32); + case SystemZ::ATOMIC_LOAD_UMIN_64: + return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR, + SystemZ::CCMASK_CMP_LE, 64); + case SystemZ::ATOMIC_LOADW_UMAX: - return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR, SystemZ::CCMASK_CMP_GE); + return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR, + SystemZ::CCMASK_CMP_GE, 0); + case SystemZ::ATOMIC_LOAD_UMAX_32: + return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR, + SystemZ::CCMASK_CMP_GE, 32); + case SystemZ::ATOMIC_LOAD_UMAX_64: + return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR, + SystemZ::CCMASK_CMP_GE, 64); case SystemZ::ATOMIC_CMP_SWAPW: return emitAtomicCmpSwapW(MI, MBB); diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 1e2887cff8164..40fe433f816fa 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -748,12 +748,13 @@ class SystemZTargetLowering : public TargetLowering { bool ClearEven) const; MachineBasicBlock *emitAtomicLoadBinary(MachineInstr &MI, MachineBasicBlock *BB, - unsigned BinOpcode, + unsigned BinOpcode, unsigned BitSize, bool Invert = false) const; MachineBasicBlock *emitAtomicLoadMinMax(MachineInstr &MI, MachineBasicBlock *MBB, unsigned CompareOpcode, - unsigned KeepOldMask) const; + unsigned KeepOldMask, + unsigned BitSize) const; MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *BB, diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td index 2e5ff4a1df673..a25719f80ad07 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -5327,6 +5327,30 @@ multiclass CondStores +// OPERATOR is ATOMIC_SWAP or an ATOMIC_LOAD_* operation. PAT and OPERAND +// describe the second (non-memory) operand. +class AtomicLoadBinary<SDPatternOperator operator, RegisterOperand cls, + dag pat, DAGOperand operand> + : Pseudo<(outs cls:$dst), (ins bdaddr20only:$ptr, operand:$src2), + [(set cls:$dst, (operator bdaddr20only:$ptr, pat))]> { + let Defs = [CC]; + let Has20BitOffset = 1; + let mayLoad = 1; + let mayStore = 1; + let usesCustomInserter = 1; + let hasNoSchedulingInfo = 1; +} + +// Specializations of AtomicLoadWBinary. +class AtomicLoadBinaryReg32<SDPatternOperator operator> + : AtomicLoadBinary<operator, GR32, (i32 GR32:$src2), GR32>; +class AtomicLoadBinaryImm32<SDPatternOperator operator, ImmOpWithPattern imm> + : AtomicLoadBinary<operator, GR32, (i32 imm:$src2), imm>; +class AtomicLoadBinaryReg64<SDPatternOperator operator> + : AtomicLoadBinary<operator, GR64, (i64 GR64:$src2), GR64>; +class AtomicLoadBinaryImm64<SDPatternOperator operator, ImmOpWithPattern imm> + : AtomicLoadBinary<operator, GR64, (i64 imm:$src2), imm>; + // OPERATOR is ATOMIC_SWAPW or an ATOMIC_LOADW_* operation. PAT and OPERAND // describe the second (non-memory) operand.
class AtomicLoadWBinary; +def ATOMIC_SWAP_32 : AtomicLoadBinaryReg32; +def ATOMIC_SWAP_64 : AtomicLoadBinaryReg64; def ATOMIC_LOADW_AR : AtomicLoadWBinaryReg; def ATOMIC_LOADW_AFI : AtomicLoadWBinaryImm; +let Predicates = [FeatureNoInterlockedAccess1] in { + def ATOMIC_LOAD_AR : AtomicLoadBinaryReg32; + def ATOMIC_LOAD_AHI : AtomicLoadBinaryImm32; + def ATOMIC_LOAD_AFI : AtomicLoadBinaryImm32; + def ATOMIC_LOAD_AGR : AtomicLoadBinaryReg64; + def ATOMIC_LOAD_AGHI : AtomicLoadBinaryImm64; + def ATOMIC_LOAD_AGFI : AtomicLoadBinaryImm64; +} def ATOMIC_LOADW_SR : AtomicLoadWBinaryReg; +def ATOMIC_LOAD_SR : AtomicLoadBinaryReg32; +def ATOMIC_LOAD_SGR : AtomicLoadBinaryReg64; def ATOMIC_LOADW_NR : AtomicLoadWBinaryReg; def ATOMIC_LOADW_NILH : AtomicLoadWBinaryImm; +let Predicates = [FeatureNoInterlockedAccess1] in { + def ATOMIC_LOAD_NR : AtomicLoadBinaryReg32; + def ATOMIC_LOAD_NILL : AtomicLoadBinaryImm32; + def ATOMIC_LOAD_NILH : AtomicLoadBinaryImm32; + def ATOMIC_LOAD_NILF : AtomicLoadBinaryImm32; + def ATOMIC_LOAD_NGR : AtomicLoadBinaryReg64; + def ATOMIC_LOAD_NILL64 : AtomicLoadBinaryImm64; + def ATOMIC_LOAD_NILH64 : AtomicLoadBinaryImm64; + def ATOMIC_LOAD_NIHL64 : AtomicLoadBinaryImm64; + def ATOMIC_LOAD_NIHH64 : AtomicLoadBinaryImm64; + def ATOMIC_LOAD_NILF64 : AtomicLoadBinaryImm64; + def ATOMIC_LOAD_NIHF64 : AtomicLoadBinaryImm64; +} def ATOMIC_LOADW_OR : AtomicLoadWBinaryReg; def ATOMIC_LOADW_OILH : AtomicLoadWBinaryImm; +let Predicates = [FeatureNoInterlockedAccess1] in { + def ATOMIC_LOAD_OR : AtomicLoadBinaryReg32; + def ATOMIC_LOAD_OILL : AtomicLoadBinaryImm32; + def ATOMIC_LOAD_OILH : AtomicLoadBinaryImm32; + def ATOMIC_LOAD_OILF : AtomicLoadBinaryImm32; + def ATOMIC_LOAD_OGR : AtomicLoadBinaryReg64; + def ATOMIC_LOAD_OILL64 : AtomicLoadBinaryImm64; + def ATOMIC_LOAD_OILH64 : AtomicLoadBinaryImm64; + def ATOMIC_LOAD_OIHL64 : AtomicLoadBinaryImm64; + def ATOMIC_LOAD_OIHH64 : AtomicLoadBinaryImm64; + def ATOMIC_LOAD_OILF64 : AtomicLoadBinaryImm64; + def ATOMIC_LOAD_OIHF64 : AtomicLoadBinaryImm64; +} def ATOMIC_LOADW_XR : AtomicLoadWBinaryReg; def ATOMIC_LOADW_XILF : AtomicLoadWBinaryImm; +let Predicates = [FeatureNoInterlockedAccess1] in { + def ATOMIC_LOAD_XR : AtomicLoadBinaryReg32; + def ATOMIC_LOAD_XILF : AtomicLoadBinaryImm32; + def ATOMIC_LOAD_XGR : AtomicLoadBinaryReg64; + def ATOMIC_LOAD_XILF64 : AtomicLoadBinaryImm64; + def ATOMIC_LOAD_XIHF64 : AtomicLoadBinaryImm64; +} def ATOMIC_LOADW_NRi : AtomicLoadWBinaryReg; def ATOMIC_LOADW_NILHi : AtomicLoadWBinaryImm; +def ATOMIC_LOAD_NRi : AtomicLoadBinaryReg32; +def ATOMIC_LOAD_NILLi : AtomicLoadBinaryImm32; +def ATOMIC_LOAD_NILHi : AtomicLoadBinaryImm32; +def ATOMIC_LOAD_NILFi : AtomicLoadBinaryImm32; +def ATOMIC_LOAD_NGRi : AtomicLoadBinaryReg64; +def ATOMIC_LOAD_NILL64i : AtomicLoadBinaryImm64; +def ATOMIC_LOAD_NILH64i : AtomicLoadBinaryImm64; +def ATOMIC_LOAD_NIHL64i : AtomicLoadBinaryImm64; +def ATOMIC_LOAD_NIHH64i : AtomicLoadBinaryImm64; +def ATOMIC_LOAD_NILF64i : AtomicLoadBinaryImm64; +def ATOMIC_LOAD_NIHF64i : AtomicLoadBinaryImm64; def ATOMIC_LOADW_MIN : AtomicLoadWBinaryReg; +def ATOMIC_LOAD_MIN_32 : AtomicLoadBinaryReg32; +def ATOMIC_LOAD_MIN_64 : AtomicLoadBinaryReg64; + def ATOMIC_LOADW_MAX : AtomicLoadWBinaryReg; +def ATOMIC_LOAD_MAX_32 : AtomicLoadBinaryReg32; +def ATOMIC_LOAD_MAX_64 : AtomicLoadBinaryReg64; + def ATOMIC_LOADW_UMIN : AtomicLoadWBinaryReg; +def ATOMIC_LOAD_UMIN_32 : AtomicLoadBinaryReg32; +def ATOMIC_LOAD_UMIN_64 : AtomicLoadBinaryReg64; + def ATOMIC_LOADW_UMAX : AtomicLoadWBinaryReg; +def 
ATOMIC_LOAD_UMAX_32 : AtomicLoadBinaryReg32; +def ATOMIC_LOAD_UMAX_64 : AtomicLoadBinaryReg64; def ATOMIC_CMP_SWAPW : Pseudo<(outs GR32:$dst), (ins bdaddr20only:$addr, GR32:$cmp, GR32:$swap, diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h index 4a83ba848dd88..3cb5edeee5b5e 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h @@ -17,6 +17,7 @@ #ifndef LLVM_LIB_TARGET_X86_MCA_X86CUSTOMBEHAVIOUR_H #define LLVM_LIB_TARGET_X86_MCA_X86CUSTOMBEHAVIOUR_H +#include "llvm/ADT/SmallVector.h" #include "llvm/MCA/CustomBehaviour.h" #include "llvm/TargetParser/TargetParser.h" diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index b13bf361ab79b..d2e3c708f59d0 100644 --- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -24,6 +24,7 @@ #include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d69976342fcbd..4fca5afb46dd2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8727,7 +8727,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // constants. Insertion into a zero vector is handled as a special-case // somewhere below here. if (NumConstants == NumElems - 1 && NumNonZero != 1 && - FrozenUndefMask.isZero() && (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) || isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) { // Create an all-constant vector. The variable element in the old @@ -48707,7 +48706,7 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, if (useVPTERNLOG(Subtarget, VT)) { // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C. // VPTERNLOG is only available as vXi32/64-bit types. - MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64; + MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64; MVT OpVT = MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits()); SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1)); @@ -49886,8 +49885,8 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, User->getValueSizeInBits(0).getFixedValue() > RegVT.getFixedSizeInBits()) { if (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && - cast(User)->getBasePtr() == Ptr && - cast(User)->getMemoryVT().getSizeInBits() == + cast(User)->getBasePtr() == Ptr && + cast(User)->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits()) { SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits()); @@ -49915,7 +49914,7 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, if (ISD::isNormalLoad(User)) { // See if we are loading a constant that matches in the lower // bits of a longer constant (but from a different constant pool ptr). 
- SDValue UserPtr = cast(User)->getBasePtr(); const Constant *LdC = getTargetConstantFromBasePtr(Ptr); const Constant *UserC = getTargetConstantFromBasePtr(UserPtr); if (LdC && UserC && UserPtr != Ptr && @@ -54482,7 +54481,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, SDValue Op0 = Ops[0]; bool IsSplat = llvm::all_equal(Ops); unsigned NumOps = Ops.size(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Repeated subvectors. if (IsSplat && @@ -54491,6 +54489,25 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, if (Op0.getOpcode() == X86ISD::VBROADCAST) return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0)); + // If this simple subvector or scalar/subvector broadcast_load is inserted + // into both halves, use a larger broadcast_load. Update other uses to use + // an extracted subvector. + if (ISD::isNormalLoad(Op0.getNode()) || + Op0.getOpcode() == X86ISD::VBROADCAST_LOAD || + Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) { + auto *Mem = cast<MemSDNode>(Op0); + unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD + ? X86ISD::VBROADCAST_LOAD + : X86ISD::SUBV_BROADCAST_LOAD; + if (SDValue BcastLd = + getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) { + SDValue BcastSrc = + extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()); + DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc); + return BcastLd; + } + } + // concat_vectors(movddup(x),movddup(x)) -> broadcast(x) if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 && (Subtarget.hasAVX2() || @@ -54939,7 +54956,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, if (SelVT.getVectorElementType() == MVT::i1) { SelVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumOps * SelVT.getVectorNumElements()); - if (TLI.isTypeLegal(SelVT)) + if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT)) return DAG.getNode(Op0.getOpcode(), DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0), ConcatSubOperand(VT, Ops, 1), @@ -54953,7 +54970,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) { EVT SelVT = Ops[0].getOperand(0).getValueType(); SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext()); - if (TLI.isTypeLegal(SelVT)) + if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT)) return DAG.getNode(Op0.getOpcode(), DL, VT, ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0), ConcatSubOperand(VT, Ops, 1), @@ -54977,28 +54994,6 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } } - // If this simple subvector or scalar/subvector broadcast_load is inserted - // into both halves, use a larger broadcast_load. Update other uses to use - // an extracted subvector. - if (IsSplat && - (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) { - if (ISD::isNormalLoad(Op0.getNode()) || - Op0.getOpcode() == X86ISD::VBROADCAST_LOAD || - Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) { - auto *Mem = cast<MemSDNode>(Op0); - unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD - ? X86ISD::VBROADCAST_LOAD - : X86ISD::SUBV_BROADCAST_LOAD; - if (SDValue BcastLd = - getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) { - SDValue BcastSrc = - extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()); - DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc); - return BcastLd; - } - } - } - // Attempt to fold target constant loads.
if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) { SmallVector<APInt> EltBits; diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp index 6301285fe9545..2a896314875d2 100644 --- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 40f4ba0be5503..ae47929fce6ac 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -202,7 +202,6 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { .Case("0xc20", "cortex-m0") .Case("0xc23", "cortex-m3") .Case("0xc24", "cortex-m4") - .Case("0xd24", "cortex-m52") .Case("0xd22", "cortex-m55") .Case("0xd02", "cortex-a34") .Case("0xd04", "cortex-a35") diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 255ce6973a16f..a991f0906052a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -514,8 +514,6 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(II.getType())); } - Constant *C; - if (IsTZ) { // cttz(-x) -> cttz(x) if (match(Op0, m_Neg(m_Value(X)))) @@ -551,38 +549,6 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { if (match(Op0, m_Intrinsic<Intrinsic::abs>(m_Value(X)))) return IC.replaceOperand(II, 0, X); - - // cttz(shl(%const, %val), 1) --> add(cttz(%const, 1), %val) - if (match(Op0, m_Shl(m_ImmConstant(C), m_Value(X))) && - match(Op1, m_One())) { - Value *ConstCttz = - IC.Builder.CreateBinaryIntrinsic(Intrinsic::cttz, C, Op1); - return BinaryOperator::CreateAdd(ConstCttz, X); - } - - // cttz(lshr exact (%const, %val), 1) --> sub(cttz(%const, 1), %val) - if (match(Op0, m_Exact(m_LShr(m_ImmConstant(C), m_Value(X)))) && - match(Op1, m_One())) { - Value *ConstCttz = - IC.Builder.CreateBinaryIntrinsic(Intrinsic::cttz, C, Op1); - return BinaryOperator::CreateSub(ConstCttz, X); - } - } else { - // ctlz(lshr(%const, %val), 1) --> add(ctlz(%const, 1), %val) - if (match(Op0, m_LShr(m_ImmConstant(C), m_Value(X))) && - match(Op1, m_One())) { - Value *ConstCtlz = - IC.Builder.CreateBinaryIntrinsic(Intrinsic::ctlz, C, Op1); - return BinaryOperator::CreateAdd(ConstCtlz, X); - } - - // ctlz(shl nuw (%const, %val), 1) --> sub(ctlz(%const, 1), %val) - if (match(Op0, m_NUWShl(m_ImmConstant(C), m_Value(X))) && - match(Op1, m_One())) { - Value *ConstCtlz = - IC.Builder.CreateBinaryIntrinsic(Intrinsic::ctlz, C, Op1); - return BinaryOperator::CreateSub(ConstCtlz, X); - } } KnownBits Known = IC.computeKnownBits(Op0, 0, &II); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index e42e011bd4361..4295c7692e6d5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -812,8 +812,8 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, // Only lower this if the icmp is the only user of the GEP or if we expect // the result to fold to a constant!
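The foldCttzCtlz folds removed above rest on two shift identities: with the intrinsic's second argument set to 1, a zero input is poison, so the result only has to be right when the shifted value is non-zero, and under that condition cttz(C << X) equals cttz(C) + X and ctlz(C >> X) equals ctlz(C) + X. A standalone C++20 check over every i8 value, using <bit> as a stand-in for the intrinsics:

```cpp
#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned C = 1; C < 256; ++C) {
    for (unsigned X = 0; X < 8; ++X) {
      // cttz(shl(C, X), 1) --> cttz(C, 1) + X, valid while shl is non-zero.
      auto Shl = static_cast<uint8_t>(C << X);
      if (Shl != 0)
        assert(std::countr_zero(Shl) ==
               std::countr_zero(static_cast<uint8_t>(C)) + static_cast<int>(X));
      // ctlz(lshr(C, X), 1) --> ctlz(C, 1) + X, valid while lshr is non-zero.
      auto Lshr = static_cast<uint8_t>(C >> X);
      if (Lshr != 0)
        assert(std::countl_zero(Lshr) ==
               std::countl_zero(static_cast<uint8_t>(C)) + static_cast<int>(X));
    }
  }
  return 0;
}
```

The lshr-exact and shl-nuw variants are the same identities run in the opposite direction, which is where the sub forms in the removed code came from.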
if ((GEPsInBounds || CmpInst::isEquality(Cond)) && - (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) && - (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse())) { + (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) && + (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) { // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2) ---> (OFFSET1 cmp OFFSET2) Value *L = EmitGEPOffset(GEPLHS); Value *R = EmitGEPOffset(GEPRHS); @@ -7034,7 +7034,7 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { return Res; { - Value *X, *Y; + Value *X, *Y, *Z; // Transform (X & ~Y) == 0 --> (X & Y) != 0 // and (X & ~Y) != 0 --> (X & Y) == 0 // if A is a power of 2. @@ -7044,18 +7044,32 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { return new ICmpInst(I.getInversePredicate(), Builder.CreateAnd(X, Y), Op1); - // Op0 pred Op1 -> ~Op1 pred ~Op0, if this allows us to drop an instruction. - if (Op0->getType()->isIntOrIntVectorTy()) { - bool ConsumesOp0, ConsumesOp1; - if (isFreeToInvert(Op0, Op0->hasOneUse(), ConsumesOp0) && - isFreeToInvert(Op1, Op1->hasOneUse(), ConsumesOp1) && - (ConsumesOp0 || ConsumesOp1)) { - Value *InvOp0 = getFreelyInverted(Op0, Op0->hasOneUse(), &Builder); - Value *InvOp1 = getFreelyInverted(Op1, Op1->hasOneUse(), &Builder); - assert(InvOp0 && InvOp1 && - "Mismatch between isFreeToInvert and getFreelyInverted"); - return new ICmpInst(I.getSwappedPredicate(), InvOp0, InvOp1); - } + // Transform (~X ^ Y) s< ~Z --> (X ^ Y) s> Z, + // (~X ^ Y) s> ~Z --> (X ^ Y) s< Z, + // (~X ^ Y) s<= ~Z --> (X ^ Y) s>= Z, + // (~X ^ Y) s>= ~Z --> (X ^ Y) s<= Z, + // (~X ^ Y) u< ~Z --> (X ^ Y) u> Z, + // (~X ^ Y) u> ~Z --> (X ^ Y) u< Z, + // (~X ^ Y) u<= ~Z --> (X ^ Y) u>= Z, + // (~X ^ Y) u>= ~Z --> (X ^ Y) u<= Z, + // (~X ^ Y) == ~Z --> (X ^ Y) == Z, + // and (~X ^ Y) != ~Z --> (X ^ Y) != Z, + if (match(&I, m_c_ICmp(Pred, m_c_Xor(m_Not(m_Value(X)), m_Value(Y)), + m_Not(m_Value(Z)))) && + (I.getOperand(0)->hasOneUse() || I.getOperand(1)->hasOneUse())) + return new ICmpInst(I.getSwappedPredicate(Pred), Builder.CreateXor(X, Y), + Z); + + // ~X < ~Y --> Y < X + // ~X < C --> X > ~C + if (match(Op0, m_Not(m_Value(X)))) { + if (match(Op1, m_Not(m_Value(Y)))) + return new ICmpInst(I.getPredicate(), Y, X); + + const APInt *C; + if (match(Op1, m_APInt(C))) + return new ICmpInst(I.getSwappedPredicate(), X, + ConstantInt::get(Op1->getType(), ~(*C))); } Instruction *AddI = nullptr; diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index f14541a1a037e..446aa497026d3 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -755,7 +755,7 @@ static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost, static std::optional<unsigned> shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo, const unsigned TripMultiple, const unsigned TripCount, - unsigned MaxTripCount, const UnrollCostEstimator UCE, + const UnrollCostEstimator UCE, const TargetTransformInfo::UnrollingPreferences &UP) { // Using unroll pragma @@ -776,10 +776,6 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo, if (PInfo.PragmaFullUnroll && TripCount != 0) return TripCount; - if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount && - MaxTripCount <= UnrollMaxUpperBound) - return MaxTripCount; - // if didn't return until here, should continue to other priorties return std::nullopt; } @@ -906,7 +902,7 @@ bool llvm::computeUnrollCount( // 1st priority is unroll count set by "unroll-count" option.
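The visitICmpInst transforms restored above all reduce to two facts: (~X ^ Y) is ~(X ^ Y), and bitwise-not reverses both the unsigned and the signed orderings, so a compare of two inverted values is the swapped compare of the originals. An exhaustive i8 check of both facts (plain C++, written against C++20 two's-complement semantics rather than the InstCombine APIs):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (int I = 0; I < 256; ++I) {
    for (int J = 0; J < 256; ++J) {
      auto X = static_cast<uint8_t>(I), Y = static_cast<uint8_t>(J);
      // (~X ^ Y) == ~(X ^ Y): inverting one xor operand inverts the result.
      assert(static_cast<uint8_t>(~X ^ Y) == static_cast<uint8_t>(~(X ^ Y)));
      // ~X u< ~Y <=> Y u< X: not reverses the unsigned order.
      assert((static_cast<uint8_t>(~X) < static_cast<uint8_t>(~Y)) == (Y < X));
      // ~X s< ~Y <=> Y s< X: not reverses the signed order too.
      assert((static_cast<int8_t>(~X) < static_cast<int8_t>(~Y)) ==
             (static_cast<int8_t>(Y) < static_cast<int8_t>(X)));
    }
  }
  return 0;
}
```

Together these give the table above: strip the nots from both sides and swap the predicate, with equality predicates unchanged by the swap, and ~X < C becoming X > ~C once ~C is folded to a constant.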
// 2nd priority is unroll count set by pragma. if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount, - MaxTripCount, UCE, UP)) { + UCE, UP)) { UP.Count = *UnrollFactor; if (UserUnrollCount || (PragmaCount > 0)) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d84c2717077f2..c07e9f043b7b8 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6888,30 +6888,6 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { ParamsOk = false; break; } - case VFParamKind::OMP_Linear: { - Value *ScalarParam = CI->getArgOperand(Param.ParamPos); - // Find the stride for the scalar parameter in this loop and see if - // it matches the stride for the variant. - // TODO: do we need to figure out the cost of an extract to get the - // first lane? Or do we hope that it will be folded away? - ScalarEvolution *SE = PSE.getSE(); - const auto *SAR = - dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam)); - - if (!SAR || SAR->getLoop() != TheLoop) { - ParamsOk = false; - break; - } - - const SCEVConstant *Step = - dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE)); - - if (!Step || - Step->getAPInt().getSExtValue() != Param.LinearStepOrPos) - ParamsOk = false; - - break; - } case VFParamKind::GlobalPredicate: UsesMask = true; break; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 3d17c202f7a7c..1891b211a3566 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -340,14 +340,8 @@ Value *VPInstruction::generateInstruction(VPTransformState &State, auto *Phi = State.get(getOperand(0), 0); // The loop step is equal to the vectorization factor (num of SIMD // elements) times the unroll factor (num of SIMD instructions). - Value *Step; - { - BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); - IRBuilder<> PHBuilder(VectorPH->getTerminator()); - // Step is loop-invariant, calls to vscale will be placed in the - // preheader.
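On the VPlanRecipes.cpp hunk that finishes just below: the comment's claim is that one vector-loop iteration covers VF * UF scalar elements (UF unrolled SIMD bodies of VF lanes each), so the canonical IV has to step by VF * UF. A scalar sketch of that striding, with VF and UF as plain constants rather than VPlan's ElementCount and unroll state:

```cpp
#include <cassert>
#include <vector>

int main() {
  const int VF = 4, UF = 2, N = 32;  // assumes N is divisible by VF * UF
  std::vector<int> A(N, 1), B(N, 2), C(N, 0);
  for (int IV = 0; IV < N; IV += VF * UF)    // canonical IV: Phi + VF * UF
    for (int Part = 0; Part < UF; ++Part)    // UF unrolled vector bodies
      for (int Lane = 0; Lane < VF; ++Lane)  // VF lanes per SIMD op
        C[IV + Part * VF + Lane] =
            A[IV + Part * VF + Lane] + B[IV + Part * VF + Lane];
  for (int I = 0; I < N; ++I)
    assert(C[I] == 3);
  return 0;
}
```

Because the step is loop-invariant either placement works; the change below simply builds it at the current insertion point instead of hoisting it into the preheader by hand.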
- Step = createStepForVF(PHBuilder, Phi->getType(), State.VF, State.UF); - } + Value *Step = + createStepForVF(Builder, Phi->getType(), State.VF, State.UF); return Builder.CreateAdd(Phi, Step, Name, hasNoUnsignedWrap(), hasNoSignedWrap()); } @@ -808,7 +802,6 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent, O << Indent << "WIDEN-CAST "; printAsOperand(O, SlotTracker); O << " = " << Instruction::getOpcodeName(Opcode) << " "; - printFlags(O); printOperands(O, SlotTracker); O << " to " << *getResultType(); } diff --git a/llvm/test/Analysis/ValueTracking/knownbits-and-or-xor-lowbit.ll b/llvm/test/Analysis/ValueTracking/knownbits-and-or-xor-lowbit.ll index 4ca7ed9eda7bb..f44948e6a089f 100644 --- a/llvm/test/Analysis/ValueTracking/knownbits-and-or-xor-lowbit.ll +++ b/llvm/test/Analysis/ValueTracking/knownbits-and-or-xor-lowbit.ll @@ -147,9 +147,10 @@ define i1 @sub_YX_and_bit0_is_zero_fail(i8 %x, i8 %C) nounwind { define <2 x i1> @sub_YX_xor_bit0_is_one_fail(<2 x i8> %x, <2 x i8> %C) nounwind { ; CHECK-LABEL: @sub_YX_xor_bit0_is_one_fail( -; CHECK-NEXT: [[TMP1:%.*]] = sub <2 x i8> [[X:%.*]], [[C:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i8> [[TMP1]], [[X]] -; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i8> [[TMP2]], +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[Y:%.*]] = add <2 x i8> [[TMP1]], [[C:%.*]] +; CHECK-NEXT: [[W:%.*]] = xor <2 x i8> [[Y]], [[X]] +; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i8> [[W]], ; CHECK-NEXT: ret <2 x i1> [[R]] ; %C1 = sub <2 x i8> %C, diff --git a/llvm/test/Assembler/named-metadata.ll b/llvm/test/Assembler/named-metadata.ll index 32ebf2bfd26f3..9fa37a7989d4f 100644 --- a/llvm/test/Assembler/named-metadata.ll +++ b/llvm/test/Assembler/named-metadata.ll @@ -23,7 +23,3 @@ ; when emitting it, followed by xfoo. !\xfoo = !{!0, !1, !2} ; CHECK: !\5Cxfoo = !{!0, !1, !2} - -; Make sure we handle escapes with the high bit set correctly. 
-!\FFfoo = !{!0, !1, !2} -; CHECK: !\FFfoo = !{!0, !1, !2} diff --git a/llvm/test/Assembler/struct-ret-without-upgrade.ll b/llvm/test/Assembler/struct-ret-without-upgrade.ll index 14f931c23abb6..992b2f9f767fa 100644 --- a/llvm/test/Assembler/struct-ret-without-upgrade.ll +++ b/llvm/test/Assembler/struct-ret-without-upgrade.ll @@ -15,15 +15,4 @@ define %ty @test(%ty %arg) { ret %ty %copy } -define %ty @test_not_real_intrinsic() { -; CHECK-LABEL: @test_not_real_intrinsic( -; CHECK-NEXT: [[RET:%.*]] = call [[TY:%.*]] @llvm.dummy() -; CHECK-NEXT: ret [[TY]] [[RET]] -; - %ret = call %ty @llvm.dummy() - ret %ty %ret -} - -declare %ty @llvm.dummy() - declare %ty @llvm.ssa.copy.s_tys(%ty) diff --git a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll index 945c271d37500..2f15e317a7f58 100644 --- a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll +++ b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll @@ -313,7 +313,7 @@ define void @static_16_align_131072(ptr %out) #0 { ; CHECK-NEXT: b .LBB9_1 ; CHECK-NEXT: .LBB9_3: // %entry ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: ldr xzr, [sp] ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: str x8, [x0] ; CHECK-NEXT: mov sp, x29 diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll index d247ed1b59977..a3b8df487ed48 100644 --- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll +++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll @@ -28,7 +28,7 @@ define void @dynamic(i64 %size, ptr %out) #0 { ; CHECK-NEXT: b .LBB0_1 ; CHECK-NEXT: .LBB0_3: ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: ldr xzr, [sp] ; CHECK-NEXT: str x8, [x1] ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 @@ -72,7 +72,7 @@ define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 { ; CHECK-NEXT: b .LBB1_1 ; CHECK-NEXT: .LBB1_3: ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: ldr xzr, [sp] ; CHECK-NEXT: str x8, [x2] ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 @@ -122,7 +122,7 @@ define void @dynamic_align_64(i64 %size, ptr %out) #0 { ; CHECK-NEXT: b .LBB2_1 ; CHECK-NEXT: .LBB2_3: ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: ldr xzr, [sp] ; CHECK-NEXT: str x8, [x1] ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: .cfi_def_cfa wsp, 32 @@ -167,7 +167,7 @@ define void @dynamic_align_8192(i64 %size, ptr %out) #0 { ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: add x9, x0, #15 ; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: ldr xzr, [sp] ; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 ; CHECK-NEXT: mov x19, sp ; CHECK-NEXT: sub x8, x8, x9 @@ -181,7 +181,7 @@ define void @dynamic_align_8192(i64 %size, ptr %out) #0 { ; CHECK-NEXT: b .LBB3_4 ; CHECK-NEXT: .LBB3_6: ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: ldr xzr, [sp] ; CHECK-NEXT: str x8, [x1] ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: .cfi_def_cfa wsp, 32 @@ -221,7 +221,7 @@ define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536 ; CHECK-NEXT: b .LBB4_1 ; CHECK-NEXT: .LBB4_3: ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: ldr xzr, [sp] ; CHECK-NEXT: str x8, [x1] ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 @@ -265,7 +265,7 @@ define void @no_reserved_call_frame(i64 %n) #0 { ; CHECK-NEXT: b .LBB5_1 ; CHECK-NEXT: .LBB5_3: // %entry ; CHECK-NEXT: mov sp, x0 -; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: ldr xzr, [sp] ; CHECK-NEXT: sub sp, sp, 
#1104 ; CHECK-NEXT: str xzr, [sp] ; CHECK-NEXT: bl callee_stack_args @@ -344,7 +344,7 @@ define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" { ; CHECK-NEXT: b .LBB7_1 ; CHECK-NEXT: .LBB7_3: ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: ldr xzr, [sp] ; CHECK-NEXT: str x8, [x1] ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: .cfi_def_cfa wsp, 32 diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll index 4dad104e66f20..03a9220ebfddc 100644 --- a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll +++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll @@ -115,7 +115,7 @@ define void @sve_17_vector(ptr %out) #0 { ; CHECK-NEXT: b .LBB3_1 ; CHECK-NEXT: .LBB3_3: // %entry ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: ldr xzr, [sp] ; CHECK-NEXT: .cfi_def_cfa_register wsp ; CHECK-NEXT: addvl sp, sp, #17 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 @@ -351,7 +351,7 @@ define void @sve_16v_1p_csr( %a) #0 { ; CHECK-NEXT: b .LBB9_1 ; CHECK-NEXT: .LBB9_3: // %entry ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: ldr xzr, [sp] ; CHECK-NEXT: .cfi_def_cfa_register wsp ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill @@ -467,7 +467,7 @@ define void @sve_1_vector_4096_arr(ptr %out) #0 { ; CHECK-NEXT: b .LBB11_1 ; CHECK-NEXT: .LBB11_3: // %entry ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: ldr xzr, [sp] ; CHECK-NEXT: .cfi_def_cfa_register wsp ; CHECK-NEXT: addvl sp, sp, #31 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x88, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 264 * VG @@ -516,7 +516,7 @@ define void @sve_1_vector_16_arr_align_8192(ptr %out) #0 { ; CHECK-NEXT: b .LBB12_1 ; CHECK-NEXT: .LBB12_3: // %entry ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: ldr xzr, [sp] ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload @@ -616,7 +616,7 @@ define void @sve_1028_64k_guard(ptr %out) #0 "stack-probe-size"="65536" { ; CHECK-NEXT: b .LBB14_1 ; CHECK-NEXT: .LBB14_3: // %entry ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: ldr xzr, [sp] ; CHECK-NEXT: .cfi_def_cfa_register wsp ; CHECK-NEXT: addvl sp, sp, #31 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1808 * VG diff --git a/llvm/test/CodeGen/AArch64/stack-probing.ll b/llvm/test/CodeGen/AArch64/stack-probing.ll index 5c5d9321a56e5..df5408de5bab0 100644 --- a/llvm/test/CodeGen/AArch64/stack-probing.ll +++ b/llvm/test/CodeGen/AArch64/stack-probing.ll @@ -400,7 +400,7 @@ define void @static_16_align_8192(ptr %out) #0 { ; CHECK-NEXT: b .LBB13_1 ; CHECK-NEXT: .LBB13_3: // %entry ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: ldr xzr, [sp] ; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: str x8, [x0] ; CHECK-NEXT: mov sp, x29 diff --git a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll index 680fae1869600..4760ddb65bf2d 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll @@ -1,7 +1,6 @@ ; REQUIRES: asserts -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV4 %s +; RUN: llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT %s ; RUN: not llc --crash -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=OPT,COV5 %s ; AMDGPUAttributor deletes the function "by accident" so it's never ; codegened with optimizations. @@ -15,12 +14,9 @@ ; OPT-NEXT: amdhsa.target: amdgcn-amd-amdhsa--gfx900 ; OPT-NEXT: amdhsa.version: ; OPT-NEXT: - 1 -; COV4: - 1 -; COV5: - 2 -; OPT: ... +; OPT-NEXT: - 1 +; OPT-NEXT: ... define internal i32 @func() { ret i32 0 } -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION} diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-osabi.ll b/llvm/test/CodeGen/AMDGPU/elf-header-osabi.ll index f8fc3e1e76480..e5de7859dab6f 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-header-osabi.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-osabi.ll @@ -1,12 +1,9 @@ ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx801 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NONE %s ; RUN: llc -filetype=obj -mtriple=amdgcn-amd- -mcpu=gfx801 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NONE %s ; RUN: llc -filetype=obj -mtriple=amdgcn-amd-unknown -mcpu=gfx801 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=NONE %s -; RUN: llc -filetype=obj -mtriple=amdgcn--amdhsa -mcpu=gfx801 --amdhsa-code-object-version=4 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=HSA,HSA4 %s -; RUN: llc -filetype=obj -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 --amdhsa-code-object-version=4 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=HSA,HSA4 %s -; RUN: llc -filetype=obj -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx801 --amdhsa-code-object-version=4 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=HSA,HSA4 %s -; RUN: llc -filetype=obj -mtriple=amdgcn--amdhsa -mcpu=gfx801 --amdhsa-code-object-version=5 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=HSA,HSA5 %s -; RUN: llc -filetype=obj -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 --amdhsa-code-object-version=5 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=HSA,HSA5 %s -; RUN: llc -filetype=obj -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx801 --amdhsa-code-object-version=5 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=HSA,HSA5 %s +; RUN: llc -filetype=obj -mtriple=amdgcn--amdhsa -mcpu=gfx801 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=HSA %s +; RUN: llc -filetype=obj -mtriple=amdgcn-amd-amdhsa -mcpu=gfx801 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=HSA %s +; RUN: llc -filetype=obj -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx801 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=HSA %s ; RUN: llc -filetype=obj -mtriple=amdgcn--amdpal -mcpu=gfx801 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=PAL %s ; RUN: llc -filetype=obj -mtriple=amdgcn-amd-amdpal -mcpu=gfx801 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=PAL %s ; RUN: llc -filetype=obj -mtriple=amdgcn-unknown-amdpal -mcpu=gfx801 < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=PAL %s @@ -16,8 +13,7 @@ ; NONE: OS/ABI: SystemV (0x0) ; HSA: OS/ABI: AMDGPU_HSA (0x40) -; HSA4: ABIVersion: 2 -; HSA5: ABIVersion: 3 +; HSA: ABIVersion: 2 ; PAL: OS/ABI: AMDGPU_PAL (0x41) ; PAL: ABIVersion: 0 ; MESA3D: OS/ABI: AMDGPU_MESA3D (0x42) diff --git 
a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 40939ee69f67e..61017e809c863 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -17,9 +17,6 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s - ; Test for integer mad formation for patterns used in clpeak define i32 @clpeak_imad_pat_i32(i32 %x, i32 %y) { @@ -146,36 +143,6 @@ define i32 @clpeak_imad_pat_i32(i32 %x, i32 %y) { ; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_i32: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v5, v0, v1 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, v5, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[2:3] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[3:4] -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_i32: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 1, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add i32 %x, 1 %add = mul i32 %y18, %y @@ -304,35 +271,6 @@ define signext i16 @clpeak_imad_pat_i16(i16 signext %x, i16 signext %y) { ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %conv33 = add i16 %x, 1 %add = mul i16 %conv33, %y @@ -524,38 +462,6 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i16> %x, %add = mul <2 x i16> %y18, %y @@ -813,60 +719,6 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v3i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v3i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v4, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y48 = add <3 x i16> %x, %add = mul <3 x i16> %y48, %y @@ -1210,60 +1062,6 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v4i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: 
v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v4i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v4, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <4 x i16> %x, %add = mul <4 x i16> %y18, %y @@ -1386,35 +1184,6 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_umad_pat_i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_umad_pat_i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %conv33 = add i16 %x, 1 %add = mul i16 %conv33, %y @@ -1606,38 +1375,6 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_umad_pat_v2i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_umad_pat_v2i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i16> %x, %add = mul <2 x i16> %y18, %y @@ -1895,60 +1632,6 @@ define <3 x i16> @clpeak_umad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) { ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_umad_pat_v3i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_umad_pat_v3i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v4, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y48 = add <3 x i16> %x, %add = mul <3 x i16> %y48, %y @@ -2292,60 +1975,6 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) { ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_umad_pat_v4i16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v4, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v5, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v5, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v3, v4, -1 
op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v4, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v5, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_umad_pat_v4i16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v4, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <4 x i16> %x, %add = mul <4 x i16> %y18, %y @@ -2557,54 +2186,6 @@ define <2 x i32> @clpeak_imad_pat_v2i32(<2 x i32> %x, <2 x i32> %y) { ; GFX10-GISEL-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX10-GISEL-NEXT: v_mul_lo_u32 v1, v3, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i32: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v7, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, v6, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, v7, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v0, v2 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[4:5], null, v2, v6, v[2:3] -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[5:6], null, v3, v7, v[3:4] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, v[4:5] -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[5:6] -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i32: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v5, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v4, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, v5, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v1, v3 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v2, 1, v4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v3, 1, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v2, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v2, v0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v3, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i32> %x, %add = mul <2 x i32> %y18, %y @@ -2890,74 +2471,6 @@ define <3 x i32> @clpeak_imad_pat_v3i32(<3 x i32> %x, <3 x i32> %y) { ; GFX10-GISEL-NEXT: v_mul_lo_u32 v1, v4, v1 ; GFX10-GISEL-NEXT: v_mul_lo_u32 v2, v5, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v3i32: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v2, 1, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v7, v0, v3 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v8, v1, v4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v9, v2, v5 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, v7, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, v8, v1 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v6, v9, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v0, v3 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v3, v1, v4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v4, v6, v5 -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[5:6], null, v2, v7, v[2:3] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[6:7], null, v3, v8, v[3:4] -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[7:8], null, v4, v9, v[4:5] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v5, v2, v[5:6] -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[1:2], null, v6, v3, v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[2:3], null, v7, v4, v[7:8] -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v3i32: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v2, 1, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v6, v0, v3 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v7, v1, v4 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v8, v2, v5 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v6, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, v7, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v2, v8, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v0, v3 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v1, v4 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v2, v2, v5 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v3, 1, v6 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v7 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 1, v8 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v3, v0, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v4, v1, v4 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v5, v2, v5 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v2, 1, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v3, v0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v4, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v2, v5, v2 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y48 = add <3 x i32> %x, %add = mul <3 x i32> %y48, %y @@ -3281,90 +2794,6 @@ define <4 x i32> @clpeak_imad_pat_v4i32(<4 x i32> %x, <4 x i32> %y) { ; GFX10-GISEL-NEXT: v_mul_lo_u32 v2, v5, v2 ; GFX10-GISEL-NEXT: v_mul_lo_u32 v3, v6, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v4i32: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v2, 1, v2 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v3, 1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v8, v0, v4 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v9, v1, v5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v10, v2, v6 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v11, v3, v7 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, v8, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, v9, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v12, v10, v2 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v0, v4 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, v11, v3 -; GFX11-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v3, v1, v5 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v4, v12, v6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v5, v0, v7 -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[6:7], null, v2, v8, v[2:3] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[7:8], null, v3, v9, v[3:4] -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[8:9], null, v4, v10, v[4:5] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[9:10], null, v5, v11, v[5:6] -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[7:8] -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[2:3], null, v8, v4, v[8:9] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[3:4], null, v9, v5, v[9:10] -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v4i32: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v2, 1, v2 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v3, 1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v8, v0, v4 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v9, v1, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v10, v2, v6 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v11, v3, v7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v8, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, v9, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v2, v10, v2 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v3, v11, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v0, v4 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v1, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v2, v2, v6 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v3, v3, v7 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v4, 1, v8 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v5, 1, v9 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v6, 1, v10 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v7, 1, v11 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v8, 1, v0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v0, v4 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v4, v1, v5 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v5, v2, v6 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v6, v3, v7 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v2, 1, v2 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v3, 1, v3 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v0, v8 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v4, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v2, v5, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v3, v6, v3 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add 
<4 x i32> %x, %add = mul <4 x i32> %y18, %y @@ -3521,40 +2950,6 @@ define i32 @clpeak_imad_pat_i24(i32 %x, i32 %y) { ; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_i24: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GFX11-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 24 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v5, v1, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, v5, v0 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[2:3] -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[3:4] -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_i24: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 24 -; GFX11-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 24 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v2, v1, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 1, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %shl = shl i32 %x, 8 %shr = ashr exact i32 %shl, 8 @@ -3715,40 +3110,6 @@ define i32 @clpeak_imad_pat_u24(i32 %x, i32 %y) { ; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_u24: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v5, v1, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, v5, v0 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[2:3] -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[3:4] -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_u24: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: 
v_mul_lo_u32 v2, v1, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 1, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %shl = and i32 %x, 16777215 %shl1 = and i32 %y, 16777215 @@ -3879,35 +3240,6 @@ define signext i8 @clpeak_imad_pat_i8(i8 signext %x, i8 signext %y) { ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_i8: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_i8: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %conv33 = add i8 %x, 1 %add = mul i8 %conv33, %y @@ -4115,60 +3447,6 @@ define <2 x i8> @clpeak_imad_pat_v2i8(<2 x i8> %x, <2 x i8> %y) { ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v4 ; GFX10-GISEL-NEXT: v_mul_lo_u16 v1, v1, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i8: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-SDAG-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mad_u16 v4, v1, v3, v1 -; GFX11-SDAG-NEXT: v_mad_u16 v5, v0, v2, v0 -; GFX11-SDAG-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX11-SDAG-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_mul_lo_u16 v3, v4, v3 -; GFX11-SDAG-NEXT: v_mul_lo_u16 v2, v5, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v3, v1, v3 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v2, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v1, v3, v1 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v2, v0 -; GFX11-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_lshlrev_b16 v2, 8, v1 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i8: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v4, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v5, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v4, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v5, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v4, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v3, v5, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_add_nc_u16 v4, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v5, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v4 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v5 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i8> %x, %add = mul <2 x i8> %y18, %y @@ -4527,75 +3805,6 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX10-GISEL-NEXT: v_mul_lo_u32 v3, v3, v4 ; GFX10-GISEL-NEXT: v_add3_u32 v1, v1, v2, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_i64: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1 -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v7, v4, v3 -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v6, v5, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v7, v6 -; GFX11-SDAG-NEXT: v_add_co_u32 v6, vcc_lo, v0, v4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo -; GFX11-SDAG-NEXT: v_mul_lo_u32 v7, v6, v3 -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[3:4], null, v6, v2, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v5, v2 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v1, v3, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v4, v4, v7, v2 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[5:6], null, v3, v0, v[3:4] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_add3_u32 v6, 
v2, v6, v1 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v5, v4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v4, v6, v3 -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v5, v3, v[5:6] -; GFX11-SDAG-NEXT: v_add3_u32 v1, v4, v1, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_i64: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v6, v4, v3 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v7, v5, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_add3_u32 v1, v1, v6, v7 -; GFX11-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v4 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v1, v5, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v6, v2, 0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v3, v6, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v2, v7, v2 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v4, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_add3_u32 v5, v5, v3, v2 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v4, v0, 0 -; GFX11-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v5, v0 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo -; GFX11-GISEL-NEXT: v_add3_u32 v3, v3, v1, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v2, v2, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v3, v3, v4 -; GFX11-GISEL-NEXT: v_add3_u32 v1, v1, v2, v3 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add i64 %x, 1 %add = mul i64 %y18, %y @@ -5249,120 +4458,6 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX10-GISEL-NEXT: v_add3_u32 v1, v1, v4, v7 ; GFX10-GISEL-NEXT: v_add3_u32 v3, v3, v6, v5 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i64: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo -; GFX11-SDAG-NEXT: v_add_co_u32 v10, vcc_lo, v2, 1 -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v3, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v12, v9, v4 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v13, v8, v5 -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v8, v4, 0 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v14, v11, v6 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v15, v10, v7 -; GFX11-SDAG-NEXT: v_mad_u64_u32 
v[2:3], null, v10, v6, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v13, v12 -; GFX11-SDAG-NEXT: v_add3_u32 v12, v3, v15, v14 -; GFX11-SDAG-NEXT: v_add_co_u32 v3, vcc_lo, v0, v8 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v9, vcc_lo -; GFX11-SDAG-NEXT: v_add_co_u32 v9, vcc_lo, v2, v10 -; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v12, v11, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v11, v8, v4 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v13, v3, v5 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v14, v9, v7 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v10, v10, v6 -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[7:8], null, v9, v6, 0 -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[5:6], null, v3, v4, 0 -; GFX11-SDAG-NEXT: v_add3_u32 v8, v8, v14, v10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_add3_u32 v6, v6, v13, v11 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v11, v8, v2 -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[9:10], null, v7, v2, v[7:8] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v13, v6, v0 -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[3:4], null, v5, v0, v[5:6] -; GFX11-SDAG-NEXT: v_mul_lo_u32 v0, v5, v1 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v1, v7, v12 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v6, v3, v6 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_add3_u32 v4, v13, v4, v0 -; GFX11-SDAG-NEXT: v_add3_u32 v10, v11, v10, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v11, v4, v5 -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v3, v5, v[3:4] -; GFX11-SDAG-NEXT: v_mul_lo_u32 v4, v9, v8 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v5, v10, v7 -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[2:3], null, v9, v7, v[9:10] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_add3_u32 v1, v11, v1, v6 -; GFX11-SDAG-NEXT: v_add3_u32 v3, v5, v3, v4 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i64: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v10, vcc_lo, v2, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v3, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v8, v4, 0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v12, v8, v5 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v13, v9, v4 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v10, v6, 0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v14, v10, v7 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v15, v11, v6 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add3_u32 v1, v1, v12, v13 -; GFX11-GISEL-NEXT: v_add3_u32 v12, v3, v14, v15 -; GFX11-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, v0, v8 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v13, 
vcc_lo, v1, v9, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v10, vcc_lo, v2, v10 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v12, v11, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[8:9], null, v3, v4, 0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v5, v3, v5 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v13, v13, v4 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[3:4], null, v10, v6, 0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v7, v10, v7 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v6, v11, v6 -; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, 1 -; GFX11-GISEL-NEXT: v_add3_u32 v9, v9, v5, v13 -; GFX11-GISEL-NEXT: v_add3_u32 v10, v4, v7, v6 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v12, vcc_lo -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[4:5], null, v8, v0, 0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v8, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v9, v0 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[6:7], null, v3, v2, 0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v11, v3, v11 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v2, v10, v2 -; GFX11-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v8, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo -; GFX11-GISEL-NEXT: v_add_co_u32 v12, vcc_lo, v3, 1 -; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, 0, v10, vcc_lo -; GFX11-GISEL-NEXT: v_add3_u32 v3, v5, v1, v0 -; GFX11-GISEL-NEXT: v_add3_u32 v5, v7, v11, v2 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[0:1], null, v4, v8, 0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v7, v3, v8 -; GFX11-GISEL-NEXT: v_mad_u64_u32 v[2:3], null, v6, v12, 0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v6, v6, v10 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v5, v5, v12 -; GFX11-GISEL-NEXT: v_add3_u32 v1, v1, v4, v7 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add3_u32 v3, v3, v6, v5 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y18 = add <2 x i64> %x, %add = mul <2 x i64> %y18, %y @@ -5557,26 +4652,6 @@ define i32 @v_multi_use_mul_chain_add_other_use_all(i32 %arg, i32 %arg1, i32 %ar ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_multi_use_mul_chain_add_other_use_all: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v2 -; GFX11-NEXT: v_mul_lo_u32 v5, v1, v0 -; GFX11-NEXT: global_store_b32 v[3:4], v2, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[3:4], v1, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[3:4], v5, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v0, v5, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %i = add i32 %arg, 1 %i3 = mul i32 %i, %arg1 @@ -5753,24 +4828,6 @@ define i32 @v_multi_use_mul_chain_add_other_use_some(i32 %arg, i32 %arg1, i32 %a ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: 
v_multi_use_mul_chain_add_other_use_some: -; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v2 -; GFX11-NEXT: v_mul_lo_u32 v5, v0, v1 -; GFX11-NEXT: global_store_b32 v[3:4], v2, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v[3:4], v5, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v0, v5, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %i = add i32 %arg, 1 %i3 = mul i32 %i, %arg1 @@ -5988,58 +5045,6 @@ define i32 @clpeak_imad_pat_i32_x2(i32 %x, i32 %y) { ; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 ; GFX10-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_i32_x2: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v0, v0, v1 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 1, v2 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 1, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v5, v0, v1 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, v5, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[2:3] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[3:4] -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_i32_x2: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 1, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 1, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v2, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; 
GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 1, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y38 = add i32 %x, 1 %add = mul i32 %y38, %y @@ -6419,98 +5424,6 @@ define <2 x i32> @clpeak_imad_pat_v2i32_x2(<2 x i32> %x, <2 x i32> %y) { ; GFX10-GISEL-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX10-GISEL-NEXT: v_mul_lo_u32 v1, v3, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i32_x2: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v5, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v0, v4, v0 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, v5, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v0, v0, v2 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v1, v1, v3 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v2, 1, v4 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v3, 1, v5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v5, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v2, v4, v2 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v3, v5, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v0, v2, v0 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v1, v3, v1 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v2, 1, v4 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v3, 1, v5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v7, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v2, v6, v2 -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v3, v7, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_lo_u32 v2, v2, v0 -; GFX11-SDAG-NEXT: v_mul_lo_u32 v3, v3, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[4:5], null, v2, v6, v[2:3] -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[5:6], null, v3, v7, v[3:4] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, v[4:5] -; GFX11-SDAG-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[5:6] -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i32_x2: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v5, 
v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v4, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, v5, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v1, v3 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v2, 1, v4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v3, 1, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v5, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v2, v4, v2 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v3, v5, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v2, v0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v3, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v2, 1, v4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v3, 1, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v5, v1, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v2, v4, v2 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v3, v5, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v2, v0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v3, v1 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v2, 1, v4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v3, 1, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v2, v0, v2 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 1, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_mul_lo_u32 v0, v2, v0 -; GFX11-GISEL-NEXT: v_mul_lo_u32 v1, v3, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y38 = add <2 x i32> %x, %add = mul <2 x i32> %y38, %y @@ -6715,51 +5628,6 @@ define signext i16 @clpeak_imad_pat_i16_x2(i16 signext %x, i16 signext %y) { ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_i16_x2: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: 
clpeak_imad_pat_i16_x2: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %conv69 = add i16 %x, 1 %add = mul i16 %conv69, %y @@ -6958,51 +5826,6 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX10-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_umad_pat_i16_x2: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v1, v0, v1, v0 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v1, v0, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_umad_pat_i16_x2: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v1, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v2, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v1, v0 -; 
GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, 1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 1 -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %conv69 = add i16 %x, 1 %add = mul i16 %conv69, %y @@ -7326,60 +6149,6 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_imad_pat_v2i16_x2: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_imad_pat_v2i16_x2: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 
v2, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y38 = add <2 x i16> %x, %add = mul <2 x i16> %y38, %y @@ -7703,60 +6472,6 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) { ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: clpeak_umad_pat_v2i16_x2: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v2, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: v_pk_sub_u16 v2, v0, -1 op_sel_hi:[1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: clpeak_umad_pat_v2i16_x2: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v1, v2 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v2, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v2, v0, v1 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 -; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 
v0, v0, v1 -; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %y38 = add <2 x i16> %x, %add = mul <2 x i16> %y38, %y @@ -7810,15 +6525,6 @@ define <2 x i32> @multi_use_mul_mad_i32_var(i32 %x, i32 %y, i32 %z0, i32 %z1) { ; GFX10-NEXT: v_add_nc_u32_e32 v0, v1, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: multi_use_mul_mad_i32_var: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v0, v1, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %mul = mul i32 %x, %y %add0 = add i32 %mul, %z0 @@ -7900,27 +6606,6 @@ define <2 x i16> @multi_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z0, i16 %z1) { ; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: multi_use_mul_mad_i16_var: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mad_u16 v2, v0, v1, v2 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v3 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: multi_use_mul_mad_i16_var: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v0, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v0, v2 -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v0, v3 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %mul = mul i16 %x, %y %add0 = add i16 %mul, %z0 @@ -7968,16 +6653,6 @@ define i32 @other_use_mul_mad_i32_var(i32 %x, i32 %y, i32 %z, ptr addrspace(3) % ; GFX10-NEXT: ds_write_b32 v3, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: other_use_mul_mad_i32_var: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v0, v1, v2 -; GFX11-NEXT: ds_store_b32 v3, v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %mul = mul i32 %x, %y %add0 = add i32 %mul, %z @@ -8046,25 +6721,6 @@ define i16 @other_use_mul_mad_i16_var(i16 %x, i16 %y, i16 %z, ptr addrspace(3) % ; GFX10-GISEL-NEXT: ds_write_b16 v3, v1 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-LABEL: other_use_mul_mad_i16_var: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mul_lo_u16 v4, v0, v1 -; GFX11-SDAG-NEXT: v_mad_u16 v0, v0, v1, v2 -; GFX11-SDAG-NEXT: ds_store_b16 v3, v4 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: other_use_mul_mad_i16_var: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mul_lo_u16 v1, v0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u16 v0, v1, v2 -; GFX11-GISEL-NEXT: 
ds_store_b16 v3, v1 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] entry: %mul = mul i16 %x, %y %add0 = add i16 %mul, %z @@ -8160,15 +6816,6 @@ define <4 x i16> @multi_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i ; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 ; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: multi_use_mul_mad_v2i16_var: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_mul_lo_u16 v1, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_u16 v0, v1, v2 -; GFX11-NEXT: v_pk_add_u16 v1, v1, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %mul = mul <2 x i16> %x, %y %add0 = add <2 x i16> %mul, %z0 @@ -8274,16 +6921,6 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i ; GFX10-NEXT: ds_write_b32 v3, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: other_use_mul_mad_v2i16_var: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_mul_lo_u16 v1, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_u16 v0, v1, v2 -; GFX11-NEXT: ds_store_b32 v3, v1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %mul = mul <2 x i16> %x, %y %add0 = add <2 x i16> %mul, %z diff --git a/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir b/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir index f0c5ba489ef3d..83e85ccf7f8f1 100644 --- a/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir +++ b/llvm/test/CodeGen/AMDGPU/regpressure_printer.mir @@ -47,46 +47,87 @@ body: | name: live_through_test tracksRegLiveness: true body: | - ; RP-LABEL: name: live_through_test - ; RP: bb.0: - ; RP-NEXT: Live-in: - ; RP-NEXT: SGPR VGPR - ; RP-NEXT: 0 0 - ; RP-NEXT: 4 0 %0:sgpr_128 = IMPLICIT_DEF - ; RP-NEXT: 3 0 - ; RP-NEXT: Live-out: %0:00000000000000F3 - ; RP-NEXT: Live-thr: - ; RP-NEXT: 0 0 - ; RP-NEXT: bb.1: - ; RP-NEXT: Live-in: %0:00000000000000F3 - ; RP-NEXT: SGPR VGPR - ; RP-NEXT: 3 0 - ; RP-NEXT: 3 0 S_NOP 0, implicit %0.sub0:sgpr_128 - ; RP-NEXT: 2 0 - ; RP-NEXT: 3 0 %0.sub0:sgpr_128 = IMPLICIT_DEF - ; RP-NEXT: 3 0 - ; RP-NEXT: 4 0 %0.sub1:sgpr_128 = IMPLICIT_DEF - ; RP-NEXT: 3 0 - ; RP-NEXT: 3 0 S_NOP 0, implicit %0.sub2:sgpr_128 - ; RP-NEXT: 2 0 - ; RP-NEXT: 3 0 %0.sub2:sgpr_128 = IMPLICIT_DEF - ; RP-NEXT: 3 0 - ; RP-NEXT: 3 0 S_NOP 0, implicit %0.sub2:sgpr_128 - ; RP-NEXT: 2 0 - ; RP-NEXT: 2 0 S_NOP 0, implicit %0.sub3:sgpr_128 - ; RP-NEXT: 2 0 - ; RP-NEXT: Live-out: %0:00000000000000C3 - ; RP-NEXT: Live-thr: %0:00000000000000C0 - ; RP-NEXT: 1 0 - ; RP-NEXT: bb.2: - ; RP-NEXT: Live-in: %0:00000000000000C3 - ; RP-NEXT: SGPR VGPR - ; RP-NEXT: 2 0 - ; RP-NEXT: 2 0 S_NOP 0, implicit %0.sub3:sgpr_128, implicit %0.sub0:sgpr_128 - ; RP-NEXT: 0 0 - ; RP-NEXT: Live-out: - ; RP-NEXT: Live-thr: - ; RP-NEXT: 0 0 + ; RPU-LABEL: name: live_through_test + ; RPU: bb.0: + ; RPU-NEXT: Live-in: + ; RPU-NEXT: SGPR VGPR + ; RPU-NEXT: 0 0 + ; RPU-NEXT: 3 0 %0:sgpr_128 = IMPLICIT_DEF + ; RPU-NEXT: 3 0 + ; RPU-NEXT: Live-out: %0:00000000000000F3 + ; RPU-NEXT: Live-thr: + ; RPU-NEXT: 0 0 + ; RPU-NEXT: bb.1: + ; RPU-NEXT: Live-in: %0:00000000000000F3 + ; RPU-NEXT: SGPR VGPR + ; RPU-NEXT: 3 0 + ; RPU-NEXT: 3 0 S_NOP 0, implicit %0.sub0:sgpr_128 + ; RPU-NEXT: 2 0 + ; RPU-NEXT: 3 0 %0.sub0:sgpr_128 = IMPLICIT_DEF + ; RPU-NEXT: 3 0 + ; RPU-NEXT: 3 0 %0.sub1:sgpr_128 = IMPLICIT_DEF + ; RPU-NEXT: 3 0 + ; RPU-NEXT: 3 
0 S_NOP 0, implicit %0.sub2:sgpr_128 + ; RPU-NEXT: 2 0 + ; RPU-NEXT: 3 0 %0.sub2:sgpr_128 = IMPLICIT_DEF + ; RPU-NEXT: 3 0 + ; RPU-NEXT: 3 0 S_NOP 0, implicit %0.sub2:sgpr_128 + ; RPU-NEXT: 2 0 + ; RPU-NEXT: 2 0 S_NOP 0, implicit %0.sub3:sgpr_128 + ; RPU-NEXT: 2 0 + ; RPU-NEXT: Live-out: %0:00000000000000C3 + ; RPU-NEXT: Live-thr: %0:00000000000000C0 + ; RPU-NEXT: 1 0 + ; RPU-NEXT: bb.2: + ; RPU-NEXT: Live-in: %0:00000000000000C3 + ; RPU-NEXT: SGPR VGPR + ; RPU-NEXT: 2 0 + ; RPU-NEXT: 2 0 S_NOP 0, implicit %0.sub3:sgpr_128, implicit %0.sub0:sgpr_128 + ; RPU-NEXT: 0 0 + ; RPU-NEXT: Live-out: + ; RPU-NEXT: Live-thr: + ; RPU-NEXT: 0 0 + ; + ; RPD-LABEL: name: live_through_test + ; RPD: bb.0: + ; RPD-NEXT: Live-in: + ; RPD-NEXT: SGPR VGPR + ; RPD-NEXT: 0 0 + ; RPD-NEXT: 4 0 %0:sgpr_128 = IMPLICIT_DEF + ; RPD-NEXT: 3 0 + ; RPD-NEXT: Live-out: %0:00000000000000F3 + ; RPD-NEXT: Live-thr: + ; RPD-NEXT: 0 0 + ; RPD-NEXT: bb.1: + ; RPD-NEXT: Live-in: %0:00000000000000F3 + ; RPD-NEXT: SGPR VGPR + ; RPD-NEXT: 3 0 + ; RPD-NEXT: 3 0 S_NOP 0, implicit %0.sub0:sgpr_128 + ; RPD-NEXT: 2 0 + ; RPD-NEXT: 3 0 %0.sub0:sgpr_128 = IMPLICIT_DEF + ; RPD-NEXT: 3 0 + ; RPD-NEXT: 4 0 %0.sub1:sgpr_128 = IMPLICIT_DEF + ; RPD-NEXT: 3 0 + ; RPD-NEXT: 3 0 S_NOP 0, implicit %0.sub2:sgpr_128 + ; RPD-NEXT: 2 0 + ; RPD-NEXT: 3 0 %0.sub2:sgpr_128 = IMPLICIT_DEF + ; RPD-NEXT: 3 0 + ; RPD-NEXT: 3 0 S_NOP 0, implicit %0.sub2:sgpr_128 + ; RPD-NEXT: 2 0 + ; RPD-NEXT: 2 0 S_NOP 0, implicit %0.sub3:sgpr_128 + ; RPD-NEXT: 2 0 + ; RPD-NEXT: Live-out: %0:00000000000000C3 + ; RPD-NEXT: Live-thr: %0:00000000000000C0 + ; RPD-NEXT: 1 0 + ; RPD-NEXT: bb.2: + ; RPD-NEXT: Live-in: %0:00000000000000C3 + ; RPD-NEXT: SGPR VGPR + ; RPD-NEXT: 2 0 + ; RPD-NEXT: 2 0 S_NOP 0, implicit %0.sub3:sgpr_128, implicit %0.sub0:sgpr_128 + ; RPD-NEXT: 0 0 + ; RPD-NEXT: Live-out: + ; RPD-NEXT: Live-thr: + ; RPD-NEXT: 0 0 bb.0: %0:sgpr_128 = IMPLICIT_DEF bb.1: @@ -182,7 +223,7 @@ body: | ; RPU-NEXT: 0 7 ; RPU-NEXT: 0 7 %7:vgpr_32 = GLOBAL_LOAD_DWORD %5:vreg_64, 0, 0, implicit $exec ; RPU-NEXT: 0 6 - ; RPU-NEXT: 0 8 %8:vreg_64 = IMPLICIT_DEF + ; RPU-NEXT: 0 7 %8:vreg_64 = IMPLICIT_DEF ; RPU-NEXT: 0 7 ; RPU-NEXT: 0 9 %9:vreg_64 = IMPLICIT_DEF ; RPU-NEXT: 0 9 @@ -221,7 +262,7 @@ body: | ; RPU-NEXT: 0 12 ; RPU-NEXT: 0 12 dead %21:vgpr_32 = GLOBAL_LOAD_DWORD %14:vreg_64, 0, 0, implicit $exec ; RPU-NEXT: 0 10 - ; RPU-NEXT: 0 11 dead %22:vgpr_32 = GLOBAL_LOAD_DWORD %15:vreg_64, 0, 0, implicit $exec + ; RPU-NEXT: 0 10 dead %22:vgpr_32 = GLOBAL_LOAD_DWORD %15:vreg_64, 0, 0, implicit $exec ; RPU-NEXT: 0 10 ; RPU-NEXT: 0 10 %23:vreg_64 = V_LSHLREV_B64_e64 2, %8:vreg_64, implicit $exec ; RPU-NEXT: 0 9 @@ -509,7 +550,7 @@ body: | ; RPU-NEXT: 0 0 ; RPU-NEXT: 0 0 $sgpr0 = S_BUFFER_LOAD_DWORD_IMM $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0 ; RPU-NEXT: 0 0 - ; RPU-NEXT: 0 1 undef %0.sub5:vreg_512 = V_MOV_B32_e32 5, implicit $exec + ; RPU-NEXT: 0 0 undef %0.sub5:vreg_512 = V_MOV_B32_e32 5, implicit $exec ; RPU-NEXT: 0 0 ; RPU-NEXT: 0 0 S_CMP_GT_U32 $sgpr0, 15, implicit-def $scc ; RPU-NEXT: 0 0 @@ -528,7 +569,7 @@ body: | ; RPU-NEXT: 0 1 ; RPU-NEXT: 0 1 $m0 = S_MOV_B32 killed $sgpr0 ; RPU-NEXT: 0 1 - ; RPU-NEXT: 0 16 %0:vreg_512 = V_INDIRECT_REG_WRITE_MOVREL_B32_V16 %0:vreg_512(tied-def 0), 42, 3, implicit $m0, implicit $exec + ; RPU-NEXT: 0 1 %0:vreg_512 = V_INDIRECT_REG_WRITE_MOVREL_B32_V16 %0:vreg_512(tied-def 0), 42, 3, implicit $m0, implicit $exec ; RPU-NEXT: 0 1 ; RPU-NEXT: Live-out: %0:0000000000000C00 ; RPU-NEXT: Live-thr: @@ -668,19 +709,33 @@ tracksRegLiveness: true body: | bb.0: 
liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; RP-LABEL: name: test_partially_used_early_clobber_def - ; RP: Live-in: - ; RP-NEXT: SGPR VGPR - ; RP-NEXT: 0 0 - ; RP-NEXT: 4 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; RP-NEXT: 4 0 - ; RP-NEXT: 8 0 early-clobber %1:sgpr_128 = COPY %0:sgpr_128 - ; RP-NEXT: 1 0 - ; RP-NEXT: 1 0 S_NOP 0, implicit %1.sub1:sgpr_128 - ; RP-NEXT: 0 0 - ; RP-NEXT: Live-out: - ; RP-NEXT: Live-thr: - ; RP-NEXT: 0 0 + ; RPU-LABEL: name: test_partially_used_early_clobber_def + ; RPU: Live-in: + ; RPU-NEXT: SGPR VGPR + ; RPU-NEXT: 0 0 + ; RPU-NEXT: 4 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; RPU-NEXT: 4 0 + ; RPU-NEXT: 5 0 early-clobber %1:sgpr_128 = COPY %0:sgpr_128 + ; RPU-NEXT: 1 0 + ; RPU-NEXT: 1 0 S_NOP 0, implicit %1.sub1:sgpr_128 + ; RPU-NEXT: 0 0 + ; RPU-NEXT: Live-out: + ; RPU-NEXT: Live-thr: + ; RPU-NEXT: 0 0 + ; + ; RPD-LABEL: name: test_partially_used_early_clobber_def + ; RPD: Live-in: + ; RPD-NEXT: SGPR VGPR + ; RPD-NEXT: 0 0 + ; RPD-NEXT: 4 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; RPD-NEXT: 4 0 + ; RPD-NEXT: 8 0 early-clobber %1:sgpr_128 = COPY %0:sgpr_128 + ; RPD-NEXT: 1 0 + ; RPD-NEXT: 1 0 S_NOP 0, implicit %1.sub1:sgpr_128 + ; RPD-NEXT: 0 0 + ; RPD-NEXT: Live-out: + ; RPD-NEXT: Live-thr: + ; RPD-NEXT: 0 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 early-clobber %1:sgpr_128 = COPY %0:sgpr_128 S_NOP 0, implicit %1.sub1 @@ -697,7 +752,7 @@ body: | ; RPU-NEXT: 0 0 ; RPU-NEXT: 4 0 %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; RPU-NEXT: 4 0 - ; RPU-NEXT: 16 0 %1:sgpr_128 = COPY %0:sgpr_128, implicit-def %2:sgpr_128, implicit-def early-clobber %3:sgpr_128, implicit-def dead early-clobber %4:sgpr_128 + ; RPU-NEXT: 7 0 %1:sgpr_128 = COPY %0:sgpr_128, implicit-def %2:sgpr_128, implicit-def early-clobber %3:sgpr_128, implicit-def dead early-clobber %4:sgpr_128 ; RPU-NEXT: 6 0 ; RPU-NEXT: 6 0 S_NOP 0, implicit %1.sub1:sgpr_128, implicit %2.sub0_sub1:sgpr_128, implicit %3.sub0_sub1_sub2:sgpr_128 ; RPU-NEXT: 0 0 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll index b78f6412cd677..81346996431fc 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll @@ -1,21 +1,14 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s - -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 
--filetype=obj < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" ; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx900 ; ASM: amdhsa.version: ; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 +; ASM: - 1 ; ELF: OS/ABI: AMDGPU_HSA (0x40) -; ELF4: ABIVersion: 2 -; ELF5: ABIVersion: 3 +; ELF: ABIVersion: 2 ; ELF: Flags [ (0x12C) ; ELF-NEXT: EF_AMDGPU_FEATURE_XNACK_ANY_V4 (0x100) ; ELF-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C) @@ -37,4 +30,4 @@ entry: } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll index a3c75e0997533..de518bb99370a 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll @@ -1,21 +1,14 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s - -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --filetype=obj < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx700" ; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx700 ; ASM: amdhsa.version: ; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 +; ASM: - 1 ; ELF: OS/ABI: AMDGPU_HSA (0x40) -; ELF4: ABIVersion: 2 -; ELF5: ABIVersion: 3 +; ELF: ABIVersion: 2 ; ELF: Flags [ (0x22) ; ELF-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX700 (0x22) ; ELF-NEXT: ] @@ -36,4 +29,4 @@ entry: } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll index d4d8af8c1a99b..b43d5941e8ccd 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll @@ -1,21 +1,14 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 
--amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s - -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --filetype=obj < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' ; ASM: amdhsa.version: ; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 +; ASM: - 1 ; ELF: OS/ABI: AMDGPU_HSA (0x40) -; ELF4: ABIVersion: 2 -; ELF5: ABIVersion: 3 +; ELF: ABIVersion: 2 ; ELF: Flags [ (0x22C) ; ELF-NEXT: EF_AMDGPU_FEATURE_XNACK_OFF_V4 (0x200) ; ELF-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C) @@ -38,4 +31,4 @@ entry: attributes #0 = { "target-features"="-xnack" } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll index 9ca8b055a2797..fc2bb4f4a2809 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll @@ -1,21 +1,14 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s - -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --filetype=obj < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+' ; ASM: amdhsa.version: ; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 +; ASM: - 1 ; ELF: OS/ABI: AMDGPU_HSA (0x40) -; ELF4: ABIVersion: 2 -; ELF5: ABIVersion: 3 +; ELF: ABIVersion: 2 ; ELF: 
Flags [ (0x32C) ; ELF-NEXT: EF_AMDGPU_FEATURE_XNACK_ON_V4 (0x300) ; ELF-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C) @@ -39,4 +32,4 @@ entry: attributes #0 = { "target-features"="+xnack" } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll index fd3f5878469e6..27c58e400e132 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll @@ -1,21 +1,14 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s - -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --filetype=obj < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' ; ASM: amdhsa.version: ; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 +; ASM: - 1 ; ELF: OS/ABI: AMDGPU_HSA (0x40) -; ELF4: ABIVersion: 2 -; ELF5: ABIVersion: 3 +; ELF: ABIVersion: 2 ; ELF: Flags [ (0x22C) ; ELF-NEXT: EF_AMDGPU_FEATURE_XNACK_OFF_V4 (0x200) ; ELF-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C) @@ -39,4 +32,4 @@ entry: attributes #0 = { "target-features"="-xnack" } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll index 34673dd5b8919..a97f8b2d9cd64 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll @@ -1,21 +1,14 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | 
llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s - -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --filetype=obj < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' ; ASM: amdhsa.version: ; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 +; ASM: - 1 ; ELF: OS/ABI: AMDGPU_HSA (0x40) -; ELF4: ABIVersion: 2 -; ELF5: ABIVersion: 3 +; ELF: ABIVersion: 2 ; ELF: Flags [ (0x22C) ; ELF-NEXT: EF_AMDGPU_FEATURE_XNACK_OFF_V4 (0x200) ; ELF-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C) @@ -39,4 +32,4 @@ entry: attributes #0 = { "target-features"="-xnack" } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll index c283ece7e8bdf..9ef7d7acd3c14 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll @@ -1,21 +1,14 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s - -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --filetype=obj < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+' ; ASM: amdhsa.version: ; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 +; ASM: - 1 ; ELF: OS/ABI: AMDGPU_HSA (0x40) -; ELF4: ABIVersion: 2 -; ELF5: ABIVersion: 3 +; ELF: ABIVersion: 2 ; ELF: Flags [ (0x32C) ; ELF-NEXT: EF_AMDGPU_FEATURE_XNACK_ON_V4 (0x300) ; ELF-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C) @@ -39,4 +32,4 @@ entry: attributes #0 = { "target-features"="+xnack" } !llvm.module.flags = !{!0} -!0 = !{i32 1, 
!"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll index 869254cae5258..5d32346745f84 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll @@ -1,21 +1,14 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s - -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --filetype=obj < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+' ; ASM: amdhsa.version: ; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 +; ASM: - 1 ; ELF: OS/ABI: AMDGPU_HSA (0x40) -; ELF4: ABIVersion: 2 -; ELF5: ABIVersion: 3 +; ELF: ABIVersion: 2 ; ELF: Flags [ (0x32C) ; ELF-NEXT: EF_AMDGPU_FEATURE_XNACK_ON_V4 (0x300) ; ELF-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C) @@ -39,4 +32,4 @@ entry: attributes #0 = { "target-features"="+xnack" } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-any.ll index b1bcb34c8aee4..d6778ec1f1fab 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-any.ll @@ -1,21 +1,14 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s - -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck 
--check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --filetype=obj < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" ; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx900 ; ASM: amdhsa.version: ; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 +; ASM: - 1 ; ELF: OS/ABI: AMDGPU_HSA (0x40) -; ELF4: ABIVersion: 2 -; ELF5: ABIVersion: 3 +; ELF: ABIVersion: 2 ; ELF: Flags [ (0x12C) ; ELF-NEXT: EF_AMDGPU_FEATURE_XNACK_ANY_V4 (0x100) ; ELF-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C) @@ -27,4 +20,4 @@ entry: } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll index cc04eb0e661d6..efa09a61e6305 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll @@ -1,21 +1,14 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s - -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --filetype=obj < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx700" ; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx700 ; ASM: amdhsa.version: ; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 +; ASM: - 1 ; ELF: OS/ABI: AMDGPU_HSA (0x40) -; ELF4: ABIVersion: 2 -; ELF5: ABIVersion: 3 +; ELF: ABIVersion: 2 ; ELF: Flags [ (0x22) ; ELF-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX700 (0x22) ; ELF-NEXT: ] @@ -26,4 +19,4 @@ entry: } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll index e84778b526f11..630453656570c 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll @@ -1,21 +1,14 @@ -; RUN: sed 
's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s - -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --filetype=obj < %s | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' ; ASM: amdhsa.version: ; ASM: - 1 -; ASM4: - 1 -; ASM5: - 2 +; ASM: - 1 ; ELF: OS/ABI: AMDGPU_HSA (0x40) -; ELF4: ABIVersion: 2 -; ELF5: ABIVersion: 3 +; ELF: ABIVersion: 2 ; ELF: Flags [ (0x22C) ; ELF-NEXT: EF_AMDGPU_FEATURE_XNACK_OFF_V4 (0x200) ; ELF-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C) @@ -29,4 +22,4 @@ entry: attributes #0 = { "target-features"="-xnack" } !llvm.module.flags = !{!0} -!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll index a1ab6ed5f082a..f99ceccec96a9 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll @@ -1,21 +1,14 @@ -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck --check-prefixes=ASM,ASM5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s - -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefixes=ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --filetype=obj < %s | llvm-readobj --file-headers - | 
FileCheck --check-prefixes=ELF %s
; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+"
; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+'
; ASM: amdhsa.version:
; ASM: - 1
-; ASM4: - 1
-; ASM5: - 2
+; ASM: - 1
; ELF: OS/ABI: AMDGPU_HSA (0x40)
-; ELF4: ABIVersion: 2
-; ELF5: ABIVersion: 3
+; ELF: ABIVersion: 2
; ELF: Flags [ (0x32C)
; ELF-NEXT: EF_AMDGPU_FEATURE_XNACK_ON_V4 (0x300)
; ELF-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX900 (0x2C)
@@ -29,4 +22,4 @@ entry:
attributes #0 = { "target-features"="+xnack" }
!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION}
+!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
diff --git a/llvm/test/CodeGen/ARM/vector-store.ll b/llvm/test/CodeGen/ARM/vector-store.ll
index 9c8ea7a2c440c..a8a1031637afc 100644
--- a/llvm/test/CodeGen/ARM/vector-store.ll
+++ b/llvm/test/CodeGen/ARM/vector-store.ll
@@ -403,14 +403,17 @@ define void @v3i8store(ptr %p) {
; CHECK-LABEL: v3i8store:
; CHECK: @ %bb.0:
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: mov r2, sp
-; CHECK-NEXT: str r1, [sp]
-; CHECK-NEXT: vld1.32 {d16[0]}, [r2:32]
-; CHECK-NEXT: strb r1, [r0, #2]
+; CHECK-NEXT: vmov.i32 d16, #0xff
+; CHECK-NEXT: mov r1, sp
+; CHECK-NEXT: vmov.i32 d17, #0x0
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: vand d16, d17, d16
+; CHECK-NEXT: vst1.32 {d16[0]}, [r1:32]
+; CHECK-NEXT: vld1.32 {d16[0]}, [r1:32]
; CHECK-NEXT: vmovl.u16 q8, d16
-; CHECK-NEXT: vmov.32 r2, d16[0]
-; CHECK-NEXT: strh r2, [r0]
+; CHECK-NEXT: strb r2, [r0, #2]
+; CHECK-NEXT: vmov.32 r1, d16[0]
+; CHECK-NEXT: strh r1, [r0]
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: bx lr
store <3 x i8> zeroinitializer, ptr %p, align 4
diff --git a/llvm/test/CodeGen/LoongArch/lasx/fsqrt.ll b/llvm/test/CodeGen/LoongArch/lasx/fsqrt.ll
deleted file mode 100644
index c4a881bdeae9f..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lasx/fsqrt.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s
-
-;; fsqrt
-define void @sqrt_v8f32(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: sqrt_v8f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvfsqrt.s $xr0, $xr0
-; CHECK-NEXT: xvst $xr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <8 x float>, ptr %a0, align 16
- %sqrt = call <8 x float> @llvm.sqrt.v8f32 (<8 x float> %v0)
- store <8 x float> %sqrt, ptr %res, align 16
- ret void
-}
-
-define void @sqrt_v4f64(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: sqrt_v4f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvfsqrt.d $xr0, $xr0
-; CHECK-NEXT: xvst $xr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <4 x double>, ptr %a0, align 16
- %sqrt = call <4 x double> @llvm.sqrt.v4f64 (<4 x double> %v0)
- store <4 x double> %sqrt, ptr %res, align 16
- ret void
-}
-
-;; 1.0 / (fsqrt vec)
-define void @one_div_sqrt_v8f32(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: one_div_sqrt_v8f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvfrsqrt.s $xr0, $xr0
-; CHECK-NEXT: xvst $xr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <8 x float>, ptr %a0, align 16
- %sqrt = call <8 x float> @llvm.sqrt.v8f32 (<8 x float> %v0)
- %div = fdiv <8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %sqrt
- store <8 x float> %div, ptr %res, align 16
- ret void
-}
-
-define void @one_div_sqrt_v4f64(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: one_div_sqrt_v4f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvfrsqrt.d $xr0, $xr0
-; CHECK-NEXT: xvst $xr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <4 x double>, ptr %a0, align 16
- %sqrt = call <4 x double> @llvm.sqrt.v4f64 (<4 x double> %v0)
- %div = fdiv <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %sqrt
- store <4 x double> %div, ptr %res, align 16
- ret void
-}
-
-declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
-declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll
index 6004565b0b784..284121a79a492 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll
@@ -32,32 +32,3 @@ entry:
store <4 x double> %v2, ptr %res
ret void
}
-
-;; 1.0 / vec
-define void @one_fdiv_v8f32(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: one_fdiv_v8f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvfrecip.s $xr0, $xr0
-; CHECK-NEXT: xvst $xr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <8 x float>, ptr %a0
- %div = fdiv <8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %v0
- store <8 x float> %div, ptr %res
- ret void
-}
-
-define void @one_fdiv_v4f64(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: one_fdiv_v4f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvfrecip.d $xr0, $xr0
-; CHECK-NEXT: xvst $xr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <4 x double>, ptr %a0
- %div = fdiv <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %v0
- store <4 x double> %div, ptr %res
- ret void
-}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fneg.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fneg.ll
deleted file mode 100644
index 5eb468fc55a0e..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fneg.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s
-
-define void @fneg_v8f32(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: fneg_v8f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvbitrevi.w $xr0, $xr0, 31
-; CHECK-NEXT: xvst $xr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <8 x float>, ptr %a0
- %v1 = fneg <8 x float> %v0
- store <8 x float> %v1, ptr %res
- ret void
-}
-define void @fneg_v4f64(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: fneg_v4f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvld $xr0, $a1, 0
-; CHECK-NEXT: xvbitrevi.d $xr0, $xr0, 63
-; CHECK-NEXT: xvst $xr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <4 x double>, ptr %a0
- %v1 = fneg <4 x double> %v0
- store <4 x double> %v1, ptr %res
- ret void
-}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/fsqrt.ll b/llvm/test/CodeGen/LoongArch/lsx/fsqrt.ll
deleted file mode 100644
index a57bc1ca0e948..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lsx/fsqrt.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s
-
-;; fsqrt
-define void @sqrt_v4f32(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: sqrt_v4f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vfsqrt.s $vr0, $vr0
-; CHECK-NEXT: vst $vr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <4 x float>, ptr %a0, align 16
- %sqrt = call <4 x float> @llvm.sqrt.v4f32 (<4 x float> %v0)
- store <4 x float> %sqrt, ptr %res, align 16
- ret void
-}
-
-define void @sqrt_v2f64(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: sqrt_v2f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vfsqrt.d $vr0, $vr0
-; CHECK-NEXT: vst $vr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <2 x double>, ptr %a0, align 16
- %sqrt = call <2 x double> @llvm.sqrt.v2f64 (<2 x double> %v0)
- store <2 x double> %sqrt, ptr %res, align 16
- ret void
-}
-
-;; 1.0 / (fsqrt vec)
-define void @one_div_sqrt_v4f32(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: one_div_sqrt_v4f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vfrsqrt.s $vr0, $vr0
-; CHECK-NEXT: vst $vr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <4 x float>, ptr %a0, align 16
- %sqrt = call <4 x float> @llvm.sqrt.v4f32 (<4 x float> %v0)
- %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %sqrt
- store <4 x float> %div, ptr %res, align 16
- ret void
-}
-
-define void @one_div_sqrt_v2f64(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: one_div_sqrt_v2f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vfrsqrt.d $vr0, $vr0
-; CHECK-NEXT: vst $vr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <2 x double>, ptr %a0, align 16
- %sqrt = call <2 x double> @llvm.sqrt.v2f64 (<2 x double> %v0)
- %div = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %sqrt
- store <2 x double> %div, ptr %res, align 16
- ret void
-}
-
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
-declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll
index 5f1ee9e4d212e..eb7c8bd9616ec 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll
@@ -32,32 +32,3 @@ entry:
store <2 x double> %v2, ptr %res
ret void
}
-
-;; 1.0 / vec
-define void @one_fdiv_v4f32(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: one_fdiv_v4f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vfrecip.s $vr0, $vr0
-; CHECK-NEXT: vst $vr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <4 x float>, ptr %a0
- %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %v0
- store <4 x float> %div, ptr %res
- ret void
-}
-
-define void @one_fdiv_v2f64(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: one_fdiv_v2f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vfrecip.d $vr0, $vr0
-; CHECK-NEXT: vst $vr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <2 x double>, ptr %a0
- %div = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %v0
- store <2 x double> %div, ptr %res
- ret void
-}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fneg.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fneg.ll
deleted file mode 100644
index 795c1ac8b3684..0000000000000
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fneg.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s
-
-define void @fneg_v4f32(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: fneg_v4f32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vbitrevi.w $vr0, $vr0, 31
-; CHECK-NEXT: vst $vr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <4 x float>, ptr %a0
- %v1 = fneg <4 x float> %v0
- store <4 x float> %v1, ptr %res
- ret void
-}
-define void @fneg_v2f64(ptr %res, ptr %a0) nounwind {
-; CHECK-LABEL: fneg_v2f64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vld $vr0, $a1, 0
-; CHECK-NEXT: vbitrevi.d $vr0, $vr0, 63
-; CHECK-NEXT: vst $vr0, $a0, 0
-; CHECK-NEXT: ret
-entry:
- %v0 = load <2 x double>, ptr %a0
- %v1 = fneg <2 x double> %v0
- store <2 x double> %v1, ptr %res
- ret void
-}
diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
index c08f16c2644de..7fba1aadcd986 100644
--- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll
@@ -392,16 +392,6 @@ define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 {
ret <2 x bfloat> %r
}
-; CHECK-LABEL: test_fabs_add(
-; CHECK: abs.bf16x2
-; CHECK: ret;
-define <2 x bfloat> @test_fabs_add(<2 x bfloat> %a, <2 x bfloat> %b) #0 {
- %s = fadd <2 x bfloat> %a, %a
- %r = call <2 x bfloat> @llvm.fabs.f16(<2 x bfloat> %s)
- %d = fadd <2 x bfloat> %r, %b
- ret <2 x bfloat> %d
-}
-
; CHECK-LABEL: test_minnum(
; CHECK-DAG: ld.param.b32 [[AF0:%r[0-9]+]], [test_minnum_param_0];
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index 769bfe8cd5ba9..a56a81f5f793b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -244,14 +244,14 @@ define <8 x i64> @vrgather_shuffle_vx_v8i64(<8 x i64> %x) {
; RV32-NEXT: addi a0, a0, %lo(.LCPI13_0)
; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu
; RV32-NEXT: vle16.v v16, (a0)
-; RV32-NEXT: vmv.v.i v20, 5
+; RV32-NEXT: vrgatherei16.vv v12, v8, v16
; RV32-NEXT: lui a0, %hi(.LCPI13_1)
; RV32-NEXT: addi a0, a0, %lo(.LCPI13_1)
-; RV32-NEXT: vle16.v v17, (a0)
-; RV32-NEXT: li a0, 115
+; RV32-NEXT: vle16.v v8, (a0)
+; RV32-NEXT: li a0, 140
; RV32-NEXT: vmv.s.x v0, a0
-; RV32-NEXT: vrgatherei16.vv v12, v20, v16
-; RV32-NEXT: vrgatherei16.vv v12, v8, v17, v0.t
+; RV32-NEXT: vmv.v.i v16, 5
+; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t
; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-add-04.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-add-04.ll
index 4af3793191b67..50d3eec15dbe8 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-add-04.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-add-04.ll
@@ -16,12 +16,13 @@ define i64 @f1(i64 %dummy, ptr %src, i64 %b) {
ret i64 %res
}

-; Check addition of 1.
+; Check addition of 1, which can use AGHI.
define i64 @f2(i64 %dummy, ptr %src) {
; CHECK-LABEL: f2:
; CHECK: lg %r2, 0(%r3)
; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: la %r0, 1(%r2)
+; CHECK: lgr %r0, %r2
+; CHECK: aghi %r0, 1
; CHECK: csg %r2, %r0, 0(%r3)
; CHECK: jl [[LABEL]]
; CHECK: br %r14
@@ -29,64 +30,82 @@ define i64 @f2(i64 %dummy, ptr %src) {
ret i64 %res
}

-; Check use of LAY.
+; Check the high end of the AGHI range.
define i64 @f3(i64 %dummy, ptr %src) {
; CHECK-LABEL: f3:
-; CHECK: lay %r0, 32767(%r2)
+; CHECK: aghi %r0, 32767
; CHECK: br %r14
%res = atomicrmw add ptr %src, i64 32767 seq_cst
ret i64 %res
}

-; Check the high end of the AGFI range.
+; Check the next value up, which must use AGFI.
define i64 @f4(i64 %dummy, ptr %src) {
; CHECK-LABEL: f4:
-; CHECK: agfi %r0, 2147483647
+; CHECK: agfi %r0, 32768
; CHECK: br %r14
- %res = atomicrmw add ptr %src, i64 2147483647 seq_cst
+ %res = atomicrmw add ptr %src, i64 32768 seq_cst
ret i64 %res
}

-; Check the next value up, which uses an ALGFI.
+; Check the high end of the AGFI range.
define i64 @f5(i64 %dummy, ptr %src) {
; CHECK-LABEL: f5:
-; CHECK: algfi %r0, 2147483648
+; CHECK: agfi %r0, 2147483647
; CHECK: br %r14
- %res = atomicrmw add ptr %src, i64 2147483648 seq_cst
+ %res = atomicrmw add ptr %src, i64 2147483647 seq_cst
ret i64 %res
}

-; Check addition of -1, which can use LAY.
+; Check the next value up, which must use a register addition. define i64 @f6(i64 %dummy, ptr %src) { ; CHECK-LABEL: f6: -; CHECK: lay %r0, -1(%r2) +; CHECK: agr ; CHECK: br %r14 - %res = atomicrmw add ptr %src, i64 -1 seq_cst + %res = atomicrmw add ptr %src, i64 2147483648 seq_cst ret i64 %res } -; LAY still OK. +; Check addition of -1, which can use AGHI. define i64 @f7(i64 %dummy, ptr %src) { ; CHECK-LABEL: f7: -; CHECK: lay %r0, -32769(%r2) +; CHECK: aghi %r0, -1 ; CHECK: br %r14 - %res = atomicrmw add ptr %src, i64 -32769 seq_cst + %res = atomicrmw add ptr %src, i64 -1 seq_cst ret i64 %res } -; Check the low end of the AGFI range. +; Check the low end of the AGHI range. define i64 @f8(i64 %dummy, ptr %src) { ; CHECK-LABEL: f8: -; CHECK: agfi %r0, -2147483648 +; CHECK: aghi %r0, -32768 ; CHECK: br %r14 - %res = atomicrmw add ptr %src, i64 -2147483648 seq_cst + %res = atomicrmw add ptr %src, i64 -32768 seq_cst ret i64 %res } -; Check the next value down, which uses an SLGFI. +; Check the next value down, which must use AGFI instead. define i64 @f9(i64 %dummy, ptr %src) { ; CHECK-LABEL: f9: -; CHECK: slgfi %r0, 2147483649 +; CHECK: agfi %r0, -32769 +; CHECK: br %r14 + %res = atomicrmw add ptr %src, i64 -32769 seq_cst + ret i64 %res +} + +; Check the low end of the AGFI range. +define i64 @f10(i64 %dummy, ptr %src) { +; CHECK-LABEL: f10: +; CHECK: agfi %r0, -2147483648 +; CHECK: br %r14 + %res = atomicrmw add ptr %src, i64 -2147483648 seq_cst + ret i64 %res +} + +; Check the next value down, which must use a register addition. +define i64 @f11(i64 %dummy, ptr %src) { +; CHECK-LABEL: f11: +; CHECK: agr ; CHECK: br %r14 %res = atomicrmw add ptr %src, i64 -2147483649 seq_cst ret i64 %res diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-and-03.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-and-03.ll index 96c82e6b1eaec..03ed2404dcc17 100644 --- a/llvm/test/CodeGen/SystemZ/atomicrmw-and-03.ll +++ b/llvm/test/CodeGen/SystemZ/atomicrmw-and-03.ll @@ -33,7 +33,7 @@ define i32 @f2(i32 %dummy, ptr %src) { ; Check ANDs of the low end of the NILH range. define i32 @f3(i32 %dummy, ptr %src) { ; CHECK-LABEL: f3: -; CHECK: llhr %r0, %r2 +; CHECK: nilh %r0, 0 ; CHECK: br %r14 %res = atomicrmw and ptr %src, i32 65535 seq_cst ret i32 %res diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-and-04.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-and-04.ll index 9647548c842bf..00b6fd93ad5bb 100644 --- a/llvm/test/CodeGen/SystemZ/atomicrmw-and-04.ll +++ b/llvm/test/CodeGen/SystemZ/atomicrmw-and-04.ll @@ -16,10 +16,11 @@ define i64 @f1(i64 %dummy, ptr %src, i64 %b) { ret i64 %res } -; Check ANDs of 1, which are done using a register. +; Check ANDs of 1, which are done using a register. (We could use RISBG +; instead, but that isn't implemented yet.) define i64 @f2(i64 %dummy, ptr %src) { ; CHECK-LABEL: f2: -; CHECK: risbg +; CHECK: ngr ; CHECK: br %r14 %res = atomicrmw and ptr %src, i64 1 seq_cst ret i64 %res @@ -55,7 +56,7 @@ define i64 @f4(i64 %dummy, ptr %src) { ; Check the next value up, which must use a register. define i64 @f5(i64 %dummy, ptr %src) { ; CHECK-LABEL: f5: -; CHECK: risbg +; CHECK: ngr ; CHECK: br %r14 %res = atomicrmw and ptr %src, i64 12884901888 seq_cst ret i64 %res @@ -73,7 +74,7 @@ define i64 @f6(i64 %dummy, ptr %src) { ; Check the next value up, which must use a register. 
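; (Why this one needs a register: 281474976710656 is 1 << 48, so the mask
; clears bits in both 32-bit halves of the operand, which no single
; NILL/NILH/NIHL/NIHH or NILF/NIHF immediate can do; the expansion below
; therefore falls back to NGR with a materialized constant, mirroring the
; AGHI/AGFI/register split for addition above. For contrast, a minimal
; sketch of a mask that clears bits in only one half; the function name is
; ours rather than part of this file, and a single NILF would be the
; expected lowering:)
define i64 @sample_nilf(i64 %dummy, ptr %src) {
  ; -4294967296 is 0xFFFFFFFF00000000: only the low 32 bits are cleared
  %res = atomicrmw and ptr %src, i64 -4294967296 seq_cst
  ret i64 %res
}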
define i64 @f7(i64 %dummy, ptr %src) { ; CHECK-LABEL: f7: -; CHECK: risbg +; CHECK: ngr ; CHECK: br %r14 %res = atomicrmw and ptr %src, i64 281474976710656 seq_cst ret i64 %res diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll index d107e5d1dc2e5..d633c2d74e3a6 100644 --- a/llvm/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll +++ b/llvm/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll @@ -1,31 +1,21 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; Test 32-bit atomic minimum and maximum. Here we match the z10 versions, ; which can't use LOCR. ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s -; Todo: If-converter no longer producing CondReturns (with AtomicExpand pass). - ; Check signed minimum. define i32 @f1(i32 %dummy, ptr %src, i32 %b) { ; CHECK-LABEL: f1: -; CHECK: # %bb.0: -; CHECK-NEXT: l %r2, 0(%r3) -; CHECK-NEXT: j .LBB0_2 -; CHECK-NEXT: .LBB0_1: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: cs %r2, %r0, 0(%r3) -; CHECK-NEXT: je .LBB0_4 -; CHECK-NEXT: .LBB0_2: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lr %r0, %r2 -; CHECK-NEXT: crjle %r2, %r4, .LBB0_1 -; CHECK-NEXT: # %bb.3: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: lr %r0, %r4 -; CHECK-NEXT: j .LBB0_1 -; CHECK-NEXT: .LBB0_4: # %atomicrmw.end -; CHECK-NEXT: br %r14 +; CHECK: l %r2, 0(%r3) +; CHECK: j [[LOOP:\.[^:]*]] +; CHECK: [[BB1:\.[^:]*]]: +; CHECK: cs %r2, [[NEW:%r[0-9]+]], 0(%r3) +; CHECK: ber %r14 +; CHECK: [[LOOP]]: +; CHECK: lr [[NEW]], %r2 +; CHECK: crjle %r2, %r4, [[KEEP:\..*]] +; CHECK: lr [[NEW]], %r4 +; CHECK: j [[BB1]] %res = atomicrmw min ptr %src, i32 %b seq_cst ret i32 %res } @@ -33,23 +23,16 @@ define i32 @f1(i32 %dummy, ptr %src, i32 %b) { ; Check signed maximum. define i32 @f2(i32 %dummy, ptr %src, i32 %b) { ; CHECK-LABEL: f2: -; CHECK: # %bb.0: -; CHECK-NEXT: l %r2, 0(%r3) -; CHECK-NEXT: j .LBB1_2 -; CHECK-NEXT: .LBB1_1: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: cs %r2, %r0, 0(%r3) -; CHECK-NEXT: je .LBB1_4 -; CHECK-NEXT: .LBB1_2: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lr %r0, %r2 -; CHECK-NEXT: crjh %r2, %r4, .LBB1_1 -; CHECK-NEXT: # %bb.3: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: lr %r0, %r4 -; CHECK-NEXT: j .LBB1_1 -; CHECK-NEXT: .LBB1_4: # %atomicrmw.end -; CHECK-NEXT: br %r14 +; CHECK: l %r2, 0(%r3) +; CHECK: j [[LOOP:\.[^:]*]] +; CHECK: [[BB1:\.[^:]*]]: +; CHECK: cs %r2, [[NEW:%r[0-9]+]], 0(%r3) +; CHECK: ber %r14 +; CHECK: [[LOOP]]: +; CHECK: lr [[NEW]], %r2 +; CHECK: crjhe %r2, %r4, [[KEEP:\..*]] +; CHECK: lr [[NEW]], %r4 +; CHECK: j [[BB1]] %res = atomicrmw max ptr %src, i32 %b seq_cst ret i32 %res } @@ -57,23 +40,16 @@ define i32 @f2(i32 %dummy, ptr %src, i32 %b) { ; Check unsigned minimum. 
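; (Signed and unsigned, f1-f4 in this file all expand to the same
; compare-and-swap loop; only the compare mnemonic changes, crjle/crjhe for
; the signed cases and clrjle/clrjhe for the unsigned ones. Written out as
; a hand-rolled cmpxchg loop in IR, with our own names and an ordinary seed
; load, the signed minimum is roughly:)
define i32 @sample_min_loop(ptr %src, i32 %b) {
entry:
  %init = load i32, ptr %src, align 4
  br label %loop

loop:
  %cur = phi i32 [ %init, %entry ], [ %old, %loop ]
  %keep = icmp sle i32 %cur, %b             ; crjle keeps %cur when it wins
  %new = select i1 %keep, i32 %cur, i32 %b
  %pair = cmpxchg ptr %src, i32 %cur, i32 %new seq_cst seq_cst
  %old = extractvalue { i32, i1 } %pair, 0
  %ok = extractvalue { i32, i1 } %pair, 1   ; CS sets the CC that ber tests
  br i1 %ok, label %done, label %loop

done:
  ret i32 %old
}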
define i32 @f3(i32 %dummy, ptr %src, i32 %b) { ; CHECK-LABEL: f3: -; CHECK: # %bb.0: -; CHECK-NEXT: l %r2, 0(%r3) -; CHECK-NEXT: j .LBB2_2 -; CHECK-NEXT: .LBB2_1: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1 -; CHECK-NEXT: cs %r2, %r0, 0(%r3) -; CHECK-NEXT: je .LBB2_4 -; CHECK-NEXT: .LBB2_2: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lr %r0, %r2 -; CHECK-NEXT: clrjle %r2, %r4, .LBB2_1 -; CHECK-NEXT: # %bb.3: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1 -; CHECK-NEXT: lr %r0, %r4 -; CHECK-NEXT: j .LBB2_1 -; CHECK-NEXT: .LBB2_4: # %atomicrmw.end -; CHECK-NEXT: br %r14 +; CHECK: l %r2, 0(%r3) +; CHECK: j [[LOOP:\.[^:]*]] +; CHECK: [[BB1:\.[^:]*]]: +; CHECK: cs %r2, [[NEW:%r[0-9]+]], 0(%r3) +; CHECK: ber %r14 +; CHECK: [[LOOP]]: +; CHECK: lr [[NEW]], %r2 +; CHECK: clrjle %r2, %r4, [[KEEP:\..*]] +; CHECK: lr [[NEW]], %r4 +; CHECK: j [[BB1]] %res = atomicrmw umin ptr %src, i32 %b seq_cst ret i32 %res } @@ -81,23 +57,16 @@ define i32 @f3(i32 %dummy, ptr %src, i32 %b) { ; Check unsigned maximum. define i32 @f4(i32 %dummy, ptr %src, i32 %b) { ; CHECK-LABEL: f4: -; CHECK: # %bb.0: -; CHECK-NEXT: l %r2, 0(%r3) -; CHECK-NEXT: j .LBB3_2 -; CHECK-NEXT: .LBB3_1: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB3_2 Depth=1 -; CHECK-NEXT: cs %r2, %r0, 0(%r3) -; CHECK-NEXT: je .LBB3_4 -; CHECK-NEXT: .LBB3_2: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lr %r0, %r2 -; CHECK-NEXT: clrjh %r2, %r4, .LBB3_1 -; CHECK-NEXT: # %bb.3: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB3_2 Depth=1 -; CHECK-NEXT: lr %r0, %r4 -; CHECK-NEXT: j .LBB3_1 -; CHECK-NEXT: .LBB3_4: # %atomicrmw.end -; CHECK-NEXT: br %r14 +; CHECK: l %r2, 0(%r3) +; CHECK: j [[LOOP:\.[^:]*]] +; CHECK: [[BB1:\.[^:]*]]: +; CHECK: cs %r2, [[NEW:%r[0-9]+]], 0(%r3) +; CHECK: ber %r14 +; CHECK: [[LOOP]]: +; CHECK: lr [[NEW]], %r2 +; CHECK: clrjhe %r2, %r4, [[KEEP:\..*]] +; CHECK: lr [[NEW]], %r4 +; CHECK: j [[BB1]] %res = atomicrmw umax ptr %src, i32 %b seq_cst ret i32 %res } @@ -105,23 +74,9 @@ define i32 @f4(i32 %dummy, ptr %src, i32 %b) { ; Check the high end of the aligned CS range. define i32 @f5(i32 %dummy, ptr %src, i32 %b) { ; CHECK-LABEL: f5: -; CHECK: # %bb.0: -; CHECK-NEXT: l %r2, 4092(%r3) -; CHECK-NEXT: j .LBB4_2 -; CHECK-NEXT: .LBB4_1: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB4_2 Depth=1 -; CHECK-NEXT: cs %r2, %r0, 4092(%r3) -; CHECK-NEXT: je .LBB4_4 -; CHECK-NEXT: .LBB4_2: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lr %r0, %r2 -; CHECK-NEXT: crjle %r2, %r4, .LBB4_1 -; CHECK-NEXT: # %bb.3: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB4_2 Depth=1 -; CHECK-NEXT: lr %r0, %r4 -; CHECK-NEXT: j .LBB4_1 -; CHECK-NEXT: .LBB4_4: # %atomicrmw.end -; CHECK-NEXT: br %r14 +; CHECK: l %r2, 4092(%r3) +; CHECK: cs %r2, {{%r[0-9]+}}, 4092(%r3) +; CHECK: ber %r14 %ptr = getelementptr i32, ptr %src, i64 1023 %res = atomicrmw min ptr %ptr, i32 %b seq_cst ret i32 %res @@ -130,23 +85,9 @@ define i32 @f5(i32 %dummy, ptr %src, i32 %b) { ; Check the next word up, which requires CSY. 
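; (Displacement arithmetic for the cases above and below: CS encodes a
; 12-bit unsigned displacement, 0..4095, so word element 1023 (byte offset
; 1023 * 4 = 4092) is the last one CS itself can address, while element
; 1024 (1024 * 4 = 4096) spills over to CSY, whose 20-bit signed
; displacement covers -524288..524287.)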
define i32 @f6(i32 %dummy, ptr %src, i32 %b) { ; CHECK-LABEL: f6: -; CHECK: # %bb.0: -; CHECK-NEXT: ly %r2, 4096(%r3) -; CHECK-NEXT: j .LBB5_2 -; CHECK-NEXT: .LBB5_1: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB5_2 Depth=1 -; CHECK-NEXT: csy %r2, %r0, 4096(%r3) -; CHECK-NEXT: je .LBB5_4 -; CHECK-NEXT: .LBB5_2: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lr %r0, %r2 -; CHECK-NEXT: crjle %r2, %r4, .LBB5_1 -; CHECK-NEXT: # %bb.3: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB5_2 Depth=1 -; CHECK-NEXT: lr %r0, %r4 -; CHECK-NEXT: j .LBB5_1 -; CHECK-NEXT: .LBB5_4: # %atomicrmw.end -; CHECK-NEXT: br %r14 +; CHECK: ly %r2, 4096(%r3) +; CHECK: csy %r2, {{%r[0-9]+}}, 4096(%r3) +; CHECK: ber %r14 %ptr = getelementptr i32, ptr %src, i64 1024 %res = atomicrmw min ptr %ptr, i32 %b seq_cst ret i32 %res @@ -155,23 +96,9 @@ define i32 @f6(i32 %dummy, ptr %src, i32 %b) { ; Check the high end of the aligned CSY range. define i32 @f7(i32 %dummy, ptr %src, i32 %b) { ; CHECK-LABEL: f7: -; CHECK: # %bb.0: -; CHECK-NEXT: ly %r2, 524284(%r3) -; CHECK-NEXT: j .LBB6_2 -; CHECK-NEXT: .LBB6_1: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB6_2 Depth=1 -; CHECK-NEXT: csy %r2, %r0, 524284(%r3) -; CHECK-NEXT: je .LBB6_4 -; CHECK-NEXT: .LBB6_2: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lr %r0, %r2 -; CHECK-NEXT: crjle %r2, %r4, .LBB6_1 -; CHECK-NEXT: # %bb.3: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB6_2 Depth=1 -; CHECK-NEXT: lr %r0, %r4 -; CHECK-NEXT: j .LBB6_1 -; CHECK-NEXT: .LBB6_4: # %atomicrmw.end -; CHECK-NEXT: br %r14 +; CHECK: ly %r2, 524284(%r3) +; CHECK: csy %r2, {{%r[0-9]+}}, 524284(%r3) +; CHECK: ber %r14 %ptr = getelementptr i32, ptr %src, i64 131071 %res = atomicrmw min ptr %ptr, i32 %b seq_cst ret i32 %res @@ -180,24 +107,10 @@ define i32 @f7(i32 %dummy, ptr %src, i32 %b) { ; Check the next word up, which needs separate address logic. define i32 @f8(i32 %dummy, ptr %src, i32 %b) { ; CHECK-LABEL: f8: -; CHECK: # %bb.0: -; CHECK-NEXT: agfi %r3, 524288 -; CHECK-NEXT: l %r2, 0(%r3) -; CHECK-NEXT: j .LBB7_2 -; CHECK-NEXT: .LBB7_1: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB7_2 Depth=1 -; CHECK-NEXT: cs %r2, %r0, 0(%r3) -; CHECK-NEXT: je .LBB7_4 -; CHECK-NEXT: .LBB7_2: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lr %r0, %r2 -; CHECK-NEXT: crjle %r2, %r4, .LBB7_1 -; CHECK-NEXT: # %bb.3: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB7_2 Depth=1 -; CHECK-NEXT: lr %r0, %r4 -; CHECK-NEXT: j .LBB7_1 -; CHECK-NEXT: .LBB7_4: # %atomicrmw.end -; CHECK-NEXT: br %r14 +; CHECK: agfi %r3, 524288 +; CHECK: l %r2, 0(%r3) +; CHECK: cs %r2, {{%r[0-9]+}}, 0(%r3) +; CHECK: ber %r14 %ptr = getelementptr i32, ptr %src, i64 131072 %res = atomicrmw min ptr %ptr, i32 %b seq_cst ret i32 %res @@ -206,23 +119,9 @@ define i32 @f8(i32 %dummy, ptr %src, i32 %b) { ; Check the high end of the negative aligned CSY range. 
define i32 @f9(i32 %dummy, ptr %src, i32 %b) { ; CHECK-LABEL: f9: -; CHECK: # %bb.0: -; CHECK-NEXT: ly %r2, -4(%r3) -; CHECK-NEXT: j .LBB8_2 -; CHECK-NEXT: .LBB8_1: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB8_2 Depth=1 -; CHECK-NEXT: csy %r2, %r0, -4(%r3) -; CHECK-NEXT: je .LBB8_4 -; CHECK-NEXT: .LBB8_2: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lr %r0, %r2 -; CHECK-NEXT: crjle %r2, %r4, .LBB8_1 -; CHECK-NEXT: # %bb.3: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB8_2 Depth=1 -; CHECK-NEXT: lr %r0, %r4 -; CHECK-NEXT: j .LBB8_1 -; CHECK-NEXT: .LBB8_4: # %atomicrmw.end -; CHECK-NEXT: br %r14 +; CHECK: ly %r2, -4(%r3) +; CHECK: csy %r2, {{%r[0-9]+}}, -4(%r3) +; CHECK: ber %r14 %ptr = getelementptr i32, ptr %src, i64 -1 %res = atomicrmw min ptr %ptr, i32 %b seq_cst ret i32 %res @@ -231,23 +130,9 @@ define i32 @f9(i32 %dummy, ptr %src, i32 %b) { ; Check the low end of the CSY range. define i32 @f10(i32 %dummy, ptr %src, i32 %b) { ; CHECK-LABEL: f10: -; CHECK: # %bb.0: -; CHECK-NEXT: ly %r2, -524288(%r3) -; CHECK-NEXT: j .LBB9_2 -; CHECK-NEXT: .LBB9_1: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB9_2 Depth=1 -; CHECK-NEXT: csy %r2, %r0, -524288(%r3) -; CHECK-NEXT: je .LBB9_4 -; CHECK-NEXT: .LBB9_2: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lr %r0, %r2 -; CHECK-NEXT: crjle %r2, %r4, .LBB9_1 -; CHECK-NEXT: # %bb.3: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB9_2 Depth=1 -; CHECK-NEXT: lr %r0, %r4 -; CHECK-NEXT: j .LBB9_1 -; CHECK-NEXT: .LBB9_4: # %atomicrmw.end -; CHECK-NEXT: br %r14 +; CHECK: ly %r2, -524288(%r3) +; CHECK: csy %r2, {{%r[0-9]+}}, -524288(%r3) +; CHECK: ber %r14 %ptr = getelementptr i32, ptr %src, i64 -131072 %res = atomicrmw min ptr %ptr, i32 %b seq_cst ret i32 %res @@ -256,24 +141,10 @@ define i32 @f10(i32 %dummy, ptr %src, i32 %b) { ; Check the next word down, which needs separate address logic. define i32 @f11(i32 %dummy, ptr %src, i32 %b) { ; CHECK-LABEL: f11: -; CHECK: # %bb.0: -; CHECK-NEXT: agfi %r3, -524292 -; CHECK-NEXT: l %r2, 0(%r3) -; CHECK-NEXT: j .LBB10_2 -; CHECK-NEXT: .LBB10_1: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB10_2 Depth=1 -; CHECK-NEXT: cs %r2, %r0, 0(%r3) -; CHECK-NEXT: je .LBB10_4 -; CHECK-NEXT: .LBB10_2: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lr %r0, %r2 -; CHECK-NEXT: crjle %r2, %r4, .LBB10_1 -; CHECK-NEXT: # %bb.3: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB10_2 Depth=1 -; CHECK-NEXT: lr %r0, %r4 -; CHECK-NEXT: j .LBB10_1 -; CHECK-NEXT: .LBB10_4: # %atomicrmw.end -; CHECK-NEXT: br %r14 +; CHECK: agfi %r3, -524292 +; CHECK: l %r2, 0(%r3) +; CHECK: cs %r2, {{%r[0-9]+}}, 0(%r3) +; CHECK: ber %r14 %ptr = getelementptr i32, ptr %src, i64 -131073 %res = atomicrmw min ptr %ptr, i32 %b seq_cst ret i32 %res @@ -282,24 +153,10 @@ define i32 @f11(i32 %dummy, ptr %src, i32 %b) { ; Check that indexed addresses are not allowed. 
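; (CS and CSY take only a base register plus displacement; there is no
; index field, so a base+index address has to be folded into the base with
; AGR before the loop, as the function below checks. The same shape written
; with a byte GEP instead of inttoptr arithmetic, under our own function
; name:)
define i32 @sample_indexed(i32 %dummy, ptr %base, i64 %index, i32 %b) {
  %addr = getelementptr i8, ptr %base, i64 %index
  %res = atomicrmw min ptr %addr, i32 %b seq_cst
  ret i32 %res
}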
define i32 @f12(i32 %dummy, i64 %base, i64 %index, i32 %b) { ; CHECK-LABEL: f12: -; CHECK: # %bb.0: -; CHECK-NEXT: l %r2, 0(%r4,%r3) -; CHECK-NEXT: agr %r3, %r4 -; CHECK-NEXT: j .LBB11_2 -; CHECK-NEXT: .LBB11_1: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB11_2 Depth=1 -; CHECK-NEXT: cs %r2, %r0, 0(%r3) -; CHECK-NEXT: je .LBB11_4 -; CHECK-NEXT: .LBB11_2: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lr %r0, %r2 -; CHECK-NEXT: crjle %r2, %r5, .LBB11_1 -; CHECK-NEXT: # %bb.3: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB11_2 Depth=1 -; CHECK-NEXT: lr %r0, %r5 -; CHECK-NEXT: j .LBB11_1 -; CHECK-NEXT: .LBB11_4: # %atomicrmw.end -; CHECK-NEXT: br %r14 +; CHECK: agr %r3, %r4 +; CHECK: l %r2, 0(%r3) +; CHECK: cs %r2, {{%r[0-9]+}}, 0(%r3) +; CHECK: ber %r14 %add = add i64 %base, %index %ptr = inttoptr i64 %add to ptr %res = atomicrmw min ptr %ptr, i32 %b seq_cst @@ -309,23 +166,16 @@ define i32 @f12(i32 %dummy, i64 %base, i64 %index, i32 %b) { ; Check that constants are handled. define i32 @f13(i32 %dummy, ptr %ptr) { ; CHECK-LABEL: f13: -; CHECK: # %bb.0: -; CHECK-NEXT: l %r2, 0(%r3) -; CHECK-NEXT: j .LBB12_2 -; CHECK-NEXT: .LBB12_1: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB12_2 Depth=1 -; CHECK-NEXT: cs %r2, %r0, 0(%r3) -; CHECK-NEXT: je .LBB12_4 -; CHECK-NEXT: .LBB12_2: # %atomicrmw.start -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lr %r0, %r2 -; CHECK-NEXT: cijl %r2, 43, .LBB12_1 -; CHECK-NEXT: # %bb.3: # %atomicrmw.start -; CHECK-NEXT: # in Loop: Header=BB12_2 Depth=1 -; CHECK-NEXT: lhi %r0, 42 -; CHECK-NEXT: j .LBB12_1 -; CHECK-NEXT: .LBB12_4: # %atomicrmw.end -; CHECK-NEXT: br %r14 +; CHECK: lhi [[LIMIT:%r[0-9]+]], 42 +; CHECK: j [[LOOP:\.[^:]*]] +; CHECK: [[BB1:\.[^:]*]]: +; CHECK: cs %r2, [[NEW:%r[0-9]+]], 0(%r3) +; CHECK: ber %r14 +; CHECK: [[LOOP]]: +; CHECK: lr [[NEW]], %r2 +; CHECK: crjle %r2, [[LIMIT]], [[KEEP:\..*]] +; CHECK: lhi [[NEW]], 42 +; CHECK: j [[BB1]] %res = atomicrmw min ptr %ptr, i32 42 seq_cst ret i32 %res } diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll index 9352118a32f8a..64e76e0a90eaf 100644 --- a/llvm/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll +++ b/llvm/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll @@ -1,31 +1,21 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; Test 64-bit atomic minimum and maximum. Here we match the z10 versions, ; which can't use LOCGR. ; ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s -; Todo: If-converter no longer producing CondReturns (with AtomicExpand pass). - ; Check signed minimum. 
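; (A note on the capture syntax these rewritten checks lean on: a pattern
; like [[NEW:%r[0-9]+]] binds NEW to the first register that matches, and a
; later bare [[NEW]] must then match exactly the same text, whereas
; repeating the pattern would rebind the variable rather than verify it.
; For instance:
; CHECK: lgr [[NEW:%r[0-9]+]], %r2
; CHECK: csg %r2, [[NEW]], 0(%r3)
; The patterns below rely on this to prove that the same scratch register
; flows from the lgr into the csg and that the j really targets the loop
; header.)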
define i64 @f1(i64 %dummy, ptr %src, i64 %b) {
; CHECK-LABEL: f1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lg %r2, 0(%r3)
-; CHECK-NEXT: j .LBB0_2
-; CHECK-NEXT: .LBB0_1: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT: csg %r2, %r0, 0(%r3)
-; CHECK-NEXT: je .LBB0_4
-; CHECK-NEXT: .LBB0_2: # %atomicrmw.start
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lgr %r0, %r2
-; CHECK-NEXT: cgrjle %r2, %r4, .LBB0_1
-; CHECK-NEXT: # %bb.3: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT: lgr %r0, %r4
-; CHECK-NEXT: j .LBB0_1
-; CHECK-NEXT: .LBB0_4: # %atomicrmw.end
-; CHECK-NEXT: br %r14
+; CHECK: lg %r2, 0(%r3)
+; CHECK: j [[LOOP:\.[^:]*]]
+; CHECK: [[BB1:\.[^:]*]]:
+; CHECK: csg %r2, [[NEW:%r[0-9]+]], 0(%r3)
+; CHECK: ber %r14
+; CHECK: [[LOOP]]:
+; CHECK: lgr [[NEW]], %r2
+; CHECK: cgrjle %r2, %r4, [[KEEP:\..*]]
+; CHECK: lgr [[NEW]], %r4
+; CHECK: j [[BB1]]
%res = atomicrmw min ptr %src, i64 %b seq_cst
ret i64 %res
}
@@ -33,23 +23,16 @@ define i64 @f1(i64 %dummy, ptr %src, i64 %b) {
; Check signed maximum.
define i64 @f2(i64 %dummy, ptr %src, i64 %b) {
; CHECK-LABEL: f2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lg %r2, 0(%r3)
-; CHECK-NEXT: j .LBB1_2
-; CHECK-NEXT: .LBB1_1: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: csg %r2, %r0, 0(%r3)
-; CHECK-NEXT: je .LBB1_4
-; CHECK-NEXT: .LBB1_2: # %atomicrmw.start
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lgr %r0, %r2
-; CHECK-NEXT: cgrjh %r2, %r4, .LBB1_1
-; CHECK-NEXT: # %bb.3: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: lgr %r0, %r4
-; CHECK-NEXT: j .LBB1_1
-; CHECK-NEXT: .LBB1_4: # %atomicrmw.end
-; CHECK-NEXT: br %r14
+; CHECK: lg %r2, 0(%r3)
+; CHECK: j [[LOOP:\.[^:]*]]
+; CHECK: [[BB1:\.[^:]*]]:
+; CHECK: csg %r2, [[NEW:%r[0-9]+]], 0(%r3)
+; CHECK: ber %r14
+; CHECK: [[LOOP]]:
+; CHECK: lgr [[NEW]], %r2
+; CHECK: cgrjhe %r2, %r4, [[KEEP:\..*]]
+; CHECK: lgr [[NEW]], %r4
+; CHECK: j [[BB1]]
%res = atomicrmw max ptr %src, i64 %b seq_cst
ret i64 %res
}
@@ -57,23 +40,16 @@ define i64 @f2(i64 %dummy, ptr %src, i64 %b) {
; Check unsigned minimum.
define i64 @f3(i64 %dummy, ptr %src, i64 %b) {
; CHECK-LABEL: f3:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lg %r2, 0(%r3)
-; CHECK-NEXT: j .LBB2_2
-; CHECK-NEXT: .LBB2_1: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
-; CHECK-NEXT: csg %r2, %r0, 0(%r3)
-; CHECK-NEXT: je .LBB2_4
-; CHECK-NEXT: .LBB2_2: # %atomicrmw.start
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lgr %r0, %r2
-; CHECK-NEXT: clgrjle %r2, %r4, .LBB2_1
-; CHECK-NEXT: # %bb.3: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
-; CHECK-NEXT: lgr %r0, %r4
-; CHECK-NEXT: j .LBB2_1
-; CHECK-NEXT: .LBB2_4: # %atomicrmw.end
-; CHECK-NEXT: br %r14
+; CHECK: lg %r2, 0(%r3)
+; CHECK: j [[LOOP:\.[^:]*]]
+; CHECK: [[BB1:\.[^:]*]]:
+; CHECK: csg %r2, [[NEW:%r[0-9]+]], 0(%r3)
+; CHECK: ber %r14
+; CHECK: [[LOOP]]:
+; CHECK: lgr [[NEW]], %r2
+; CHECK: clgrjle %r2, %r4, [[KEEP:\..*]]
+; CHECK: lgr [[NEW]], %r4
+; CHECK: j [[BB1]]
%res = atomicrmw umin ptr %src, i64 %b seq_cst
ret i64 %res
}
@@ -81,23 +57,16 @@ define i64 @f3(i64 %dummy, ptr %src, i64 %b) {
; Check unsigned maximum.
 define i64 @f4(i64 %dummy, ptr %src, i64 %b) {
 ; CHECK-LABEL: f4:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lg %r2, 0(%r3)
-; CHECK-NEXT: j .LBB3_2
-; CHECK-NEXT: .LBB3_1: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB3_2 Depth=1
-; CHECK-NEXT: csg %r2, %r0, 0(%r3)
-; CHECK-NEXT: je .LBB3_4
-; CHECK-NEXT: .LBB3_2: # %atomicrmw.start
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lgr %r0, %r2
-; CHECK-NEXT: clgrjh %r2, %r4, .LBB3_1
-; CHECK-NEXT: # %bb.3: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB3_2 Depth=1
-; CHECK-NEXT: lgr %r0, %r4
-; CHECK-NEXT: j .LBB3_1
-; CHECK-NEXT: .LBB3_4: # %atomicrmw.end
-; CHECK-NEXT: br %r14
+; CHECK: lg %r2, 0(%r3)
+; CHECK: j [[LOOP:\.[^:]*]]
+; CHECK: [[BB1:\.[^:]*]]:
+; CHECK: csg %r2, [[NEW:%r[0-9]+]], 0(%r3)
+; CHECK: ber %r14
+; CHECK: [[LOOP]]:
+; CHECK: lgr [[NEW]], %r2
+; CHECK: clgrjhe %r2, %r4, [[KEEP:\..*]]
+; CHECK: lgr [[NEW]], %r4
+; CHECK: j [[BB1]]
 %res = atomicrmw umax ptr %src, i64 %b seq_cst
 ret i64 %res
 }
@@ -105,23 +74,9 @@ define i64 @f4(i64 %dummy, ptr %src, i64 %b) {
 ; Check the high end of the aligned CSG range.
 define i64 @f5(i64 %dummy, ptr %src, i64 %b) {
 ; CHECK-LABEL: f5:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lg %r2, 524280(%r3)
-; CHECK-NEXT: j .LBB4_2
-; CHECK-NEXT: .LBB4_1: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB4_2 Depth=1
-; CHECK-NEXT: csg %r2, %r0, 524280(%r3)
-; CHECK-NEXT: je .LBB4_4
-; CHECK-NEXT: .LBB4_2: # %atomicrmw.start
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lgr %r0, %r2
-; CHECK-NEXT: cgrjle %r2, %r4, .LBB4_1
-; CHECK-NEXT: # %bb.3: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB4_2 Depth=1
-; CHECK-NEXT: lgr %r0, %r4
-; CHECK-NEXT: j .LBB4_1
-; CHECK-NEXT: .LBB4_4: # %atomicrmw.end
-; CHECK-NEXT: br %r14
+; CHECK: lg %r2, 524280(%r3)
+; CHECK: csg %r2, {{%r[0-9]+}}, 524280(%r3)
+; CHECK: ber %r14
 %ptr = getelementptr i64, ptr %src, i64 65535
 %res = atomicrmw min ptr %ptr, i64 %b seq_cst
 ret i64 %res
@@ -130,24 +85,10 @@ define i64 @f5(i64 %dummy, ptr %src, i64 %b) {
 ; Check the next doubleword up, which requires separate address logic.
 define i64 @f6(i64 %dummy, ptr %src, i64 %b) {
 ; CHECK-LABEL: f6:
-; CHECK: # %bb.0:
-; CHECK-NEXT: agfi %r3, 524288
-; CHECK-NEXT: lg %r2, 0(%r3)
-; CHECK-NEXT: j .LBB5_2
-; CHECK-NEXT: .LBB5_1: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT: csg %r2, %r0, 0(%r3)
-; CHECK-NEXT: je .LBB5_4
-; CHECK-NEXT: .LBB5_2: # %atomicrmw.start
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lgr %r0, %r2
-; CHECK-NEXT: cgrjle %r2, %r4, .LBB5_1
-; CHECK-NEXT: # %bb.3: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT: lgr %r0, %r4
-; CHECK-NEXT: j .LBB5_1
-; CHECK-NEXT: .LBB5_4: # %atomicrmw.end
-; CHECK-NEXT: br %r14
+; CHECK: agfi %r3, 524288
+; CHECK: lg %r2, 0(%r3)
+; CHECK: csg %r2, {{%r[0-9]+}}, 0(%r3)
+; CHECK: ber %r14
 %ptr = getelementptr i64, ptr %src, i64 65536
 %res = atomicrmw min ptr %ptr, i64 %b seq_cst
 ret i64 %res
@@ -156,23 +97,9 @@ define i64 @f6(i64 %dummy, ptr %src, i64 %b) {
 ; Check the low end of the CSG range.
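; Aside, not one of the checked tests: CSG encodes a signed 20-bit
; displacement, so byte offsets from -524288 to 524287 are addressable
; directly. A getelementptr over i64 scales its index by 8:
;   index 65535  -> 8 * 65535 = 524280    (f5: highest aligned in-range offset)
;   index 65536  -> 8 * 65536 = 524288    (f6: one past the top, hence the AGFI)
;   index -65536 -> 8 * -65536 = -524288  (f7 below: exactly the bottom of the range)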
 define i64 @f7(i64 %dummy, ptr %src, i64 %b) {
 ; CHECK-LABEL: f7:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lg %r2, -524288(%r3)
-; CHECK-NEXT: j .LBB6_2
-; CHECK-NEXT: .LBB6_1: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB6_2 Depth=1
-; CHECK-NEXT: csg %r2, %r0, -524288(%r3)
-; CHECK-NEXT: je .LBB6_4
-; CHECK-NEXT: .LBB6_2: # %atomicrmw.start
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lgr %r0, %r2
-; CHECK-NEXT: cgrjle %r2, %r4, .LBB6_1
-; CHECK-NEXT: # %bb.3: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB6_2 Depth=1
-; CHECK-NEXT: lgr %r0, %r4
-; CHECK-NEXT: j .LBB6_1
-; CHECK-NEXT: .LBB6_4: # %atomicrmw.end
-; CHECK-NEXT: br %r14
+; CHECK: lg %r2, -524288(%r3)
+; CHECK: csg %r2, {{%r[0-9]+}}, -524288(%r3)
+; CHECK: ber %r14
 %ptr = getelementptr i64, ptr %src, i64 -65536
 %res = atomicrmw min ptr %ptr, i64 %b seq_cst
 ret i64 %res
@@ -181,24 +108,10 @@ define i64 @f7(i64 %dummy, ptr %src, i64 %b) {
 ; Check the next doubleword down, which requires separate address logic.
 define i64 @f8(i64 %dummy, ptr %src, i64 %b) {
 ; CHECK-LABEL: f8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: agfi %r3, -524296
-; CHECK-NEXT: lg %r2, 0(%r3)
-; CHECK-NEXT: j .LBB7_2
-; CHECK-NEXT: .LBB7_1: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB7_2 Depth=1
-; CHECK-NEXT: csg %r2, %r0, 0(%r3)
-; CHECK-NEXT: je .LBB7_4
-; CHECK-NEXT: .LBB7_2: # %atomicrmw.start
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lgr %r0, %r2
-; CHECK-NEXT: cgrjle %r2, %r4, .LBB7_1
-; CHECK-NEXT: # %bb.3: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB7_2 Depth=1
-; CHECK-NEXT: lgr %r0, %r4
-; CHECK-NEXT: j .LBB7_1
-; CHECK-NEXT: .LBB7_4: # %atomicrmw.end
-; CHECK-NEXT: br %r14
+; CHECK: agfi %r3, -524296
+; CHECK: lg %r2, 0(%r3)
+; CHECK: csg %r2, {{%r[0-9]+}}, 0(%r3)
+; CHECK: ber %r14
 %ptr = getelementptr i64, ptr %src, i64 -65537
 %res = atomicrmw min ptr %ptr, i64 %b seq_cst
 ret i64 %res
@@ -207,24 +120,10 @@ define i64 @f8(i64 %dummy, ptr %src, i64 %b) {
 ; Check that indexed addresses are not allowed.
 define i64 @f9(i64 %dummy, i64 %base, i64 %index, i64 %b) {
 ; CHECK-LABEL: f9:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lg %r2, 0(%r4,%r3)
-; CHECK-NEXT: agr %r3, %r4
-; CHECK-NEXT: j .LBB8_2
-; CHECK-NEXT: .LBB8_1: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB8_2 Depth=1
-; CHECK-NEXT: csg %r2, %r0, 0(%r3)
-; CHECK-NEXT: je .LBB8_4
-; CHECK-NEXT: .LBB8_2: # %atomicrmw.start
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lgr %r0, %r2
-; CHECK-NEXT: cgrjle %r2, %r5, .LBB8_1
-; CHECK-NEXT: # %bb.3: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB8_2 Depth=1
-; CHECK-NEXT: lgr %r0, %r5
-; CHECK-NEXT: j .LBB8_1
-; CHECK-NEXT: .LBB8_4: # %atomicrmw.end
-; CHECK-NEXT: br %r14
+; CHECK: agr %r3, %r4
+; CHECK: lg %r2, 0(%r3)
+; CHECK: csg %r2, {{%r[0-9]+}}, 0(%r3)
+; CHECK: ber %r14
 %add = add i64 %base, %index
 %ptr = inttoptr i64 %add to ptr
 %res = atomicrmw min ptr %ptr, i64 %b seq_cst
@@ -234,23 +133,17 @@ define i64 @f9(i64 %dummy, i64 %base, i64 %index, i64 %b) {
 ; Check that constants are handled.
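; Aside, not one of the checked tests: with a constant operand the
; loop-invariant limit is materialized once (the LGHI into LIMIT in f10
; below, checked with CHECK-DAG since it may be scheduled around the load)
; rather than rebuilt each iteration, and every iteration just computes
; smin(old, 42). A scalar IR equivalent of one update step; the function
; name is illustrative only:
define i64 @min42_step_sketch(i64 %old) {
  %new = call i64 @llvm.smin.i64(i64 %old, i64 42)
  ret i64 %new
}
declare i64 @llvm.smin.i64(i64, i64)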
 define i64 @f10(i64 %dummy, ptr %ptr) {
 ; CHECK-LABEL: f10:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lg %r2, 0(%r3)
-; CHECK-NEXT: j .LBB9_2
-; CHECK-NEXT: .LBB9_1: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB9_2 Depth=1
-; CHECK-NEXT: csg %r2, %r0, 0(%r3)
-; CHECK-NEXT: je .LBB9_4
-; CHECK-NEXT: .LBB9_2: # %atomicrmw.start
-; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: lgr %r0, %r2
-; CHECK-NEXT: cgijl %r2, 43, .LBB9_1
-; CHECK-NEXT: # %bb.3: # %atomicrmw.start
-; CHECK-NEXT: # in Loop: Header=BB9_2 Depth=1
-; CHECK-NEXT: lghi %r0, 42
-; CHECK-NEXT: j .LBB9_1
-; CHECK-NEXT: .LBB9_4: # %atomicrmw.end
-; CHECK-NEXT: br %r14
+; CHECK-DAG: lghi [[LIMIT:%r[0-9]+]], 42
+; CHECK-DAG: lg %r2, 0(%r3)
+; CHECK: j [[LOOP:\.[^:]*]]
+; CHECK: [[BB1:\.[^:]*]]:
+; CHECK: csg %r2, [[NEW:%r[0-9]+]], 0(%r3)
+; CHECK: ber %r14
+; CHECK: [[LOOP]]:
+; CHECK: lgr [[NEW]], %r2
+; CHECK: cgrjle %r2, [[LIMIT]], [[KEEP:\..*]]
+; CHECK: lghi [[NEW]], 42
+; CHECK: j [[BB1]]
 %res = atomicrmw min ptr %ptr, i64 42 seq_cst
 ret i64 %res
 }
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-nand-03.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-nand-03.ll
index 8e9870f13013f..323eafb3e5b3a 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-nand-03.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-nand-03.ll
@@ -17,14 +17,14 @@ define i32 @f1(i32 %dummy, ptr %src, i32 %b) {
 ret i32 %res
 }
-; Check NANDs with different constant operands.
+; Check NANDs of 1.
 define i32 @f2(i32 %dummy, ptr %src) {
 ; CHECK-LABEL: f2:
 ; CHECK: l %r2, 0(%r3)
 ; CHECK: [[LABEL:\.[^ ]*]]:
 ; CHECK: lr %r0, %r2
+; CHECK: nilf %r0, 1
 ; CHECK: xilf %r0, 4294967295
-; CHECK: oilf %r0, 4294967294
 ; CHECK: cs %r2, %r0, 0(%r3)
 ; CHECK: jl [[LABEL]]
 ; CHECK: br %r14
@@ -32,55 +32,61 @@ define i32 @f2(i32 %dummy, ptr %src) {
 ret i32 %res
 }
+; Check NANDs of the low end of the NILH range.
 define i32 @f3(i32 %dummy, ptr %src) {
 ; CHECK-LABEL: f3:
+; CHECK: nilh %r0, 0
 ; CHECK: xilf %r0, 4294967295
-; CHECK: oilh %r0, 65535
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i32 65535 seq_cst
 ret i32 %res
 }
+; Check the next value up, which must use NILF.
 define i32 @f4(i32 %dummy, ptr %src) {
 ; CHECK-LABEL: f4:
+; CHECK: nilf %r0, 65536
 ; CHECK: xilf %r0, 4294967295
-; CHECK: oilf %r0, 4294901759
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i32 65536 seq_cst
 ret i32 %res
 }
+; Check the largest useful NILL value.
 define i32 @f5(i32 %dummy, ptr %src) {
 ; CHECK-LABEL: f5:
+; CHECK: nill %r0, 65534
 ; CHECK: xilf %r0, 4294967295
-; CHECK: oill %r0, 1
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i32 -2 seq_cst
 ret i32 %res
 }
+; Check the low end of the NILL range.
 define i32 @f6(i32 %dummy, ptr %src) {
 ; CHECK-LABEL: f6:
+; CHECK: nill %r0, 0
 ; CHECK: xilf %r0, 4294967295
-; CHECK: oill %r0, 65535
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i32 -65536 seq_cst
 ret i32 %res
 }
+; Check the largest useful NILH value, which is one less than the above.
 define i32 @f7(i32 %dummy, ptr %src) {
 ; CHECK-LABEL: f7:
+; CHECK: nilh %r0, 65534
 ; CHECK: xilf %r0, 4294967295
-; CHECK: oilh %r0, 1
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i32 -65537 seq_cst
 ret i32 %res
 }
+; Check the highest useful NILF value, which is one less than the above.
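; Aside, not one of the checked tests: nand(x, c) is ~(x & c), so each
; rewritten check pairs one AND-immediate (NILL/NILH touch only the
; low/high halfword, NILF the full word) with XILF 4294967295 for the
; final complement. For the constant in f8 below:
;   -65538 = 0xFFFEFFFE and 4294901758 = 0xFFFEFFFE, hence NILF.
; One update step in IR; the function name is illustrative only:
define i32 @nand_step_sketch(i32 %old) {
  %and = and i32 %old, -65538
  %new = xor i32 %and, -1       ; the XILF 4294967295
  ret i32 %new
}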
 define i32 @f8(i32 %dummy, ptr %src) {
 ; CHECK-LABEL: f8:
+; CHECK: nilf %r0, 4294901758
 ; CHECK: xilf %r0, 4294967295
-; CHECK: oilf %r0, 65537
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i32 -65538 seq_cst
 ret i32 %res
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-nand-04.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-nand-04.ll
index 98056c9c9a012..b37030255d7cc 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-nand-04.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-nand-04.ll
@@ -9,8 +9,8 @@ define i64 @f1(i64 %dummy, ptr %src, i64 %b) {
 ; CHECK: [[LABEL:\.[^:]*]]:
 ; CHECK: lgr %r0, %r2
 ; CHECK: ngr %r0, %r4
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
+; CHECK: lcgr %r0, %r0
+; CHECK: aghi %r0, -1
 ; CHECK: csg %r2, %r0, 0(%r3)
 ; CHECK: jl [[LABEL]]
 ; CHECK: br %r14
@@ -18,39 +18,40 @@ define i64 @f1(i64 %dummy, ptr %src, i64 %b) {
 ret i64 %res
 }
-; Check NANDs of 1, which are done using a register.
+; Check NANDs of 1, which are done using a register. (We could use RISBG
+; instead, but that isn't implemented yet.)
 define i64 @f2(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f2:
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oihf %r0, 4294967295
-; CHECK: oilf %r0, 4294967294
+; CHECK: ngr
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i64 1 seq_cst
 ret i64 %res
 }
+; Check the equivalent of NIHF with 1, which can use RISBG instead.
 define i64 @f3(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f3:
 ; CHECK: lg %r2, 0(%r3)
 ; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oihf %r0, 4294967294
+; CHECK: risbg %r0, %r2, 31, 191, 0
+; CHECK: lcgr %r0, %r0
+; CHECK: aghi %r0, -1
+; CHECK: csg %r2, %r0, 0(%r3)
 ; CHECK: jl [[LABEL]]
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i64 8589934591 seq_cst
 ret i64 %res
 }
+; Check the lowest NIHF value outside the range of RISBG.
 define i64 @f4(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f4:
 ; CHECK: lg %r2, 0(%r3)
 ; CHECK: [[LABEL:\.[^:]*]]:
 ; CHECK: lgr %r0, %r2
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oihf %r0, 4294967293
+; CHECK: nihf %r0, 2
+; CHECK: lcgr %r0, %r0
+; CHECK: aghi %r0, -1
 ; CHECK: csg %r2, %r0, 0(%r3)
 ; CHECK: jl [[LABEL]]
 ; CHECK: br %r14
@@ -58,133 +59,118 @@ define i64 @f4(i64 %dummy, ptr %src) {
 ret i64 %res
 }
+; Check the next value up, which must use a register.
 define i64 @f5(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f5:
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oihf %r0, 4294967292
-; CHECK: oilf %r0, 4294967295
+; CHECK: ngr
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i64 12884901888 seq_cst
 ret i64 %res
 }
+; Check the lowest NIHH value outside the range of RISBG.
 define i64 @f6(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f6:
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oihh %r0, 65533
+; CHECK: nihh {{%r[0-5]}}, 2
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i64 844424930131967 seq_cst
 ret i64 %res
 }
+; Check the next value up, which must use a register.
 define i64 @f7(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f7:
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oihf %r0, 4294901759
-; CHECK: oilf %r0, 4294967295
+; CHECK: ngr
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i64 281474976710656 seq_cst
 ret i64 %res
 }
+; Check the highest NILL value outside the range of RISBG.
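; Aside, not one of the checked tests: the 64-bit expansion has no
; full-width XOR-immediate to complement with, so the loops above build ~x
; from the two's-complement identity ~x = -x - 1: LCGR (load complement,
; i.e. negate) followed by AGHI -1. Equivalent IR; the function name is
; illustrative only:
define i64 @not_sketch(i64 %x) {
  %neg = sub i64 0, %x          ; lcgr
  %not = add i64 %neg, -1       ; aghi -1
  ret i64 %not                  ; same value as: xor i64 %x, -1
}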
 define i64 @f8(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f8:
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oill %r0, 5
+; CHECK: nill {{%r[0-5]}}, 65530
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i64 -6 seq_cst
 ret i64 %res
 }
+; Check the lowest NILL value outside the range of RISBG.
 define i64 @f9(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f9:
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oill %r0, 65533
+; CHECK: nill {{%r[0-5]}}, 2
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i64 -65534 seq_cst
 ret i64 %res
 }
+; Check the highest useful NILF value.
 define i64 @f10(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f10:
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oilf %r0, 65537
+; CHECK: nilf {{%r[0-5]}}, 4294901758
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i64 -65538 seq_cst
 ret i64 %res
 }
+; Check the highest NILH value outside the range of RISBG.
 define i64 @f11(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f11:
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oilh %r0, 5
+; CHECK: nilh {{%r[0-5]}}, 65530
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i64 -327681 seq_cst
 ret i64 %res
 }
+; Check the lowest NILH value outside the range of RISBG.
 define i64 @f12(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f12:
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oilh %r0, 65533
+; CHECK: nilh {{%r[0-5]}}, 2
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i64 -4294770689 seq_cst
 ret i64 %res
 }
+; Check the lowest NILF value outside the range of RISBG.
 define i64 @f13(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f13:
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oilf %r0, 4294967293
+; CHECK: nilf {{%r[0-5]}}, 2
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i64 -4294967294 seq_cst
 ret i64 %res
 }
+; Check the highest NIHL value outside the range of RISBG.
 define i64 @f14(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f14:
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oihl %r0, 5
+; CHECK: nihl {{%r[0-5]}}, 65530
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i64 -21474836481 seq_cst
 ret i64 %res
 }
+; Check the lowest NIHL value outside the range of RISBG.
 define i64 @f15(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f15:
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oihl %r0, 65533
+; CHECK: nihl {{%r[0-5]}}, 2
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i64 -281462091808769 seq_cst
 ret i64 %res
 }
+; Check the highest NIHH value outside the range of RISBG.
 define i64 @f16(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f16:
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oihh %r0, 5
+; CHECK: nihh {{%r[0-5]}}, 65530
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i64 -1407374883553281 seq_cst
 ret i64 %res
 }
+; Check the highest useful NIHF value.
 define i64 @f17(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f17:
-; CHECK: xihf %r0, 4294967295
-; CHECK: xilf %r0, 4294967295
-; CHECK: oihf %r0, 65537
+; CHECK: nihf {{%r[0-5]}}, 4294901758
 ; CHECK: br %r14
 %res = atomicrmw nand ptr %src, i64 -281479271677953 seq_cst
 ret i64 %res
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-or-04.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-or-04.ll
index e29097b10e169..a1322df49dbf8 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-or-04.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-or-04.ll
@@ -93,11 +93,11 @@ define i64 @f9(i64 %dummy, ptr %src) {
 ret i64 %res
 }
-; Check the next value up, which must use a register.
+; Check the next value up, which must use a register. (We could use
+; combinations of OIH* and OIL* instead, but that isn't implemented.)
 define i64 @f10(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f10:
-; CHECK: oihl %r0, 1
-; CHECK: oill %r0, 1
+; CHECK: ogr
 ; CHECK: br %r14
 %res = atomicrmw or ptr %src, i64 4294967297 seq_cst
 ret i64 %res
@@ -139,11 +139,10 @@ define i64 @f14(i64 %dummy, ptr %src) {
 ret i64 %res
 }
-; Check the next value up.
+; Check the next value up, which must use a register.
 define i64 @f15(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f15:
-; CHECK: oihh %r0, 65535
-; CHECK: oill %r0, 1
+; CHECK: ogr
 ; CHECK: br %r14
 %res = atomicrmw or ptr %src, i64 18446462598732840961 seq_cst
 ret i64 %res
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-sub-04.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-sub-04.ll
index d18c72f3b41e7..5d23d4e9ca155 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-sub-04.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-sub-04.ll
@@ -16,12 +16,13 @@ define i64 @f1(i64 %dummy, ptr %src, i64 %b) {
 ret i64 %res
 }
-; Check subtraction of 1.
+; Check subtraction of 1, which can use AGHI.
 define i64 @f2(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f2:
 ; CHECK: lg %r2, 0(%r3)
 ; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: lay %r0, -1(%r2)
+; CHECK: lgr %r0, %r2
+; CHECK: aghi %r0, -1
 ; CHECK: csg %r2, %r0, 0(%r3)
 ; CHECK: jl [[LABEL]]
 ; CHECK: br %r14
@@ -29,64 +30,82 @@ define i64 @f2(i64 %dummy, ptr %src) {
 ret i64 %res
 }
-; Check use of LAY.
+; Check the low end of the AGHI range.
 define i64 @f3(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f3:
-; CHECK: lay %r0, -32768(%r2)
+; CHECK: aghi %r0, -32768
 ; CHECK: br %r14
 %res = atomicrmw sub ptr %src, i64 32768 seq_cst
 ret i64 %res
 }
-; Check the low end of the AGFI range.
+; Check the next value up, which must use AGFI.
 define i64 @f4(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f4:
-; CHECK: agfi %r0, -2147483648
+; CHECK: agfi %r0, -32769
 ; CHECK: br %r14
- %res = atomicrmw sub ptr %src, i64 2147483648 seq_cst
+ %res = atomicrmw sub ptr %src, i64 32769 seq_cst
 ret i64 %res
 }
-; Check the next value up, which uses an SLGFI.
+; Check the low end of the AGFI range.
 define i64 @f5(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f5:
-; CHECK: slgfi
+; CHECK: agfi %r0, -2147483648
 ; CHECK: br %r14
- %res = atomicrmw sub ptr %src, i64 2147483649 seq_cst
+ %res = atomicrmw sub ptr %src, i64 2147483648 seq_cst
 ret i64 %res
 }
-; Check subtraction of -1, which can use LA.
+; Check the next value up, which must use a register operation.
 define i64 @f6(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f6:
-; CHECK: la %r0, 1(%r2)
+; CHECK: sgr
 ; CHECK: br %r14
- %res = atomicrmw sub ptr %src, i64 -1 seq_cst
+ %res = atomicrmw sub ptr %src, i64 2147483649 seq_cst
 ret i64 %res
 }
-; Check use of LAY.
+; Check subtraction of -1, which can use AGHI.
 define i64 @f7(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f7:
-; CHECK: lay %r0, 32767(%r2)
+; CHECK: aghi %r0, 1
 ; CHECK: br %r14
- %res = atomicrmw sub ptr %src, i64 -32767 seq_cst
+ %res = atomicrmw sub ptr %src, i64 -1 seq_cst
 ret i64 %res
 }
-; Check the high end of the AGFI range.
+; Check the high end of the AGHI range.
 define i64 @f8(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f8:
-; CHECK: agfi %r0, 2147483647
+; CHECK: aghi %r0, 32767
 ; CHECK: br %r14
- %res = atomicrmw sub ptr %src, i64 -2147483647 seq_cst
+ %res = atomicrmw sub ptr %src, i64 -32767 seq_cst
 ret i64 %res
 }
-; Check the next value down, which must use an ALGFI.
+; Check the next value down, which must use AGFI instead.
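; Aside, not one of the checked tests: 'atomicrmw sub' is implemented as
; addition of the negated operand, so the immediate that must be encodable
; is -N. AGHI takes a 16-bit signed immediate (-32768..32767): subtracting
; -32767 still fits (AGHI 32767, f8 above), but subtracting -32768 needs
; +32768, one past AGHI's maximum, hence the AGFI in f9 below. The same
; equivalence in IR; the function name is illustrative only:
define i64 @sub_as_add_sketch(ptr %src) {
  ; identical update to: atomicrmw sub ptr %src, i64 -32768 seq_cst
  %res = atomicrmw add ptr %src, i64 32768 seq_cst
  ret i64 %res
}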
 define i64 @f9(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f9:
-; CHECK: algfi
+; CHECK: agfi %r0, 32768
+; CHECK: br %r14
+ %res = atomicrmw sub ptr %src, i64 -32768 seq_cst
+ ret i64 %res
+}
+
+; Check the high end of the AGFI range.
+define i64 @f10(i64 %dummy, ptr %src) {
+; CHECK-LABEL: f10:
+; CHECK: agfi %r0, 2147483647
+; CHECK: br %r14
+ %res = atomicrmw sub ptr %src, i64 -2147483647 seq_cst
+ ret i64 %res
+}
+
+; Check the next value down, which must use a register operation.
+define i64 @f11(i64 %dummy, ptr %src) {
+; CHECK-LABEL: f11:
+; CHECK: sgr
 ; CHECK: br %r14
 %res = atomicrmw sub ptr %src, i64 -2147483648 seq_cst
 ret i64 %res
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-03.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-03.ll
index d991997245089..cf366b9f2e29a 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-03.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-03.ll
@@ -96,8 +96,8 @@ define i32 @f8(i32 %dummy, ptr %src, i32 %b) {
 ; Check that indexed addresses are not allowed.
 define i32 @f9(i32 %dummy, i64 %base, i64 %index, i32 %b) {
 ; CHECK-LABEL: f9:
-; CHECK: l %r2, 0(%r4,%r3)
 ; CHECK: agr %r3, %r4
+; CHECK: l %r2, 0(%r3)
 ; CHECK: cs %r2, {{%r[0-9]+}}, 0(%r3)
 ; CHECK: br %r14
 %add = add i64 %base, %index
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-04.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-04.ll
index 4797c5cf06798..9a493cb7fd8c7 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-04.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-xchg-04.ll
@@ -63,8 +63,8 @@ define i64 @f5(i64 %dummy, ptr %src, i64 %b) {
 ; Check that indexed addresses are not allowed.
 define i64 @f6(i64 %dummy, i64 %base, i64 %index, i64 %b) {
 ; CHECK-LABEL: f6:
-; CHECK: lg %r2, 0(%r4,%r3)
 ; CHECK: agr %r3, %r4
+; CHECK: lg %r2, 0(%r3)
 ; CHECK: csg %r2, {{%r[0-9]+}}, 0(%r3)
 ; CHECK: br %r14
 %add = add i64 %base, %index
diff --git a/llvm/test/CodeGen/SystemZ/atomicrmw-xor-04.ll b/llvm/test/CodeGen/SystemZ/atomicrmw-xor-04.ll
index ee5fc1cf415dc..6cf1b80b8d0cd 100644
--- a/llvm/test/CodeGen/SystemZ/atomicrmw-xor-04.ll
+++ b/llvm/test/CodeGen/SystemZ/atomicrmw-xor-04.ll
@@ -48,11 +48,11 @@ define i64 @f4(i64 %dummy, ptr %src) {
 ret i64 %res
 }
-; Check the next value up.
+; Check the next value up, which must use a register. (We could use
+; combinations of XIH* and XIL* instead, but that isn't implemented.)
 define i64 @f5(i64 %dummy, ptr %src) {
 ; CHECK-LABEL: f5:
-; CHECK: xihf %r0, 1
-; CHECK: xilf %r0, 1
+; CHECK: xgr
 ; CHECK: br %r14
 %res = atomicrmw xor ptr %src, i64 4294967297 seq_cst
 ret i64 %res
@@ -70,8 +70,7 @@ define i64 @f6(i64 %dummy, ptr %src) {
 ; Check the next value up, which must use a register.
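; Aside, not one of the checked tests: XILF and XIHF each XOR only one
; 32-bit half of the register. f7's constant flips bits in both halves,
;   -4294967295 = -(2^32 - 1) = 0xFFFFFFFF00000001
; so doing it with immediates would take an XIHF/XILF pair (as the removed
; checks show), and a single XGR against a register constant is preferred.
; The update step in IR; the function name is illustrative only:
define i64 @xor_step_sketch(i64 %x) {
  %r = xor i64 %x, -4294967295  ; flips the high word and the low bit
  ret i64 %r
}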
define i64 @f7(i64 %dummy, ptr %src) { ; CHECK-LABEL: f7: -; CHECK: xihf %r0, 4294967295 -; CHECK: xilf %r0, 1 +; CHECK: xgr ; CHECK: br %r14 %res = atomicrmw xor ptr %src, i64 -4294967295 seq_cst ret i64 %res diff --git a/llvm/test/CodeGen/X86/avx512fp16-arith.ll b/llvm/test/CodeGen/X86/avx512fp16-arith.ll index 8d811d8d29e06..77cab3d1512ca 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-arith.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-arith.ll @@ -329,7 +329,7 @@ define half @fcopysign(half %x, half %y) { ; CHECK-LABEL: fcopysign: ; CHECK: ## %bb.0: ; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; CHECK-NEXT: vpternlogd $226, %xmm1, %xmm2, %xmm0 +; CHECK-NEXT: vpternlogq $226, %xmm1, %xmm2, %xmm0 ; CHECK-NEXT: retq %a = call half @llvm.copysign.f16(half %x, half %y) ret half %a @@ -384,7 +384,7 @@ declare <8 x half> @llvm.fabs.v8f16(<8 x half>) define <8 x half> @fcopysignv8f16(<8 x half> %x, <8 x half> %y) { ; CHECK-LABEL: fcopysignv8f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; CHECK-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 ; CHECK-NEXT: retq %a = call <8 x half> @llvm.copysign.v8f16(<8 x half> %x, <8 x half> %y) ret <8 x half> %a @@ -439,7 +439,7 @@ declare <16 x half> @llvm.fabs.v16f16(<16 x half>) define <16 x half> @fcopysignv16f16(<16 x half> %x, <16 x half> %y) { ; CHECK-LABEL: fcopysignv16f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; CHECK-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 ; CHECK-NEXT: retq %a = call <16 x half> @llvm.copysign.v16f16(<16 x half> %x, <16 x half> %y) ret <16 x half> %a @@ -494,7 +494,7 @@ declare <32 x half> @llvm.fabs.v32f16(<32 x half>) define <32 x half> @fcopysignv32f16(<32 x half> %x, <32 x half> %y) { ; CHECK-LABEL: fcopysignv32f16: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; CHECK-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; CHECK-NEXT: retq %a = call <32 x half> @llvm.copysign.v32f16(<32 x half> %x, <32 x half> %y) ret <32 x half> %a diff --git a/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll b/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll index 493e7aa7039e3..5dcb1d63207d1 100644 --- a/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll +++ b/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll @@ -51,6 +51,11 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4 ; X86-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; X86-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; X86-NEXT: xorps %xmm0, %xmm0 +; X86-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; X86-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; X86-NEXT: mulps %xmm0, %xmm0 +; X86-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; X86-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; X86-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill @@ -59,10 +64,8 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4 ; X86-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; X86-NEXT: cmpunordps %xmm0, %xmm0 ; X86-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; X86-NEXT: xorps %xmm0, %xmm0 -; X86-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) 
## 16-byte Spill ; X86-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload -; X86-NEXT: minps %xmm0, %xmm0 +; X86-NEXT: minps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; X86-NEXT: xorps %xmm0, %xmm0 ; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) @@ -132,6 +135,11 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4 ; X64-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; X64-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; X64-NEXT: mulps %xmm0, %xmm0 +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; X64-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; X64-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill @@ -140,10 +148,8 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4 ; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; X64-NEXT: cmpunordps %xmm0, %xmm0 ; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; X64-NEXT: xorps %xmm0, %xmm0 -; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; X64-NEXT: minps %xmm0, %xmm0 +; X64-NEXT: minps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; X64-NEXT: xorl %ebx, %ebx ; X64-NEXT: xorps %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll index 8f1272013c11b..6cd85e074c648 100644 --- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll @@ -31,7 +31,7 @@ define <16 x i8> @splatconstant_fshl_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $3, %xmm0, %xmm2 ; GFNIAVX512-NEXT: vpsrlw $5, %xmm1, %xmm0 -; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 +; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0 ; GFNIAVX512-NEXT: retq %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) ret <16 x i8> %res @@ -119,7 +119,7 @@ define <32 x i8> @splatconstant_fshl_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $4, %ymm0, %ymm2 ; GFNIAVX512-NEXT: vpsrlw $4, %ymm1, %ymm0 -; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 +; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 ; GFNIAVX512-NEXT: retq %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> ) ret <32 x i8> %res @@ -175,7 +175,7 @@ define <32 x i8> @splatconstant_fshr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $2, %ymm0, %ymm2 ; GFNIAVX512-NEXT: vpsrlw $6, %ymm1, %ymm0 -; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 +; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 ; GFNIAVX512-NEXT: retq %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> ) ret <32 x i8> %res @@ -339,7 +339,7 @@ define <64 x i8> @splatconstant_fshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind ; GFNIAVX512: # %bb.0: ; 
GFNIAVX512-NEXT: vpsllw $6, %zmm0, %zmm2 ; GFNIAVX512-NEXT: vpsrlw $2, %zmm1, %zmm0 -; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; GFNIAVX512-NEXT: retq %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> ) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll index 8b94a3f84cb96..29b58d047596d 100644 --- a/llvm/test/CodeGen/X86/gfni-rotates.ll +++ b/llvm/test/CodeGen/X86/gfni-rotates.ll @@ -32,7 +32,7 @@ define <16 x i8> @splatconstant_rotl_v16i8(<16 x i8> %a) nounwind { ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $3, %xmm0, %xmm1 ; GFNIAVX512-NEXT: vpsrlw $5, %xmm0, %xmm0 -; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 ; GFNIAVX512-NEXT: retq %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) ret <16 x i8> %res @@ -121,7 +121,7 @@ define <32 x i8> @splatconstant_rotl_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $4, %ymm0, %ymm1 ; GFNIAVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 ; GFNIAVX512-NEXT: retq %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> ) ret <32 x i8> %res @@ -177,7 +177,7 @@ define <32 x i8> @splatconstant_rotr_v32i8(<32 x i8> %a) nounwind { ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $2, %ymm0, %ymm1 ; GFNIAVX512-NEXT: vpsrlw $6, %ymm0, %ymm0 -; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 ; GFNIAVX512-NEXT: retq %res = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a, <32 x i8> %a, <32 x i8> ) ret <32 x i8> %res @@ -344,7 +344,7 @@ define <64 x i8> @splatconstant_rotr_v64i8(<64 x i8> %a) nounwind { ; GFNIAVX512: # %bb.0: ; GFNIAVX512-NEXT: vpsllw $6, %zmm0, %zmm1 ; GFNIAVX512-NEXT: vpsrlw $2, %zmm0, %zmm0 -; GFNIAVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; GFNIAVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; GFNIAVX512-NEXT: retq %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a, <64 x i8> %a, <64 x i8> ) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index 5cd0c232de44d..7dd4af7689792 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -2010,7 +2010,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-v ; CHECK: # %bb.0: ; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; CHECK-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 ; CHECK-NEXT: retq %shl = shl <32 x i8> %a, %lshr = lshr <32 x i8> %a, @@ -2023,7 +2023,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-le ; CHECK: # %bb.0: ; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, 
%ymm0 +; CHECK-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 ; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; CHECK-NEXT: retq %shl = shl <32 x i8> %a, diff --git a/llvm/test/CodeGen/X86/pr74736.ll b/llvm/test/CodeGen/X86/pr74736.ll deleted file mode 100644 index 3dfdbf102c953..0000000000000 --- a/llvm/test/CodeGen/X86/pr74736.ll +++ /dev/null @@ -1,68 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX - -define void @main(<16 x i32> %0, i32 %1) { -; SSE-LABEL: main: -; SSE: # %bb.0: # %entry -; SSE-NEXT: movd %edi, %xmm4 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,0] -; SSE-NEXT: paddd %xmm0, %xmm0 -; SSE-NEXT: paddd %xmm1, %xmm1 -; SSE-NEXT: paddd %xmm3, %xmm3 -; SSE-NEXT: paddd %xmm2, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,1,3] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[1,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[1,3] -; SSE-NEXT: xorps %xmm2, %xmm0 -; SSE-NEXT: xorps %xmm4, %xmm1 -; SSE-NEXT: xorps %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, 0 -; SSE-NEXT: retq -; -; AVX-LABEL: main: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3] -; AVX-NEXT: movl $1, %eax -; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; AVX-NEXT: vpinsrd $3, %edi, %xmm2, %xmm2 -; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: vpaddd %ymm1, %ymm1, %ymm1 -; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,1,3,3,5,5,7] -; AVX-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] -; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vpxor %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, 0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq -entry: - %2 = insertelement <16 x i32> %0, i32 1, i64 1 - %3 = insertelement <16 x i32> %2, i32 %1, i64 3 - %4 = insertelement <16 x i32> %3, i32 0, i64 0 - %5 = shl <16 x i32> %4, - %6 = shufflevector <16 x i32> %5, <16 x i32> zeroinitializer, <16 x i32> - %7 = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %6) - store i32 %7, ptr null, align 4 - ret void -} -declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>) diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll index 853d8a278960d..6c07c4ca523f8 100644 --- a/llvm/test/CodeGen/X86/var-permute-256.ll +++ b/llvm/test/CodeGen/X86/var-permute-256.ll @@ -25,20 +25,18 @@ define <4 x i64> 
@var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind { ; ; AVX1-LABEL: var_shuffle_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [2,2] -; AVX1-NEXT: # xmm3 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vpermilpd %ymm1, %ymm4, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4 +; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vblendvpd %ymm3, %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v4i64: @@ -90,16 +88,15 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind { ; ; AVX1-LABEL: var_shuffle_v8i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,3,3,3] -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: var_shuffle_v8i32: @@ -448,20 +445,18 @@ define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) noun ; ; AVX1-LABEL: var_shuffle_v4f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [2,2] -; AVX1-NEXT: # xmm3 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vpermilpd %ymm1, %ymm4, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4 +; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vblendvpd %ymm3, %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq 
{{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v4f64: @@ -513,16 +508,15 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi ; ; AVX1-LABEL: var_shuffle_v8f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,3,3,3] -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps %ymm1, %ymm3, %ymm3 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX1-NEXT: vpermilps %ymm1, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: var_shuffle_v8f32: @@ -575,19 +569,17 @@ define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices) ; AVX1-LABEL: var_shuffle_v4i64_from_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [2,2] -; AVX1-NEXT: # xmm3 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm1 -; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 +; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm2 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v4i64_from_v2i64: @@ -641,15 +633,14 @@ define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices) ; AVX1-LABEL: var_shuffle_v8i32_from_v4i32: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,3,3,3] -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm3 +; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, 
%ymm1 +; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: var_shuffle_v8i32_from_v4i32: @@ -999,19 +990,17 @@ define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %in ; AVX1-LABEL: var_shuffle_v4f64_from_v2f64: ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [2,2] -; AVX1-NEXT: # xmm3 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm1 -; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 +; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm2 +; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v4f64_from_v2f64: @@ -1065,15 +1054,14 @@ define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indi ; AVX1-LABEL: var_shuffle_v8f32_from_v4f32: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,3,3,3] -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm3 +; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; INT256-LABEL: var_shuffle_v8f32_from_v4f32: @@ -1283,22 +1271,20 @@ define <4 x i64> @var_shuffle_v4i64_with_v16i8_indices(<4 x i64> %v, <16 x i8> % ; ; AVX1-LABEL: var_shuffle_v4i64_with_v16i8_indices: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm2 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [2,2] -; AVX1-NEXT: # xmm3 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vpaddq %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = 
ymm0[2,3,2,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vpermilpd %ymm1, %ymm4, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4 +; AVX1-NEXT: vpermilpd %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vblendvpd %ymm3, %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v4i64_with_v16i8_indices: diff --git a/llvm/test/CodeGen/X86/vec_fcopysign.ll b/llvm/test/CodeGen/X86/vec_fcopysign.ll index b34b02c90796b..457210732396b 100644 --- a/llvm/test/CodeGen/X86/vec_fcopysign.ll +++ b/llvm/test/CodeGen/X86/vec_fcopysign.ll @@ -159,8 +159,8 @@ define <8 x half> @fcopysign_v8f16(ptr %p0, ptr %p1) nounwind { ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512-NEXT: vmovdqa (%ecx), %xmm1 -; X86-AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147450879,2147450879,2147450879,2147450879] -; X86-AVX512-NEXT: vpternlogd $202, (%eax), %xmm1, %xmm0 +; X86-AVX512-NEXT: vpbroadcastw {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX512-NEXT: vpternlogq $202, (%eax), %xmm1, %xmm0 ; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v8f16: @@ -193,8 +193,8 @@ define <8 x half> @fcopysign_v8f16(ptr %p0, ptr %p1) nounwind { ; X64-AVX512-LABEL: fcopysign_v8f16: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm1 -; X64-AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2147450879,2147450879,2147450879,2147450879] -; X64-AVX512-NEXT: vpternlogd $202, (%rsi), %xmm1, %xmm0 +; X64-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm0 = [9223231297218904063,9223231297218904063] +; X64-AVX512-NEXT: vpternlogq $202, (%rsi), %xmm1, %xmm0 ; X64-AVX512-NEXT: retq %a0 = load <8 x half>, ptr %p0, align 16 %a1 = load <8 x half>, ptr %p1, align 16 @@ -405,8 +405,8 @@ define <16 x half> @fcopysign_v16f16(ptr %p0, ptr %p1) nounwind { ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512-NEXT: vmovdqu (%ecx), %ymm1 -; X86-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] -; X86-AVX512-NEXT: vpternlogd $202, (%eax), %ymm1, %ymm0 +; X86-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX512-NEXT: vpternlogq $202, (%eax), %ymm1, %ymm0 ; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v16f16: @@ -444,8 +444,8 @@ define <16 x half> @fcopysign_v16f16(ptr %p0, ptr %p1) nounwind { ; X64-AVX512-LABEL: fcopysign_v16f16: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm1 -; X64-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] -; X64-AVX512-NEXT: vpternlogd $202, (%rsi), %ymm1, %ymm0 +; X64-AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9223231297218904063,9223231297218904063,9223231297218904063,9223231297218904063] +; X64-AVX512-NEXT: vpternlogq $202, (%rsi), %ymm1, %ymm0 ; X64-AVX512-NEXT: retq %a0 = load <16 x half>, ptr %p0, align 16 %a1 = load <16 x half>, ptr %p1, align 16 @@ -691,14 +691,34 @@ define <32 x half> @fcopysign_v32f16(ptr %p0, ptr %p1) nounwind { ; X86-AVX2-NEXT: vpor %ymm2, %ymm1, 
%ymm1 ; X86-AVX2-NEXT: retl ; -; X86-AVX512-LABEL: fcopysign_v32f16: -; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX512-NEXT: vmovdqu64 (%ecx), %zmm1 -; X86-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] -; X86-AVX512-NEXT: vpternlogd $202, (%eax), %zmm1, %zmm0 -; X86-AVX512-NEXT: retl +; X86-AVX512VL-LABEL: fcopysign_v32f16: +; X86-AVX512VL: # %bb.0: +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512VL-NEXT: vmovdqu64 (%ecx), %zmm1 +; X86-AVX512VL-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X86-AVX512VL-NEXT: vpternlogq $202, (%eax), %zmm1, %zmm0 +; X86-AVX512VL-NEXT: retl +; +; X86-AVX512FP16-LABEL: fcopysign_v32f16: +; X86-AVX512FP16: # %bb.0: +; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512FP16-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512FP16-NEXT: vmovdqu64 (%ecx), %zmm1 +; X86-AVX512FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX512FP16-NEXT: vpternlogq $202, (%eax), %zmm1, %zmm0 +; X86-AVX512FP16-NEXT: retl +; +; X86-AVX512VLDQ-LABEL: fcopysign_v32f16: +; X86-AVX512VLDQ: # %bb.0: +; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512VLDQ-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX512VLDQ-NEXT: vmovdqu64 (%ecx), %zmm1 +; X86-AVX512VLDQ-NEXT: vpbroadcastw {{.*#+}} ymm0 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX512VLDQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; X86-AVX512VLDQ-NEXT: vpternlogq $202, (%eax), %zmm1, %zmm0 +; X86-AVX512VLDQ-NEXT: retl ; ; X64-SSE-LABEL: fcopysign_v32f16: ; X64-SSE: # %bb.0: @@ -749,8 +769,8 @@ define <32 x half> @fcopysign_v32f16(ptr %p0, ptr %p1) nounwind { ; X64-AVX512-LABEL: fcopysign_v32f16: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: vmovdqu64 (%rdi), %zmm1 -; X64-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 = [2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879,2147450879] -; X64-AVX512-NEXT: vpternlogd $202, (%rsi), %zmm1, %zmm0 +; X64-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [9223231297218904063,9223231297218904063,9223231297218904063,9223231297218904063,9223231297218904063,9223231297218904063,9223231297218904063,9223231297218904063] +; X64-AVX512-NEXT: vpternlogq $202, (%rsi), %zmm1, %zmm0 ; X64-AVX512-NEXT: retq %a0 = load <32 x half>, ptr %p0, align 16 %a1 = load <32 x half>, ptr %p1, align 16 @@ -766,6 +786,3 @@ declare <32 x half> @llvm.copysign.v32f16(<32 x half>, <32 x half>) ; X64-AVX512VLDQ: {{.*}} ; X86: {{.*}} ; X86-AVX: {{.*}} -; X86-AVX512FP16: {{.*}} -; X86-AVX512VL: {{.*}} -; X86-AVX512VLDQ: {{.*}} diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index 4f100cd3e0530..2d0e92a54846b 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -2409,7 +2409,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm2 ; 
AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2418,14 +2418,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2434,7 +2434,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper ; AVX512VBMI2-NEXT: retq @@ -2443,14 +2443,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 +; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOP-LABEL: splatconstant_funnnel_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index 05be4e1ee928e..43d426c326da7 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -2303,7 +2303,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; @@ -2311,14 +2311,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 +; AVX512VL-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512BW-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; @@ -2326,7 +2326,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VBMI2-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; @@ -2334,14 +2334,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 +; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX10-LABEL: splatconstant_funnnel_v32i8: ; AVX10: # %bb.0: ; AVX10-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX10-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX10-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 +; AVX10-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 ; AVX10-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll index 0724d879f557d..c246aaf61dbcf 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -1124,7 +1124,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v64i8: @@ -1137,35 +1137,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512VLVBMI2-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> ) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll index 9ddd171b4db69..37d4f3b3dff54 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -1851,7 +1851,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1860,14 +1860,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1876,14 +1876,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper ; AVX512VBMI2-NEXT: retq @@ -1892,7 +1892,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; 
AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOP-LABEL: splatconstant_funnnel_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index 58719e6bd8e0c..8762072f3e8f6 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -447,12 +447,12 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2 @@ -467,12 +467,12 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 @@ -1652,7 +1652,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; @@ -1660,14 +1660,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; @@ -1675,14 +1675,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { 
; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; @@ -1690,7 +1690,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll index edfa56a70d59e..0a473dd1ed824 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -131,15 +131,15 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm3 ; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm4 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160] -; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm5 = [17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520] +; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm4 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $6, %ymm2, %ymm4 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm6 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268] -; AVX512F-NEXT: vpternlogd $226, %zmm4, %zmm7, %zmm6 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm7 = [18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596] +; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm7, %zmm6 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4 @@ -151,12 +151,12 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4 +; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm4 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, 
%ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm7, %zmm4 +; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm7, %zmm4 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 @@ -173,15 +173,15 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm4 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160] -; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520] +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm5, %ymm4 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $6, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm6 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268] -; AVX512VL-NEXT: vpternlogd $226, %ymm4, %ymm7, %ymm6 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596] +; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm7, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4 @@ -192,12 +192,12 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm5, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm7, %ymm4 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm7, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 @@ -721,7 +721,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v64i8: @@ -733,35 +733,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogd $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VLVBMI2-NEXT: retq %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> ) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index eb2df0dcda98a..809735a88f208 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -2403,7 +2403,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512F-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2412,14 +2412,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512BW-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2428,7 +2428,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper ; AVX512VBMI2-NEXT: retq @@ -2437,14 +2437,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwi ; AVX512VLBW: 
# %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 +; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm2 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm1, %xmm0 -; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOP-LABEL: splatconstant_funnnel_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index 1a6ecea596563..fa41a10adb5e8 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -2104,7 +2104,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; @@ -2112,14 +2112,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512BW-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; @@ -2127,7 +2127,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VBMI2-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; @@ -2135,14 +2135,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwi ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 +; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX10-LABEL: splatconstant_funnnel_v32i8: ; AVX10: # %bb.0: ; AVX10-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX10-NEXT: vpsrlw $4, %ymm1, %ymm0 -; AVX10-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 +; AVX10-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm0 ; AVX10-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll 
b/llvm/test/CodeGen/X86/vector-fshr-512.ll index 1c6646152ab1f..b1fee9d1b0b39 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -1180,7 +1180,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v64i8: @@ -1193,35 +1193,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwi ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm2 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 ; AVX512VLVBMI2-NEXT: retq %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> ) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index 402eb73e18101..9c5fe49e7d0ca 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -1915,7 +1915,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1924,14 +1924,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1 ; 
AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1940,14 +1940,14 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper ; AVX512VBMI2-NEXT: retq @@ -1956,7 +1956,7 @@ define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; ; XOP-LABEL: splatconstant_funnnel_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index bb311468ce913..e4867ba096968 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -473,17 +473,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm2 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -492,17 +492,17 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512VL: # %bb.0: ; 
AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -1703,7 +1703,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; @@ -1711,14 +1711,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; @@ -1726,14 +1726,14 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v32i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; @@ -1741,7 +1741,7 @@ define <32 x i8> @splatconstant_funnnel_v32i8(<32 x i8> %x) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 ; 
AVX512VLVBMI2-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll index 4364c047dfdeb..3c17bf2f6b9a6 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -131,36 +131,36 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm4 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135] -; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm5 = [1085102592571150095,1085102592571150095,1085102592571150095,1085102592571150095,1085102592571150095,1085102592571150095,1085102592571150095,1085102592571150095] +; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm4 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $6, %ymm2, %ymm4 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm6 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = [1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567] -; AVX512F-NEXT: vpternlogd $226, %zmm4, %zmm7, %zmm6 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm7 = [4557430888798830399,4557430888798830399,4557430888798830399,4557430888798830399,4557430888798830399,4557430888798830399,4557430888798830399,4557430888798830399] +; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm7, %zmm6 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm4 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm6 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm8 = [2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143] -; AVX512F-NEXT: vpternlogd $226, %zmm4, %zmm8, %zmm6 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm8 = [9187201950435737471,9187201950435737471,9187201950435737471,9187201950435737471,9187201950435737471,9187201950435737471,9187201950435737471,9187201950435737471] +; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm8, %zmm6 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4 +; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm4 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $6, %ymm0, %ymm3 ; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm7, %zmm4 +; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm7, %zmm4 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3 ; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm8, %zmm4 +; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm8, %zmm4 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ 
-171,36 +171,36 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm3 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm4 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = [252645135,252645135,252645135,252645135,252645135,252645135,252645135,252645135] -; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1085102592571150095,1085102592571150095,1085102592571150095,1085102592571150095] +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm5, %ymm4 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $6, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsrlw $2, %ymm2, %ymm6 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567,1061109567] -; AVX512VL-NEXT: vpternlogd $226, %ymm4, %ymm7, %ymm6 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [4557430888798830399,4557430888798830399,4557430888798830399,4557430888798830399] +; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm7, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsrlw $1, %ymm2, %ymm6 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143,2139062143] -; AVX512VL-NEXT: vpternlogd $226, %ymm4, %ymm8, %ymm6 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm8 = [9187201950435737471,9187201950435737471,9187201950435737471,9187201950435737471] +; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm8, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm5, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $6, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsrlw $2, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm7, %ymm4 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm7, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsrlw $1, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm8, %ymm4 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm8, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -721,7 +721,7 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v64i8: @@ -733,35 +733,35 @@ define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; 
AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VLVBMI2-NEXT: retq %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> ) ret <64 x i8> %res diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index e2195f1fc25a1..c9cbb0994810f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -156,62 +156,64 @@ define void @load_i8_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i8_stride4_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm3 +; SSE-NEXT: movdqa (%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rdi), %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 
= xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] +; SSE-NEXT: packuswb %xmm8, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshufd 
{{.*#+}} xmm2 = xmm2[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: packuswb %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] ; SSE-NEXT: movq %xmm0, (%rsi) -; SSE-NEXT: movq %xmm6, (%rdx) -; SSE-NEXT: movq %xmm5, (%rcx) -; SSE-NEXT: movq %xmm1, (%r8) +; SSE-NEXT: movq %xmm7, (%rdx) +; SSE-NEXT: movq %xmm1, (%rcx) +; SSE-NEXT: movq %xmm2, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride4_vf8: @@ -294,111 +296,111 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride4_vf16: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa 48(%rdi), %xmm10 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: pand %xmm2, %xmm7 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pand %xmm2, %xmm11 -; SSE-NEXT: packuswb %xmm7, %xmm11 -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm12 -; SSE-NEXT: pand %xmm2, %xmm12 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: packuswb %xmm12, %xmm2 -; SSE-NEXT: packuswb %xmm11, %xmm2 -; SSE-NEXT: pxor %xmm11, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,2,2,3] -; 
SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1]
-; SSE-NEXT: packuswb %xmm13, %xmm14
-; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm13[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1]
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
+; SSE-NEXT: movdqa 32(%rdi), %xmm8
+; SSE-NEXT: movdqa 48(%rdi), %xmm13
+; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0]
+; SSE-NEXT: movdqa %xmm13, %xmm3
+; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm8, %xmm4
+; SSE-NEXT: pand %xmm0, %xmm4
+; SSE-NEXT: packuswb %xmm3, %xmm4
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: packuswb %xmm3, %xmm0
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm10, %xmm10
+; SSE-NEXT: movdqa %xmm13, %xmm3
+; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm13, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; SSE-NEXT: movdqa %xmm8, %xmm5
+; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[1,3,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm8, %xmm6
+; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm11[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1]
+; SSE-NEXT: packuswb %xmm7, %xmm14
+; SSE-NEXT: movdqa %xmm2, %xmm7
+; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,1,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm2, %xmm9
+; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm12[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
 ; SSE-NEXT: movdqa %xmm1, %xmm12
-; SSE-NEXT: movdqa %xmm1, %xmm13
-; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
-; SSE-NEXT: packuswb %xmm15, %xmm11
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm14[0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm10, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pand %xmm10, %xmm9
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm0, %xmm9
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pand %xmm10, %xmm0
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm1, %xmm11
+; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
+; SSE-NEXT: packuswb %xmm15, %xmm10
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm14[0,3]
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm1, %xmm13
+; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,5,4]
+; SSE-NEXT: pand %xmm1, %xmm8
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,3,2,4,5,6,7]
+; SSE-NEXT: packuswb %xmm13, %xmm8
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; SSE-NEXT: pand %xmm1, %xmm13
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,6,5,4]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[3,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pand %xmm10, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
+; SSE-NEXT: packuswb %xmm2, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm8[0,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,1,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
 ; SSE-NEXT: packuswb %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[3,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[3,1,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,1,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
 ; SSE-NEXT: packuswb %xmm3, %xmm5
 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3]
-; SSE-NEXT: movdqa %xmm2, (%rsi)
-; SSE-NEXT: movaps %xmm11, (%rdx)
+; SSE-NEXT: movdqa %xmm0, (%rsi)
+; SSE-NEXT: movaps %xmm10, (%rdx)
 ; SSE-NEXT: movaps %xmm1, (%rcx)
 ; SSE-NEXT: movaps %xmm5, (%r8)
 ; SSE-NEXT: retq
 ;
@@ -528,247 +530,249 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE: # %bb.0:
 ; SSE-NEXT: subq $136, %rsp
 ; SSE-NEXT: movdqa 64(%rdi), %xmm4
-; SSE-NEXT: movdqa 80(%rdi), %xmm9
-; SSE-NEXT: movdqa 96(%rdi), %xmm11
-; SSE-NEXT: movdqa 112(%rdi), %xmm12
-; SSE-NEXT: movdqa (%rdi), %xmm13
-; SSE-NEXT: movdqa 16(%rdi), %xmm14
-; SSE-NEXT: movdqa 32(%rdi), %xmm2
-; SSE-NEXT: movdqa 48(%rdi), %xmm10
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,255,0,255,0,255,0]
-; SSE-NEXT: movdqa %xmm10, %xmm3
-; SSE-NEXT: movdqa %xmm10, %xmm5
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm15
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 80(%rdi), %xmm5
+; SSE-NEXT: movdqa 96(%rdi), %xmm15
+; SSE-NEXT: movdqa 112(%rdi), %xmm14
+; SSE-NEXT: movdqa (%rdi), %xmm12
+; SSE-NEXT: movdqa 16(%rdi), %xmm11
+; SSE-NEXT: movdqa 32(%rdi), %xmm7
+; SSE-NEXT: movdqa 48(%rdi), %xmm3
+; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0]
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: pand %xmm6, %xmm0
+; SSE-NEXT: movdqa %xmm7, %xmm1
+; SSE-NEXT: pand %xmm6, %xmm1
 ; SSE-NEXT: packuswb %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm11, %xmm0
+; SSE-NEXT: pand %xmm6, %xmm0
+; SSE-NEXT: movdqa %xmm12, %xmm2
+; SSE-NEXT: pand %xmm6, %xmm2
+; SSE-NEXT: packuswb %xmm0, %xmm2
+; SSE-NEXT: packuswb %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: movdqa %xmm13, %xmm7
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: packuswb %xmm0, %xmm7
-; SSE-NEXT: packuswb %xmm1, %xmm7
-; SSE-NEXT: movdqa %xmm12, %xmm0
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: movdqa %xmm11, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pand %xmm6, %xmm0
+; SSE-NEXT: movdqa %xmm15, %xmm1
+; SSE-NEXT: pand %xmm6, %xmm1
 ; SSE-NEXT: packuswb %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: pand %xmm4, %xmm8
-; SSE-NEXT: packuswb %xmm0, %xmm8
-; SSE-NEXT: packuswb %xmm1, %xmm8
-; SSE-NEXT: pxor %xmm2, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT: movdqa %xmm5, %xmm0
+; SSE-NEXT: movdqa %xmm5, %xmm2
+; SSE-NEXT: pand %xmm6, %xmm0
+; SSE-NEXT: pand %xmm4, %xmm6
+; SSE-NEXT: packuswb %xmm0, %xmm6
+; SSE-NEXT: packuswb %xmm1, %xmm6
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm15, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm3, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; SSE-NEXT: packuswb %xmm3, %xmm5
-; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE-NEXT: movdqa %xmm7, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm7, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; SSE-NEXT: packuswb %xmm5, %xmm8
+; SSE-NEXT: movdqa %xmm11, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm14, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm11, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm13, %xmm14
-; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm13, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm12, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
 ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: packuswb %xmm3, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm12, %xmm15
-; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm12, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE-NEXT: movdqa %xmm12, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm5[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
+; SSE-NEXT: packuswb %xmm9, %xmm13
+; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm8[0,3]
+; SSE-NEXT: movdqa %xmm14, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm11, %xmm12
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm11, %xmm13
-; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; SSE-NEXT: packuswb %xmm3, %xmm0
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm9, %xmm11
-; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: movdqa %xmm4, %xmm5
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm4, %xmm6
-; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[1,3,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm14, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm15, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[1,3,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm15, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
+; SSE-NEXT: packuswb %xmm8, %xmm0
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm10
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm5, %xmm9
+; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15]
+; SSE-NEXT: movdqa %xmm5, %xmm8
+; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
 ; SSE-NEXT: packuswb %xmm1, %xmm4
 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,7,5,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm2, %xmm1
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; SSE-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pand %xmm3, %xmm10
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm2, %xmm10
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm1[0,3]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pand %xmm3, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; SSE-NEXT: pand %xmm0, %xmm7
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pand %xmm3, %xmm2
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
 ; SSE-NEXT: packuswb %xmm1, %xmm2
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm0, %xmm11
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pand %xmm3, %xmm0
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4]
+; SSE-NEXT: pand %xmm0, %xmm12
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
+; SSE-NEXT: packuswb %xmm3, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[0,3]
+; SSE-NEXT: pand %xmm0, %xmm14
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE-NEXT: pand %xmm0, %xmm15
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[1,0,3,2,4,5,6,7]
+; SSE-NEXT: packuswb %xmm2, %xmm7
+; SSE-NEXT: pand %xmm0, %xmm10
+; SSE-NEXT: pand %xmm0, %xmm5
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[1,0,3,2,4,5,6,7]
+; SSE-NEXT: packuswb %xmm0, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm7[0,3]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm2 = mem[3,1,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT: packuswb %xmm2, %xmm3
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE-NEXT: # xmm7 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
+; SSE-NEXT: packuswb %xmm2, %xmm7
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; SSE-NEXT: # xmm2 = mem[3,1,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: pshufd $231, (%rsp), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
-; SSE-NEXT: packuswb %xmm2, %xmm14
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm3[0,3]
-; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; SSE-NEXT: # xmm1 = mem[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,1,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; SSE-NEXT: # xmm0 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[0,3]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[3,1,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT: packuswb %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[3,1,2,3]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE-NEXT: # xmm7 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; SSE-NEXT: # xmm11 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1]
+; SSE-NEXT: packuswb %xmm7, %xmm11
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; SSE-NEXT: # xmm2 = mem[3,1,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; SSE-NEXT: packuswb %xmm2, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3]
-; SSE-NEXT: movdqa %xmm8, 16(%rsi)
-; SSE-NEXT: movdqa %xmm7, (%rsi)
+; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; SSE-NEXT: # xmm7 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
+; SSE-NEXT: packuswb %xmm7, %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm11[0,3]
+; SSE-NEXT: movdqa %xmm6, 16(%rsi)
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-NEXT: movaps %xmm2, (%rsi)
 ; SSE-NEXT: movaps %xmm4, 16(%rdx)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movaps %xmm1, (%rdx)
-; SSE-NEXT: movaps %xmm0, 16(%rcx)
-; SSE-NEXT: movaps %xmm10, (%rcx)
-; SSE-NEXT: movaps %xmm5, 16(%r8)
-; SSE-NEXT: movaps %xmm14, (%r8)
+; SSE-NEXT: movaps %xmm13, (%rdx)
+; SSE-NEXT: movaps %xmm3, 16(%rcx)
+; SSE-NEXT: movaps %xmm1, (%rcx)
+; SSE-NEXT: movaps %xmm8, 16(%r8)
+; SSE-NEXT: movaps %xmm0, (%r8)
 ; SSE-NEXT: addq $136, %rsp
 ; SSE-NEXT: retq
 ;
@@ -1019,518 +1023,522 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
 ; SSE-LABEL: load_i8_stride4_vf64:
 ; SSE: # %bb.0:
-; SSE-NEXT: subq $600, %rsp # imm = 0x258
+; SSE-NEXT: subq $664, %rsp # imm = 0x298
 ; SSE-NEXT: movdqa 16(%rdi), %xmm8
 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 32(%rdi), %xmm15
+; SSE-NEXT: movdqa 32(%rdi), %xmm6
 ; SSE-NEXT: movdqa 48(%rdi), %xmm14
-; SSE-NEXT: movdqa 128(%rdi), %xmm4
-; SSE-NEXT: movdqa 144(%rdi), %xmm7
-; SSE-NEXT: movdqa 160(%rdi), %xmm10
-; SSE-NEXT: movdqa 176(%rdi), %xmm12
-; SSE-NEXT: movdqa 64(%rdi), %xmm6
-; SSE-NEXT: movdqa 80(%rdi), %xmm13
+; SSE-NEXT: movdqa 128(%rdi), %xmm15
+; SSE-NEXT: movdqa 144(%rdi), %xmm10
+; SSE-NEXT: movdqa 160(%rdi), %xmm11
+; SSE-NEXT: movdqa 176(%rdi), %xmm3
+; SSE-NEXT: movdqa 64(%rdi), %xmm13
+; SSE-NEXT: movdqa 80(%rdi), %xmm7
 ; SSE-NEXT: movdqa 96(%rdi), %xmm2
 ; SSE-NEXT: movdqa 112(%rdi), %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,255,0,255,0,255,0]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,0,255,0,255,0]
 ; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pand %xmm9, %xmm0
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pand %xmm5, %xmm0
 ; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: pand %xmm9, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm9
+; SSE-NEXT: pand %xmm5, %xmm1
 ; SSE-NEXT: packuswb %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm13, %xmm0
-; SSE-NEXT: pand %xmm9, %xmm0
-; SSE-NEXT: movdqa %xmm6, %xmm2
-; SSE-NEXT: pand %xmm9, %xmm2
+; SSE-NEXT: movdqa %xmm7, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm0
+; SSE-NEXT: movdqa %xmm13, %xmm2
+; SSE-NEXT: pand %xmm5, %xmm2
 ; SSE-NEXT: packuswb %xmm0, %xmm2
 ; SSE-NEXT: packuswb %xmm1, %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm12, %xmm0
-; SSE-NEXT: pand %xmm9, %xmm0
-; SSE-NEXT: movdqa %xmm10, %xmm1
-; SSE-NEXT: pand %xmm9, %xmm1
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm0
+; SSE-NEXT: movdqa %xmm11, %xmm1
+; SSE-NEXT: pand %xmm5, %xmm1
 ; SSE-NEXT: packuswb %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: pand %xmm9, %xmm0
-; SSE-NEXT: movdqa %xmm4, %xmm2
-; SSE-NEXT: pand %xmm9, %xmm2
+; SSE-NEXT: movdqa %xmm10, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm0
+; SSE-NEXT: movdqa %xmm15, %xmm2
+; SSE-NEXT: pand %xmm5, %xmm2
 ; SSE-NEXT: packuswb %xmm0, %xmm2
 ; SSE-NEXT: packuswb %xmm1, %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: pand %xmm9, %xmm0
-; SSE-NEXT: movdqa %xmm15, %xmm1
-; SSE-NEXT: pand %xmm9, %xmm1
+; SSE-NEXT: pand %xmm5, %xmm0
+; SSE-NEXT: movdqa %xmm6, %xmm1
+; SSE-NEXT: pand %xmm5, %xmm1
 ; SSE-NEXT: packuswb %xmm0, %xmm1
 ; SSE-NEXT: movdqa %xmm8, %xmm0
-; SSE-NEXT: pand %xmm9, %xmm0
-; SSE-NEXT: movdqa (%rdi), %xmm11
-; SSE-NEXT: movdqa %xmm11, %xmm2
-; SSE-NEXT: pand %xmm9, %xmm2
+; SSE-NEXT: pand %xmm5, %xmm0
+; SSE-NEXT: movdqa (%rdi), %xmm12
+; SSE-NEXT: movdqa %xmm12, %xmm2
+; SSE-NEXT: pand %xmm5, %xmm2
 ; SSE-NEXT: packuswb %xmm0, %xmm2
 ; SSE-NEXT: packuswb %xmm1, %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 240(%rdi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm9, %xmm0
+; SSE-NEXT: movdqa 240(%rdi), %xmm8
+; SSE-NEXT: movdqa %xmm8, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm0
 ; SSE-NEXT: movdqa 224(%rdi), %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm9, %xmm1
+; SSE-NEXT: pand %xmm5, %xmm1
 ; SSE-NEXT: packuswb %xmm0, %xmm1
 ; SSE-NEXT: movdqa 208(%rdi), %xmm0
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm9, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm0
 ; SSE-NEXT: movdqa 192(%rdi), %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pand %xmm2, %xmm9
-; SSE-NEXT: packuswb %xmm0, %xmm9
-; SSE-NEXT: packuswb %xmm1, %xmm9
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: pxor %xmm8, %xmm8
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE-NEXT: pand %xmm2, %xmm5
+; SSE-NEXT: packuswb %xmm0, %xmm5
+; SSE-NEXT: packuswb %xmm1, %xmm5
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: movdqa %xmm4, %xmm1
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm5, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm9, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; SSE-NEXT: packuswb %xmm3, %xmm5
-; SSE-NEXT: movdqa %xmm13, %xmm1
-; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm13, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
+; SSE-NEXT: packuswb %xmm4, %xmm9
+; SSE-NEXT: movdqa %xmm7, %xmm1
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm7, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm6, %xmm1
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm6, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm13, %xmm1
+; SSE-NEXT: movdqa %xmm13, (%rsp) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm13, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[1,3,2,3,4,5,6,7]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: packuswb %xmm3, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3]
+; SSE-NEXT: packuswb %xmm4, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm12, %xmm1
-; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm12, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm3, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm10, %xmm1
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm11, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
-; SSE-NEXT: packuswb %xmm3, %xmm6
-; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
+; SSE-NEXT: packuswb %xmm4, %xmm9
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm10, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm7, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm15, %xmm1
+; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm15, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm4, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
-; SSE-NEXT: packuswb %xmm3, %xmm10
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm6[0,3]
-; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: packuswb %xmm4, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3]
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm14, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7]
-; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm15, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE-NEXT: movdqa %xmm14, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: movdqa %xmm6, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[1,3,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm15, %xmm12
-; SSE-NEXT: movdqa %xmm15, %xmm14
-; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE-NEXT: packuswb %xmm3, %xmm0
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm15, %xmm13
-; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: movdqa %xmm11, %xmm7
-; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm11, %xmm6
-; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; SSE-NEXT: packuswb %xmm1, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,3]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm0[1,3,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm6, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm6, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm6, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE-NEXT: movdqa %xmm12, %xmm3
+; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,3,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm12, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm4[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1]
+; SSE-NEXT: packuswb %xmm1, %xmm15
+; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm0[0,3]
+; SSE-NEXT: movdqa %xmm8, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE-NEXT: movdqa %xmm8, %xmm1
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm10, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; SSE-NEXT: movdqa %xmm10, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
 ; SSE-NEXT: packuswb %xmm1, %xmm3
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm11, %xmm0
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
+; SSE-NEXT: movdqa %xmm11, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; SSE-NEXT: movdqa %xmm9, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm9, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,3,2,3,4,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[1,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE-NEXT: packuswb %xmm2, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: packuswb %xmm0, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,5,6,7]
 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
+; SSE-NEXT: pand %xmm0, %xmm14
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE-NEXT: pand %xmm0, %xmm7
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
+; SSE-NEXT: packuswb %xmm2, %xmm3
+; SSE-NEXT: pand %xmm0, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE-NEXT: pand %xmm0, %xmm12
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm4[1,0,3,2,4,5,6,7]
+; SSE-NEXT: packuswb %xmm2, %xmm12
+; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm3[0,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm3, %xmm4
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
+; SSE-NEXT: packuswb %xmm2, %xmm3
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,7,6,5,4]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,7,6,5,4]
+; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
 ; SSE-NEXT: pand %xmm0, %xmm2
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm8, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm4[0,3]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
+; SSE-NEXT: packuswb %xmm4, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
 ; SSE-NEXT: pand %xmm0, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
 ; SSE-NEXT: packuswb %xmm3, %xmm4
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pand %xmm0, %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm3, %xmm11
-; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm4[0,3]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pand %xmm0, %xmm3
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pand %xmm0, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm3, %xmm4
-; SSE-NEXT: pshuflw $231, (%rsp), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,7,6,5,4]
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pand %xmm0, %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm8[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm3, %xmm14
-; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm4[0,3]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; SSE-NEXT: #
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; SSE-NEXT: #
xmm4 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm7 +; SSE-NEXT: pand %xmm0, %xmm11 +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm7[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: packuswb %xmm4, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] -; 
SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[3,1,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] +; SSE-NEXT: packuswb %xmm4, %xmm7 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: packuswb %xmm4, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm8[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE-NEXT: packuswb %xmm4, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: packuswb %xmm4, %xmm8 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = 
xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: packuswb %xmm7, %xmm8 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: # xmm7 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE-NEXT: packuswb %xmm4, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: packuswb %xmm10, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm8[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: packuswb %xmm4, %xmm12 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: packuswb %xmm8, %xmm10 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 
16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE-NEXT: packuswb %xmm4, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm12[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] +; SSE-NEXT: packuswb %xmm11, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm10[0,3] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: # xmm12 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] -; SSE-NEXT: packuswb %xmm4, %xmm12 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: # xmm11 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] +; SSE-NEXT: packuswb %xmm10, %xmm11 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: # xmm3 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] -; SSE-NEXT: packuswb %xmm4, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm12[0,3] -; SSE-NEXT: movdqa %xmm9, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movaps %xmm3, 16(%rsi) +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] +; SSE-NEXT: packuswb %xmm10, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm11[0,3] +; SSE-NEXT: movdqa %xmm5, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, 16(%rsi) ; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps %xmm5, (%rdx) -; SSE-NEXT: movaps %xmm10, 32(%rdx) +; SSE-NEXT: movaps %xmm15, (%rdx) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps %xmm0, 48(%rcx) -; SSE-NEXT: movaps %xmm14, 32(%rcx) -; SSE-NEXT: movaps %xmm11, 16(%rcx) -; SSE-NEXT: movaps %xmm2, (%rcx) +; SSE-NEXT: movaps %xmm6, 48(%rcx) +; SSE-NEXT: movaps %xmm3, 32(%rcx) +; SSE-NEXT: movaps %xmm2, 16(%rcx) +; SSE-NEXT: movaps %xmm12, (%rcx) ; SSE-NEXT: movaps %xmm13, 48(%r8) ; SSE-NEXT: movaps %xmm8, 32(%r8) ; SSE-NEXT: movaps %xmm7, 16(%r8) -; SSE-NEXT: movaps %xmm6, (%r8) -; SSE-NEXT: addq $600, %rsp # imm = 0x258 +; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: addq $664, %rsp # imm = 0x298 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride4_vf64: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index be34bbd254022..a2ad944c66f42 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -104,9 +104,9 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [16711935,16711935,16711935,16711935] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] @@ -130,9 +130,10 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: movaps %xmm6, %xmm7 +; SSE-NEXT: andps %xmm2, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 @@ -144,8 +145,8 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; SSE-NEXT: packuswb %xmm6, %xmm6 ; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,0] -; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] @@ -224,9 +225,10 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16711935,16711935,16711935,16711935] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] @@ -272,9 +274,10 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: pand %xmm11, %xmm12 ; SSE-NEXT: por %xmm9, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm12[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] +; SSE-NEXT: movdqa %xmm12, %xmm9 ; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,5,5,5,5] @@ -315,8 +318,8 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm4, %xmm11 ; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[3,1,2,0] -; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,0,3,4,5,6,7] @@ -467,81 +470,79 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i8_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa 32(%rdi), %xmm7 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm8 ; SSE-NEXT: movdqa 48(%rdi), %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: 
pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm7, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: por %xmm7, %xmm6 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] ; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm11 -; SSE-NEXT: pandn %xmm1, %xmm11 -; SSE-NEXT: pand %xmm4, %xmm10 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: pand %xmm4, %xmm11 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm5, %xmm14 ; SSE-NEXT: pand %xmm4, %xmm14 -; SSE-NEXT: movdqa 80(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm4, %xmm13 -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: pand %xmm4, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm8 ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm5, %xmm4 ; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm11, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,2,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,7,6,7] +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm12 -; SSE-NEXT: por %xmm12, %xmm10 -; SSE-NEXT: pshufd 
{{.*#+}} xmm9 = xmm10[3,1,2,0] -; SSE-NEXT: pand %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: por %xmm12, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pand %xmm7, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm9, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm9, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pxor %xmm9, %xmm9 ; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] @@ -567,14 +568,15 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535] @@ -582,14 +584,14 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: pand %xmm11, %xmm6 -; SSE-NEXT: por %xmm6, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm0, %xmm12 +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: por %xmm6, %xmm12 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] @@ -597,11 +599,11 @@ define void 
@load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[2,1,2,3,4,5,6,7] +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] @@ -611,118 +613,117 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: pandn %xmm11, %xmm2 ; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: pand %xmm11, %xmm5 -; SSE-NEXT: por %xmm5, %xmm8 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[3,1,2,1,4,5,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm6, %xmm5 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: por %xmm3, %xmm11 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pand %xmm10, %xmm3 +; SSE-NEXT: packuswb %xmm5, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: por %xmm3, %xmm10 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,0] -; 
SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm11 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm1, %xmm13 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,1,3] -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: por %xmm11, %xmm13 +; SSE-NEXT: pand %xmm13, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[2,3] -; SSE-NEXT: psrlq $48, %xmm1 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; SSE-NEXT: movdqa %xmm8, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[2,3] +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm5, %xmm7 ; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] @@ -737,28 +738,29 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm7, %xmm3 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,7,4] -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] +; SSE-NEXT: pandn %xmm5, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm12, (%rsi) -; SSE-NEXT: movdqa %xmm10, (%rdx) -; SSE-NEXT: movdqa %xmm8, (%rcx) -; SSE-NEXT: movdqa %xmm11, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: movdqa %xmm12, (%rdx) +; SSE-NEXT: movdqa %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm10, (%r8) ; SSE-NEXT: movdqa %xmm6, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm0, (%rax) @@ -1074,27 +1076,27 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i8_stride6_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $264, %rsp # imm = 0x108 -; SSE-NEXT: movdqa 64(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: subq $280, %rsp # imm = 0x118 +; SSE-NEXT: movdqa 64(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 
80(%rdi), %xmm9 -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa (%rdi), %xmm14 +; SSE-NEXT: movdqa 16(%rdi), %xmm2 ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: pand %xmm10, %xmm15 +; SSE-NEXT: pand %xmm7, %xmm15 ; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm15, %xmm0 @@ -1105,237 +1107,238 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: por %xmm1, %xmm11 ; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: pand %xmm11, %xmm8 -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3] ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,1,2,0] +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 
128(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: movdqa 160(%rdi), %xmm5 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: por %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm13, %xmm0 +; SSE-NEXT: movdqa 144(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa 160(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm9, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm4 -; SSE-NEXT: pandn %xmm14, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm9, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm12 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pand %xmm0, %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm14 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm4, %xmm14 -; SSE-NEXT: por %xmm14, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,1,2,0] -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa 
%xmm15, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3],xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] -; SSE-NEXT: psrld $16, %xmm3 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 +; SSE-NEXT: por %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm9, %xmm9 +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3],xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm3[2],xmm15[3],xmm3[3] -; SSE-NEXT: packuswb %xmm15, %xmm14 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: pandn %xmm14, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: packuswb %xmm15, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = 
[65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm8[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: pand %xmm6, %xmm14 -; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm5 +; SSE-NEXT: por %xmm0, %xmm5 +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm4, %xmm5 +; SSE-NEXT: por %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4] ; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm14 -; SSE-NEXT: por %xmm14, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE-NEXT: packuswb %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: por %xmm2, %xmm15 ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: pand %xmm6, %xmm15 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: por %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: por %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: 
pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] ; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 ; SSE-NEXT: packuswb %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pand %xmm3, %xmm15 +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm5, %xmm10 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm4, %xmm12 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] @@ -1343,10 +1346,11 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[2,1,2,3,4,5,6,7] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] -; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,5,5,5] @@ -1356,26 +1360,27 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,3,2,3,4,5,6,7] +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa (%rsp), %xmm15 # 16-byte Reload +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: 
pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm13 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] @@ -1383,11 +1388,12 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] @@ -1396,289 +1402,294 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: por %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm8, %xmm3 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = 
xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] +; SSE-NEXT: movdqa %xmm12, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[3,0] +; SSE-NEXT: movaps %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: packuswb %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[3,1,2,1,4,5,6,7] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = 
xmm11[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0] -; SSE-NEXT: movaps %xmm3, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm9[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm5 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm4[8],xmm13[9],xmm4[9],xmm13[10],xmm4[10],xmm13[11],xmm4[11],xmm13[12],xmm4[12],xmm13[13],xmm4[13],xmm13[14],xmm4[14],xmm13[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: por %xmm6, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: por %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,7,5,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm7 +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3],xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand 
%xmm3, %xmm5 +; SSE-NEXT: por %xmm7, %xmm5 +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: movdqa %xmm8, %xmm15 +; SSE-NEXT: pandn %xmm5, %xmm15 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: por %xmm6, %xmm15 +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] +; SSE-NEXT: movdqa %xmm13, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[3,0] +; SSE-NEXT: movaps %xmm5, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm13[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm13[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: por %xmm7, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm6 ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm13 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = 
xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm3, %xmm5 +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: packuswb %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,1,2,0] -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,1,3] -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm13, %xmm12 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm5 +; SSE-NEXT: packuswb %xmm0, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm5, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: por %xmm3, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pand %xmm15, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} 
xmm5 = xmm5[2,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: packuswb %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: packuswb %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,2,0] -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; SSE-NEXT: pand %xmm5, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm3 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: por %xmm3, %xmm12 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,1,3] -; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: por %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm3, %xmm5 +; SSE-NEXT: packuswb %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm5, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm7, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm5[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,3] -; SSE-NEXT: psrlq $48, %xmm5 -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,5,5,5,5] -; 
SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: pandn %xmm7, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: por %xmm10, %xmm9 -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm2, %xmm9 -; SSE-NEXT: por %xmm6, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm11[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pandn %xmm10, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm6, %xmm11 -; SSE-NEXT: packuswb %xmm11, %xmm10 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm10, %xmm6 -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; SSE-NEXT: movdqa %xmm9, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm6[2,3] +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: pandn %xmm9, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; 
SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: por %xmm11, %xmm10 +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: por %xmm7, %xmm10 +; SSE-NEXT: movdqa %xmm12, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm12 +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm7 +; SSE-NEXT: por %xmm7, %xmm12 +; SSE-NEXT: packuswb %xmm12, %xmm11 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pandn %xmm11, %xmm7 +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm10, %xmm7 +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] ; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[2,3] -; SSE-NEXT: psrlq $48, %xmm9 -; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm10, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm5, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm10[2,3] +; SSE-NEXT: psrlq $48, %xmm10 +; 
SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm11, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm6, %xmm12 +; SSE-NEXT: pandn %xmm11, %xmm6 +; SSE-NEXT: por %xmm12, %xmm6 +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm6 ; SSE-NEXT: pandn %xmm10, %xmm5 -; SSE-NEXT: por %xmm11, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: pandn %xmm9, %xmm2 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,7,4] -; SSE-NEXT: pandn %xmm4, %xmm7 -; SSE-NEXT: por %xmm2, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: packuswb %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm5 +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] +; SSE-NEXT: pandn %xmm5, %xmm9 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: packuswb %xmm9, %xmm4 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; 
SSE-NEXT: movaps %xmm4, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movaps %xmm4, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1687,15 +1698,15 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm13, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, (%r8) +; SSE-NEXT: movaps %xmm1, 16(%r8) +; SSE-NEXT: movdqa %xmm15, (%r8) ; SSE-NEXT: movdqa %xmm3, 16(%r9) ; SSE-NEXT: movdqa %xmm8, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm6, (%rax) -; SSE-NEXT: addq $264, %rsp # imm = 0x108 +; SSE-NEXT: movdqa %xmm7, (%rax) +; SSE-NEXT: addq $280, %rsp # imm = 0x118 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride6_vf32: @@ -2288,7 +2299,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i8_stride6_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $792, %rsp # imm = 0x318 +; SSE-NEXT: subq $808, %rsp # imm = 0x328 ; SSE-NEXT: movdqa 64(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm5 @@ -2300,22 +2311,22 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] @@ -2328,8 +2339,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] -; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm1[0,3,2,1,4,5,6,7] @@ -2339,13 +2351,15 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,0] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] @@ -2358,272 +2372,282 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 336(%rdi), %xmm12 +; SSE-NEXT: movdqa 336(%rdi), %xmm13 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pandn %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pandn %xmm13, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm12 -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa 304(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa 304(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa 288(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa 288(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm8, %xmm0 -; 
SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa 368(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 352(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,0] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa 352(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 240(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm11, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm11 -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 208(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 192(%rdi), %xmm3 +; SSE-NEXT: movdqa 208(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa 192(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa 272(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm14, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,1,2,0] -; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: movdqa 272(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm15, %xmm2 +; SSE-NEXT: movdqa 256(%rdi), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm6, %xmm12 +; SSE-NEXT: por %xmm2, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: pandn %xmm9, %xmm4 +; SSE-NEXT: movdqa 144(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pandn %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: pandn %xmm10, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm13, %xmm9 -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pandn %xmm13, 
%xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm4 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: movdqa 112(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa 160(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: movdqa 160(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm8 ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pandn %xmm13, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm14, %xmm5 +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: 
pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm15, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: 
movdqa (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: pand %xmm0, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; SSE-NEXT: por %xmm8, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,1,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: por %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 -; SSE-NEXT: pandn %xmm14, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,1,2,0] -; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: por %xmm11, %xmm5 +; SSE-NEXT: pandn %xmm6, %xmm0 +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: pand %xmm7, %xmm5 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pxor %xmm5, %xmm5 @@ -2634,11 +2658,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE-NEXT: packuswb %xmm14, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,1,0,3] +; 
SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE-NEXT: packuswb %xmm15, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm4 @@ -2647,48 +2671,48 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm4, %xmm14 +; SSE-NEXT: movdqa %xmm0, %xmm15 +; SSE-NEXT: pandn %xmm4, %xmm15 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm14, %xmm4 +; SSE-NEXT: por %xmm15, %xmm4 ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: pandn %xmm15, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm14 -; SSE-NEXT: por %xmm1, %xmm14 -; SSE-NEXT: packuswb %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm14 -; SSE-NEXT: pandn %xmm1, %xmm14 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,4] +; SSE-NEXT: pand %xmm2, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 +; SSE-NEXT: packuswb %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pandn %xmm1, %xmm15 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm4, %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = 
xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE-NEXT: packuswb %xmm12, %xmm4 -; SSE-NEXT: movdqa %xmm8, %xmm14 -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; SSE-NEXT: packuswb %xmm13, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm15 +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movdqa %xmm6, %xmm4 @@ -2696,46 +2720,46 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm12 -; SSE-NEXT: pandn %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm0, %xmm13 +; SSE-NEXT: pandn %xmm4, %xmm13 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm12, %xmm4 +; SSE-NEXT: por %xmm13, %xmm4 ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm2, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm2, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm12, %xmm1 +; SSE-NEXT: por %xmm13, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; 
SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,2,3,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE-NEXT: packuswb %xmm8, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE-NEXT: packuswb %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movdqa %xmm6, %xmm4 @@ -2743,41 +2767,41 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm4, %xmm9 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm8, %xmm4 +; SSE-NEXT: por %xmm9, %xmm4 ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: pand %xmm15, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm8, %xmm1 +; SSE-NEXT: por %xmm9, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm8 -; SSE-NEXT: pandn %xmm1, %xmm8 -; SSE-NEXT: pand %xmm10, %xmm4 -; SSE-NEXT: por %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: 
movdqa %xmm7, %xmm9 +; SSE-NEXT: pandn %xmm1, %xmm9 +; SSE-NEXT: pand %xmm7, %xmm4 +; SSE-NEXT: por %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[2,2,3,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: packuswb %xmm6, %xmm4 @@ -2794,34 +2818,32 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pand %xmm14, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,4] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; 
SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm11, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] @@ -2831,712 +2853,719 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pand %xmm14, %xmm13 -; SSE-NEXT: movdqa %xmm14, %xmm7 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm10 +; SSE-NEXT: 
por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte 
Reload +; SSE-NEXT: pand %xmm10, %xmm8 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: 
pand %xmm7, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; SSE-NEXT: pand %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm1, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: 
punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm0, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, 
%xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm1, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,5,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm1[8],xmm13[9],xmm1[9],xmm13[10],xmm1[10],xmm13[11],xmm1[11],xmm13[12],xmm1[12],xmm13[13],xmm1[13],xmm13[14],xmm1[14],xmm13[15],xmm1[15] -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] +; SSE-NEXT: movdqa %xmm12, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm12[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm12[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; 
SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] -; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm6 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm2, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] -; SSE-NEXT: pshufhw 
{{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; SSE-NEXT: movdqa %xmm8, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm8[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm6 -; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; 
SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm2, %xmm9 -; SSE-NEXT: por %xmm5, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm4 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm6 +; SSE-NEXT: por %xmm6, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] +; SSE-NEXT: movaps %xmm2, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm7[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = 
xmm11[8],xmm5[8],xmm11[9],xmm5[9],xmm11[10],xmm5[10],xmm11[11],xmm5[11],xmm11[12],xmm5[12],xmm11[13],xmm5[13],xmm11[14],xmm5[14],xmm11[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pand %xmm14, %xmm3 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm6, %xmm9 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, %xmm7 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,0] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm15, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm11, %xmm13 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,1,3] -; SSE-NEXT: pand %xmm10, %xmm1 -; SSE-NEXT: pshuflw 
{{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm6 -; SSE-NEXT: por %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] -; SSE-NEXT: packuswb %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0] -; SSE-NEXT: pand %xmm10, %xmm2 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm15, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] -; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm6, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = 
xmm6[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,0] -; SSE-NEXT: pand %xmm10, %xmm6 -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm8, %xmm8 -; SSE-NEXT: pand %xmm15, %xmm8 -; SSE-NEXT: por %xmm7, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,1,3] -; SSE-NEXT: pand %xmm10, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm7, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: por %xmm8, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,2] -; SSE-NEXT: packuswb %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm15, %xmm8 -; SSE-NEXT: pandn %xmm7, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[3,1,2,0] -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: pand %xmm15, %xmm9 -; SSE-NEXT: por %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pand %xmm11, %xmm7 +; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: por %xmm7, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,1,3] -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm7, %xmm8 -; SSE-NEXT: movdqa %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: pand %xmm1, %xmm9 -; SSE-NEXT: por %xmm9, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm7 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3] -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm15, %xmm10 -; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm12, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSE-NEXT: psrlq $48, %xmm2 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd 
{{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm11 -; SSE-NEXT: pandn %xmm8, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: por %xmm11, %xmm12 -; SSE-NEXT: packuswb %xmm12, %xmm12 -; SSE-NEXT: pand %xmm15, %xmm12 -; SSE-NEXT: por %xmm10, %xmm12 +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm2, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: pandn %xmm8, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por %xmm2, %xmm10 +; SSE-NEXT: packuswb %xmm10, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm8, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: movdqa %xmm12, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm6[2,3] +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = 
xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm8, %xmm6 ; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm6, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm13, %xmm6 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: movdqa %xmm11, %xmm13 -; SSE-NEXT: pandn %xmm10, %xmm13 +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm12 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] ; SSE-NEXT: pand %xmm11, %xmm8 -; SSE-NEXT: por %xmm8, %xmm13 -; SSE-NEXT: packuswb %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: pandn %xmm10, %xmm8 -; SSE-NEXT: pand %xmm1, %xmm12 -; SSE-NEXT: por %xmm12, %xmm8 -; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm10[2,3] -; SSE-NEXT: psrlq $48, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = 
xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm12, %xmm10 -; SSE-NEXT: movdqa %xmm15, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm4, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm10, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm10[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm13 -; SSE-NEXT: por %xmm14, %xmm13 -; SSE-NEXT: packuswb %xmm13, %xmm13 -; SSE-NEXT: pand %xmm15, %xmm13 -; SSE-NEXT: por %xmm12, %xmm13 -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa %xmm11, %xmm14 -; SSE-NEXT: pandn %xmm12, %xmm14 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm10 -; SSE-NEXT: por %xmm10, %xmm14 -; SSE-NEXT: packuswb %xmm14, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm12 -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: por %xmm13, %xmm12 +; SSE-NEXT: por %xmm8, %xmm12 +; SSE-NEXT: packuswb %xmm12, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: pandn %xmm8, %xmm12 +; SSE-NEXT: pand %xmm4, %xmm6 +; SSE-NEXT: por %xmm6, %xmm12 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm10[2,3] -; SSE-NEXT: psrlq $48, %xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm13, %xmm10 -; SSE-NEXT: movdqa %xmm15, %xmm13 -; SSE-NEXT: pandn %xmm10, %xmm13 -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm6[2,3] +; SSE-NEXT: psrlq $48, %xmm6 +; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm13, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: movdqa %xmm14, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm10, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm10[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: por %xmm9, %xmm14 -; SSE-NEXT: packuswb %xmm14, %xmm14 -; SSE-NEXT: pand %xmm15, %xmm14 -; SSE-NEXT: por %xmm13, %xmm14 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa %xmm11, %xmm13 -; SSE-NEXT: pandn %xmm10, %xmm13 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm9 -; SSE-NEXT: por %xmm9, %xmm13 -; SSE-NEXT: packuswb %xmm13, %xmm9 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: pandn %xmm9, %xmm13 -; SSE-NEXT: pand %xmm1, %xmm14 -; SSE-NEXT: por %xmm14, %xmm13 -; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = 
xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE-NEXT: movdqa %xmm3, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm9[2,3] -; SSE-NEXT: psrlq $48, %xmm9 -; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm10, %xmm9 -; SSE-NEXT: movdqa %xmm6, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] +; SSE-NEXT: pandn %xmm6, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: por %xmm9, %xmm10 +; SSE-NEXT: packuswb %xmm10, %xmm10 +; SSE-NEXT: pand %xmm13, %xmm10 +; SSE-NEXT: por %xmm8, %xmm10 +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa %xmm11, %xmm9 +; SSE-NEXT: pandn %xmm8, %xmm9 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm10, %xmm0 -; SSE-NEXT: por %xmm14, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: packuswb %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pandn %xmm8, %xmm6 +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: por %xmm10, %xmm6 +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0] +; SSE-NEXT: shufps 
{{.*#+}} xmm1 = xmm1[2,0],xmm8[2,3] +; SSE-NEXT: psrlq $48, %xmm8 +; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm9, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm14[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: por %xmm10, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm15 -; SSE-NEXT: por %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm8, %xmm13 +; SSE-NEXT: por %xmm13, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] ; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: por %xmm4, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm11, %xmm4 -; SSE-NEXT: pandn %xmm4, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3577,11 +3606,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm1, 16(%rax) -; SSE-NEXT: movdqa %xmm13, 32(%rax) +; SSE-NEXT: movdqa %xmm4, 16(%rax) +; SSE-NEXT: movdqa %xmm6, 32(%rax) ; SSE-NEXT: movdqa %xmm12, 48(%rax) -; SSE-NEXT: movdqa %xmm8, (%rax) -; SSE-NEXT: addq $792, %rsp # imm = 0x318 +; SSE-NEXT: movdqa %xmm2, (%rax) +; SSE-NEXT: addq $808, %rsp # imm = 0x328 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride6_vf64: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index 3dc5818d6c9b7..e6de73d602b73 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -1218,7 +1218,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> ; AVX512F-NEXT: vpermd (%rdx), %zmm4, %zmm5 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] ; AVX512F-NEXT: vpshufb %ymm6, %ymm3, %ymm3 @@ -2386,7 +2386,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> ; AVX512F-NEXT: vpermd 64(%rdx), %zmm18, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512F-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm10 +; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm10 ; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa %ymm6, %ymm2 ; AVX512F-NEXT: vpshufb %ymm6, %ymm0, %ymm0 @@ -2469,7 +2469,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-NEXT: vpermd (%rdx), %zmm18, %zmm1 -; AVX512F-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm1 +; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm22, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm6, 128(%rcx) ; AVX512F-NEXT: vmovdqa64 %zmm5, 192(%rcx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index 59866f2b8577b..9f479b0ee9937 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -714,7 +714,7 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 ; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = 
xmm2[1],xmm3[1] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -753,7 +753,7 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 ; AVX512F-FAST-NEXT: vpsrlq $48, %xmm1, %xmm1 ; AVX512F-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] @@ -1396,7 +1396,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] @@ -1474,7 +1474,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,1,1,1] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,1,2,2] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] @@ -2804,7 +2804,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm0 ; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm2 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm19 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm19 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm0 @@ -2862,7 +2862,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm2 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm10[0,1,1,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5] ; AVX512F-SLOW-NEXT: vprolq $16, %ymm3, %ymm8 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm8[0,1],ymm4[2],ymm8[3],ymm4[4],ymm8[5,6],ymm4[7],ymm8[8,9],ymm4[10],ymm8[11],ymm4[12],ymm8[13,14],ymm4[15] @@ -2889,7 +2889,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm4 ; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm5 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm22, %ymm0 ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm6, %ymm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm20[1,1,2,2] @@ -2995,7 +2995,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm3 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,1,1] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm10 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm10 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm10 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %ymm15 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm23, %ymm1 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm15, %ymm1 @@ -3049,7 +3049,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm1 ; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm3 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm1 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm5 ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 @@ -3072,7 +3072,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm2 ; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm3 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm2, 128(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 256(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, 64(%r9) @@ -5924,12 +5924,12 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm2 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm31, %zmm18, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm31, %zmm18, %zmm1 ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm25, %zmm19, %zmm27 ; AVX512F-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm2 ; AVX512F-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm25 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm27, %zmm18, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm27, %zmm18, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] ; 
AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload ; AVX512F-SLOW-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm31 # 64-byte Folded Reload @@ -5972,28 +5972,28 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastq 64(%r8), %ymm18 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm18, %zmm8 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm22, %zmm18, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm18, %zmm8 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm28, %zmm27, %zmm22 ; AVX512F-SLOW-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm21 = mem[0,1,0,1,4,5,4,5] ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm22, %zmm24, %zmm21 ; AVX512F-SLOW-NEXT: vpbroadcastq (%r8), %ymm22 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm22, %zmm10 -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm21, %zmm18, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm18, %zmm10 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm13, %zmm24, %zmm11 ; AVX512F-SLOW-NEXT: vpbroadcastq 112(%r8), %ymm12 ; AVX512F-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm13 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm11, %zmm16, %zmm12 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm11, %zmm16, %zmm12 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm24, %zmm4 ; AVX512F-SLOW-NEXT: vpbroadcastq 48(%r8), %ymm6 ; AVX512F-SLOW-NEXT: vpbroadcastq 56(%r8), %ymm7 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm4, %zmm16, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm16, %zmm6 ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm30, %zmm19, %zmm9 ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm20, %zmm19, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] @@ -6186,7 +6186,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 112(%r8), %ymm0 ; AVX512F-FAST-NEXT: vpbroadcastq 120(%r8), %ymm1 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm2, %zmm17, %zmm16 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 ; AVX512F-FAST-NEXT: vmovdqa 64(%rsi), %ymm8 ; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 64(%rdi), %ymm24 @@ -6231,7 +6231,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 48(%r8), %ymm6 ; AVX512F-FAST-NEXT: vpbroadcastq 56(%r8), %ymm10 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm7, %zmm17, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm7, %zmm17, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 
16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] @@ -6267,7 +6267,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 96(%r8), %ymm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm8, %zmm5, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm5, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm7 ; AVX512F-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] @@ -6282,7 +6282,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 24(%r8), %ymm4 ; AVX512F-FAST-NEXT: vpbroadcastq 32(%r8), %ymm7 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm8, %zmm5, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm8, %zmm5, %zmm4 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm3 # 64-byte Folded Reload @@ -6294,10 +6294,10 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastq 64(%r8), %ymm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm20, %zmm5, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm20, %zmm5, %zmm2 ; AVX512F-FAST-NEXT: vpbroadcastq (%r8), %ymm7 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm22, %zmm5, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm22, %zmm5, %zmm7 ; AVX512F-FAST-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm26 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpternlogq $226, %zmm25, %zmm1, %zmm12 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index b2c0e00825e63..7133214d9a80f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -3486,19 +3486,19 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; 
AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm17, %zmm10, %zmm9 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm17, %zmm10, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm11 -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm20, %zmm10, %zmm11 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm10, %zmm11 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm16, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm23, %zmm12, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm12, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm12, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm12, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm4, %zmm6, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm4, %zmm6, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm0, %zmm6, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm2, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 256(%rax) @@ -3695,23 +3695,23 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm5, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm0, %zmm3, %zmm2 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm2 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 256(%rax) ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm18[0,1,2,3],zmm10[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm0, %zmm3, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm22, %zmm0, %zmm14 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm22, %zmm0, %zmm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm19, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm19, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm17, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm17, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm16, %zmm0, %zmm1 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm16, %zmm0, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, 320(%rax) ; AVX512F-ONLY-FAST-NEXT: popq %rax ; AVX512F-ONLY-FAST-NEXT: vzeroupper @@ -3935,19 +3935,19 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm26, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm6 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm22, %zmm8 -; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm21, %zmm7, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm7, %zmm8 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm7 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm20, %zmm9, %zmm7 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm9, %zmm7 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm23, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm23, %zmm9, %zmm10 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm2, %zmm3, %zmm1 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm3, %zmm1 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm2 -; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm0, %zmm3, %zmm2 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm2, 256(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, (%rax) @@ -4154,22 +4154,22 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,0,2,1,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm5, %zmm9, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm4, %zmm5, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm6 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm3, %zmm5, %zmm7 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm3, %zmm5, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 
{{.*#+}} zmm3 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm1, %zmm3, %zmm0 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm18, %zmm3, %zmm2 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm18, %zmm3, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm20[0,1,2,3],zmm17[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm0, %zmm1, %zmm13 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, 256(%rax) ; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm19[0,1,2,3],zmm23[0,1,2,3] -; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm0, %zmm1, %zmm12 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm12 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm12, 64(%rax) ; AVX512DQ-FAST-NEXT: vzeroupper ; AVX512DQ-FAST-NEXT: retq @@ -7763,10 +7763,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm23 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm23 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm22, %zmm2, %zmm12 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 @@ -7862,27 +7862,27 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4] ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm20, %zmm4 -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 ; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 64-byte Folded Reload +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm5 # 64-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm24, %zmm16 -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm27, %zmm10, %zmm16 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm27, %zmm10, %zmm16 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm25, %zmm10, %zmm8 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm25, %zmm10, %zmm8 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm14, %zmm2 -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm22, %zmm10, %zmm2 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm10, %zmm2 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm18, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm28, %zmm13, %zmm10 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm28, %zmm13, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm30, %zmm13, %zmm7 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm30, %zmm13, %zmm7 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm21, %zmm13, %zmm3 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm21, %zmm13, %zmm3 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm1, %zmm1 -; AVX512F-ONLY-SLOW-NEXT: vpternlogd $184, %zmm0, %zmm13, %zmm1 +; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm13, %zmm1 ; AVX512F-ONLY-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, 192(%rax) @@ -8295,40 +8295,40 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm27, %zmm6 ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm18[0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm0, %zmm7, %zmm6 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm6 ; AVX512F-ONLY-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm20[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm0, %zmm7, %zmm9 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm9, 448(%rax) ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm28[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm0, %zmm7, %zmm8 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm8, 640(%rax) ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm17[0,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vpternlogd 
$184, %zmm0, %zmm7, %zmm3 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm7, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm16, %zmm0, %zmm12 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm16, %zmm0, %zmm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm12, (%rax) -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm26, %zmm0, %zmm4 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm26, %zmm0, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm4, 192(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, %zmm25, %zmm1, %zmm10 +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm25, %zmm1, %zmm10 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 384(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm2, 576(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 512(%rax) ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512F-ONLY-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload +; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, 704(%rax) ; AVX512F-ONLY-FAST-NEXT: addq $1256, %rsp # imm = 0x4E8 ; AVX512F-ONLY-FAST-NEXT: vzeroupper @@ -8731,10 +8731,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogd $184, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm2 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload -; AVX512DQ-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm3 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm3 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8807,27 +8807,27 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512DQ-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm4 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm4 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm25 -; AVX512DQ-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm25 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm25 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm18, %zmm23, %zmm6 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm6 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm6 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm21, %zmm17, %zmm16 -; AVX512DQ-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm16 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm16 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm24, %zmm14 -; AVX512DQ-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm14 # 64-byte Folded Reload +; AVX512DQ-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm14 # 64-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm5 -; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm29, %zmm18, %zmm5 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm29, %zmm18, %zmm5 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm19, %zmm9, %zmm8 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm19, %zmm9, %zmm8 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0 -; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm20, %zmm9, %zmm0 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm20, %zmm9, %zmm0 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 -; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm22, %zmm9, %zmm7 +; 
AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm22, %zmm9, %zmm7 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm10 -; AVX512DQ-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm9, %zmm10 +; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm10 ; AVX512DQ-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, 256(%rax) ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm7, 448(%rax) @@ -9253,43 +9253,43 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[0,0,2,1,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm11, %zmm18, %zmm13 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm9, %zmm11, %zmm13 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm9, %zmm11, %zmm13 ; AVX512DQ-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm5, %zmm11, %zmm10 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm5, %zmm11, %zmm10 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm10, 192(%rax) ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm0, %zmm5, %zmm8 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm5, %zmm8 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm29, %zmm5, %zmm3 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm29, %zmm5, %zmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, 320(%rax) ; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm20[0,1,2,3],zmm25[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm0, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) ; AVX512DQ-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: # zmm0 = zmm17[0,1,2,3],mem[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm0, %zmm3, %zmm6 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm6, 448(%rax) -; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm12, %zmm11, %zmm4 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm12, %zmm11, %zmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm4, 384(%rax) -; AVX512DQ-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, 576(%rax) -; AVX512DQ-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm2, 512(%rax) -; AVX512DQ-FAST-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload +; 
AVX512DQ-FAST-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 704(%rax) ; AVX512DQ-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: # zmm0 = zmm26[0,1,2,3],mem[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm0, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 640(%rax) ; AVX512DQ-FAST-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FAST-NEXT: # zmm0 = zmm22[0,1,2,3],mem[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FAST-NEXT: vpternlogd $184, %zmm0, %zmm3, %zmm1 +; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512DQ-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 ; AVX512DQ-FAST-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index 3de19185ea053..5934b80893ce3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -458,7 +458,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[6,7],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512F-SLOW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 ; AVX512F-SLOW-NEXT: vmovq %xmm0, 48(%rax) @@ -1059,7 +1059,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpbroadcastd 4(%r10), %ymm12 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 ; AVX512F-SLOW-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm11 ; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1141,7 +1141,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpbroadcastd 4(%r10), %ymm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512F-FAST-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm4 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm4 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512F-FAST-NEXT: vmovdqa %ymm1, 64(%rax) @@ -2427,22 +2427,22 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; 
AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4 ; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm5 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm5 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm4 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm7 = zmm20[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm7 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm3 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm1[0,0,1,1,4,4,5,5] -; AVX512F-SLOW-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm0 ; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm1 ; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm4 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm6 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,2] @@ -2556,20 +2556,20 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2 ; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm10 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm19[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm6 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5] -; AVX512F-FAST-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm0 ; AVX512F-FAST-NEXT: vpbroadcastd (%rax), %ymm2 ; AVX512F-FAST-NEXT: vpbroadcastd 4(%rax), %ymm3 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, 
%zmm2 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm4 ; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm1 @@ -5262,12 +5262,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm0 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm9 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm9 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm3 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm15[2,1,3,2] ; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm9 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm7 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm8, %zmm17, %zmm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm13, %zmm18, %zmm8 @@ -5276,7 +5276,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm9 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm1 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm19, %zmm8, %zmm8 @@ -5295,7 +5295,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm10 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm9 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm9 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm14, %zmm2 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 @@ -5303,7 +5303,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] ; AVX512F-SLOW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpermd (%rax), %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm2 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm2 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 128(%rax) @@ -5541,11 +5541,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [30,5,0,0,31,6,0,31,30,5,0,0,31,6,0,31] ; 
AVX512F-FAST-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermi2d %zmm3, %zmm8, %zmm7 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm7 ; AVX512F-FAST-NEXT: vpbroadcastd 36(%rax), %ymm6 ; AVX512F-FAST-NEXT: vpbroadcastd 40(%rax), %ymm8 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] ; AVX512F-FAST-NEXT: vpternlogq $226, %zmm26, %zmm8, %zmm5 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 @@ -5558,7 +5558,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm19[0,0,1,3] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm18[2,2,2,3] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm17[2,1,3,2] -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm5 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm27, %zmm5 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm5 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] @@ -5574,7 +5574,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 ; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm21, %zmm3 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm28, %zmm3 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm14, %zmm3 ; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm4 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm9 # 32-byte Folded Reload @@ -5591,7 +5591,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm16 ; AVX512F-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm22, %zmm0 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm0 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%rax) @@ -10946,7 +10946,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] ; AVX512F-SLOW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpermd 64(%rax), %zmm11, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm17, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm17, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[3,3,3,3,7,7,7,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] @@ -10973,7 +10973,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastd 96(%rax), %ymm5 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm2, %zmm5, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm5, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm5, %zmm15 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm2 @@ -11007,7 +11007,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm2 ; AVX512F-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm3 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm16, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm16, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 64(%rcx), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdx), %xmm2 @@ -11049,7 +11049,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm5 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm9, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %xmm5 @@ -11085,7 +11085,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastd (%rax), %ymm2 ; AVX512F-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm3 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm9, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm9, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm23, %ymm7 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] @@ -11120,7 +11120,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm21 ; AVX512F-SLOW-NEXT: vpermd (%rax), %zmm11, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm2, %zmm17, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] @@ -11145,7 +11145,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] ; AVX512F-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm12 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm25 -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm1, %zmm15, %zmm25 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm1, %zmm15, %zmm25 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm12 ; AVX512F-SLOW-NEXT: vprold $16, %xmm12, %xmm15 @@ -11178,7 +11178,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm0 ; AVX512F-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm6 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm28 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm28 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm28 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm29, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm2 @@ -11284,9 +11284,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm0, %zmm1, %zmm23 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm23 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogd $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload +; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload ; AVX512F-SLOW-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload @@ -11643,7 +11643,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm19, %zmm2 -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm3, %zmm22, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm22, %zmm2 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm13, %ymm0 @@ -11679,7 +11679,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] ; AVX512F-FAST-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm3, %zmm25, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm25, %zmm0 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm0 @@ -11776,7 +11776,7 @@ define void 
@store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastd 68(%rax), %ymm6 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm23 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm2, %zmm4, %zmm23 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm23 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm23 ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -11790,7 +11790,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512F-FAST-NEXT: vpbroadcastd 4(%rax), %ymm6 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm20 -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm1, %zmm4, %zmm20 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm1, %zmm4, %zmm20 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm9, %zmm2, %zmm20 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] @@ -11835,7 +11835,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] ; AVX512F-FAST-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm5, %zmm19 -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm2, %zmm25, %zmm19 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm25, %zmm19 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm19 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 @@ -11851,7 +11851,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastd 96(%rax), %ymm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm21 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm13, %zmm30, %zmm21 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm13, %zmm30, %zmm21 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm21 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm24, %xmm2 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm26, %xmm1 @@ -11904,7 +11904,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastd 104(%rax), %ymm6 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm3, %zmm22, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm22, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm18, %ymm3 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[3,3,3,3,7,7,7,7] @@ -11918,7 +11918,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpermd %ymm14, %ymm31, %ymm9 ; AVX512F-FAST-NEXT: vpbroadcastd 32(%rax), %ymm18 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm9, %zmm9 -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm10, %zmm30, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm10, %zmm30, %zmm9 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm9 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm2 @@ -11944,7 +11944,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpbroadcastd 36(%rax), %ymm3 ; AVX512F-FAST-NEXT: vpbroadcastd 40(%rax), %ymm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm4, %zmm22, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm4, %zmm22, %zmm3 ; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm1, %zmm3 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] ; AVX512F-FAST-NEXT: vpternlogq $184, %ymm7, %ymm0, %ymm29 @@ -11955,7 +11955,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm1 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm29[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm12 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm12 ; AVX512F-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-FAST-NEXT: vpunpckhwd (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 33f4e4b0f6d0e..f4fda97c0817a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -927,7 +927,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vporq %zmm7, %zmm5, %zmm5 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = ; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 
@@ -966,7 +966,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm7, %zmm6 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] @@ -1886,7 +1886,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = ; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm8, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,13],zero,ymm3[u,u,u,14],zero,ymm3[u,u,u,15],zero,ymm3[u,u,u,16],zero,ymm3[u,u,u,17],zero,ymm3[u,u,u,18],zero,ymm3[u,u] ; AVX512F-SLOW-NEXT: vpor %ymm5, %ymm8, %ymm5 @@ -1968,7 +1968,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = ; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm7, %zmm7 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u] ; AVX512F-FAST-NEXT: vpor %ymm5, %ymm8, %ymm5 @@ -4041,15 +4041,15 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm3 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 ; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm9, %zmm3 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm30 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm30 ; AVX512F-SLOW-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm25 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm27[0,0,1,1,4,4,5,5] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm8[0,0,1,1,4,4,5,5] ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = ; 
AVX512F-SLOW-NEXT: vpermd %zmm31, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 64(%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, 128(%r9) @@ -4202,7 +4202,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; AVX512F-FAST-NEXT: vpternlogq $226, %zmm22, %zmm21, %zmm3 ; AVX512F-FAST-NEXT: vpermt2d %zmm14, %zmm30, %zmm8 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm8 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm8 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512F-FAST-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm3 = mem[0,0,1,1] @@ -4226,13 +4226,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm27 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9> ; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm3, %zmm3 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm3 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm18[0,0,1,1,4,4,5,5] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm11[0,0,1,1,4,4,5,5] ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = ; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 64(%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 128(%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index d16daa04a4822..3bc7b6e958223 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -1750,7 +1750,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm13 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-SLOW-NEXT: vpternlogd $226, %zmm6, %zmm14, %zmm13 +; AVX512F-SLOW-NEXT: vpternlogq $226, %zmm6, %zmm14, %zmm13 ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm11 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm12 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] @@ -1761,7 +1761,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: 
vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm6 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm15 ; AVX512F-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm13 @@ -1779,7 +1779,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm7[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm8 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm12, %xmm10 @@ -1789,7 +1789,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm8, %zmm14, %zmm9 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm8, %zmm14, %zmm9 ; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm10 ; AVX512F-SLOW-NEXT: vpshufb %ymm8, %ymm2, %ymm8 @@ -1815,7 +1815,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm6, 64(%rax) @@ -1858,7 +1858,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %xmm9 ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm9, %xmm8 ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %xmm11 @@ -1878,7 +1878,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vprold $16, %xmm14, %xmm14 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm14 = zmm13[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%zmm12, %zmm14 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm14 ; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm12 ; AVX512F-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm15 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm13 @@ -1889,7 +1889,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm15, %zmm10 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,0,1,4,4,4,5] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm14, %zmm15, %zmm10 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm14, %zmm15, %zmm10 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512F-FAST-NEXT: vprold $16, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] @@ -1901,13 +1901,13 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] ; AVX512F-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm3 -; AVX512F-FAST-NEXT: vpternlogd $226, %zmm5, %zmm15, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $226, %zmm5, %zmm15, %zmm3 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,16,17,22,23,24,25,24,25,24,25,24,25] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) @@ -4306,7 +4306,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm28[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm13, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; 
AVX512F-SLOW-NEXT: vpternlogd $184, %zmm5, %zmm8, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm8, %zmm1 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm29[0,0,0,1] ; AVX512F-SLOW-NEXT: vprold $16, %ymm30, %ymm9 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm21[0,0,0,1] @@ -4336,7 +4336,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm15[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm5 -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm2, %zmm8, %zmm5 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm8, %zmm5 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm27[0,0,0,1,4,4,4,5] ; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm8 = mem[0,0,0,1,4,4,4,5] @@ -4350,7 +4350,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm8 = mem[0,0,0,1,4,4,4,5] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm2, %zmm10, %zmm8 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm10, %zmm8 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm26[0,0,0,1,4,4,4,5] ; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm11 = mem[0,0,0,1,4,4,4,5] @@ -4362,7 +4362,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm31[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm11 = mem[0,0,0,1,4,4,4,5] -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm2, %zmm10, %zmm11 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm2, %zmm10, %zmm11 ; AVX512F-SLOW-NEXT: vpermq $234, (%rsp), %zmm2 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm2 = mem[2,2,2,3,6,6,6,7] ; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Folded Reload @@ -4382,11 +4382,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm2 = mem[2,2,2,3,6,6,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm0, %zmm6, %zmm2 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm2 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm3 = zmm23[2,2,2,3,6,6,6,7] -; AVX512F-SLOW-NEXT: vpternlogd $184, %zmm0, %zmm6, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $184, %zmm0, %zmm6, %zmm3 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm3, 128(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm2, 320(%rax) @@ -4605,13 +4605,13 @@ 
define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm13[0,1,2,3],zmm7[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm7, %zmm8, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm6 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 256(%rax) ; AVX512F-FAST-NEXT: vpternlogq $184, %ymm11, %ymm16, %ymm12 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm6 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm6[4,5,6,7] -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm2, %zmm8, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm8, %zmm3 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm2 = mem[2,2,2,3,6,6,6,7] @@ -4634,13 +4634,13 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm7 = mem[2,2,2,3,6,6,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm3, %zmm8, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm8, %zmm7 ; AVX512F-FAST-NEXT: vpternlogq $184, %ymm6, %ymm2, %ymm31 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm3 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm31[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-FAST-NEXT: vpermq $234, (%rsp), %zmm6 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm6 = mem[2,2,2,3,6,6,6,7] -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm3, %zmm8, %zmm6 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm3, %zmm8, %zmm6 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm23[0,0,0,1,4,4,4,5] ; AVX512F-FAST-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm8 = mem[0,0,0,1,4,4,4,5] @@ -4658,11 +4658,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm5[0,1,2,3],zmm2[4,5,6,7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm3 = zmm26[0,0,0,1,4,4,4,5] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm2, %zmm4, %zmm3 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm3 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm28[0,0,0,1,4,4,4,5] -; AVX512F-FAST-NEXT: vpternlogd $184, %zmm0, %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $184, %zmm0, %zmm4, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-FAST-NEXT: vmovdqa64 
%zmm3, 192(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, 128(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index a4f21ea2a9154..54ed0f184827a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -659,7 +659,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,12],zero,ymm1[u,u,u,u,5,13],zero,ymm1[u,u,u,u,6,14,22,u,u,u,u],zero,zero,ymm1[23,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512F-SLOW-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512F-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; AVX512F-SLOW-NEXT: vmovq %xmm1, 48(%rax) @@ -1560,7 +1560,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8],zero,ymm0[u,u,u,u,1,9],zero,ymm0[u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm5, 96(%rax) ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512F-SLOW-NEXT: vmovdqa %ymm4, 64(%rax) @@ -1631,7 +1631,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4],zero,ymm1[u,u,u,u,1,5],zero,ymm1[u,u,u,u,2,6],zero,ymm1[u,u,u,u,19,23],zero,ymm1[u,u,u,u,24,28],zero,ymm1[u] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm1 ; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 ; AVX512F-FAST-NEXT: vmovdqa %xmm3, 96(%rax) ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512F-FAST-NEXT: vmovdqa %ymm2, 64(%rax) @@ -3342,7 +3342,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] ; AVX512F-SLOW-NEXT: vporq %zmm7, %zmm8, %zmm7 -; AVX512F-SLOW-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm7 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,23,u,u,u,u,26,u,24,u,u,u,u,27,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX512F-SLOW-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [18374967954648269055,71777218572844800,18374967954648269055,71777218572844800] @@ -3371,7 +3371,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] ; AVX512F-SLOW-NEXT: vporq %zmm10, %zmm8, %zmm8 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 +; 
AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm8 ; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm8 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm11 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u,u],zero @@ -3391,7 +3391,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm13, %zmm14, %zmm13 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm18 = zmm13[0,1,0,1,4,5,4,5] -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm18 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm18 ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm13 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm14 @@ -3407,7 +3407,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm0[0,0,1,0,4,4,5,4] -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm10 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm19, %zmm10 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm10 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm1[14,u,u],zero,zero,zero,zero,ymm1[15,u,u],zero,zero,zero,zero,ymm1[16,u,u],zero,zero,zero,zero,ymm1[17,u,u],zero,zero,zero,zero,ymm1[18] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 @@ -3426,7 +3426,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm1 -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[u,u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u] ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm7, %ymm0 @@ -3502,7 +3502,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm10, %zmm13, %zmm10 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm15 = zmm10[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm15 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm15 ; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm10 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[1,1,0,0,4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = @@ -3519,7 +3519,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 ; 
AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5] -; AVX512F-FAST-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7 +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[0,1,14],zero,ymm2[u,u,0,1,14,15],zero,ymm2[u,u,13,2,3,16],zero,ymm2[u,u,28,29,16,17],zero,ymm2[u,u,19,28,29,18],zero @@ -3535,7 +3535,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm9 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm9 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm1[u,u,u,u,u,14],zero,ymm1[u,u,u,u,u,15],zero,ymm1[u,u,u,u,u,16],zero,ymm1[u,u,u,u,u,17],zero,ymm1[u,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[13,u,u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u] ; AVX512F-FAST-NEXT: vpor %ymm0, %ymm8, %ymm0 @@ -3576,7 +3576,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,3,2,3,6,7,6,7] ; AVX512F-FAST-NEXT: vporq %zmm10, %zmm11, %zmm10 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm10 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[20],zero,ymm3[18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 @@ -3593,7 +3593,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm11 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm11 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm11 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] @@ -7623,19 +7623,19 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,0,1,0] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm3 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogd $216, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm3 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm3 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm3 ; AVX512F-SLOW-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload ; AVX512F-SLOW-NEXT: # zmm4 = mem[2,3,2,3,6,7,6,7] -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm4 ; AVX512F-SLOW-NEXT: vporq %ymm15, %ymm18, %ymm5 ; AVX512F-SLOW-NEXT: vporq %ymm19, %ymm20, %ymm6 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm16 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm16 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm16 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512F-SLOW-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm26 +; AVX512F-SLOW-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm26 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm17, %zmm26 ; AVX512F-SLOW-NEXT: vporq %ymm21, %ymm22, %ymm1 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 @@ -7648,7 +7648,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm12[0,1,0,1,4,5,4,5] ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm7[0,0,1,0,4,4,5,4] -; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm0 +; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm0 ; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rax) @@ -8039,7 +8039,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX512F-FAST-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: # zmm22 = mem[2,3,2,3,6,7,6,7] -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm22 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm22 ; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermq $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload @@ -8061,11 +8061,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm24 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm23, %zmm24 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm19, %zmm2, %zmm0 -; AVX512F-FAST-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm21 +; AVX512F-FAST-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm21 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm21 ; AVX512F-FAST-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 
32-byte Folded Reload ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm0 ; AVX512F-FAST-NEXT: vpor %ymm12, %ymm15, %ymm2 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2 @@ -8078,14 +8078,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm5 = zmm31[0,1,0,1,4,5,4,5] ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm17, %zmm2 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm2 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm18, %zmm2 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm2 ; AVX512F-FAST-NEXT: vpor %ymm1, %ymm13, %ymm1 ; AVX512F-FAST-NEXT: vpor %ymm11, %ymm14, %ymm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm5[0,1,2,3],zmm1[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm20, %zmm4 -; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 +; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm4 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm9, %zmm4 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vmovdqa64 %zmm4, 128(%rax) diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index 43c9be2dc6f97..33f7a4e42b7f7 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -1573,7 +1573,7 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind { ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512NOVLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512NOVLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512NOVLX-NEXT: vzeroupper ; AVX512NOVLX-NEXT: retq @@ -1582,7 +1582,7 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind { ; AVX512VLX: # %bb.0: ; AVX512VLX-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512VLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512VLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 ; AVX512VLX-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_v16i8: @@ -1811,7 +1811,7 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind { ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512NOVLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512NOVLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512NOVLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX512NOVLX-NEXT: vzeroupper ; AVX512NOVLX-NEXT: retq @@ -1820,7 +1820,7 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind { ; AVX512VLX: # %bb.0: ; AVX512VLX-NEXT: vpsllw $4, %xmm0, %xmm1 ; AVX512VLX-NEXT: vpsrlw $4, 
%xmm0, %xmm0 -; AVX512VLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512VLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 ; AVX512VLX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512VLX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index c55335f849569..7047d5e3131a7 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -392,12 +392,12 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm3 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2 @@ -412,12 +412,12 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 @@ -1404,7 +1404,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind { ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512NOVLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512NOVLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512NOVLX-NEXT: retq ; @@ -1412,7 +1412,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind { ; AVX512VLX: # %bb.0: ; AVX512VLX-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512VLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 ; AVX512VLX-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_v32i8: @@ -1667,7 +1667,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind { ; AVX512NOVLX: # %bb.0: ; AVX512NOVLX-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512NOVLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512NOVLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; 
AVX512NOVLX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512NOVLX-NEXT: retq ; @@ -1675,7 +1675,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind { ; AVX512VLX: # %bb.0: ; AVX512VLX-NEXT: vpsllw $4, %ymm0, %ymm1 ; AVX512VLX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; AVX512VLX-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0 ; AVX512VLX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; AVX512VLX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll index 29afbf4c62ef5..6504c3e6353e9 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -135,15 +135,15 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm3 ; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm4 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160] -; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm5 = [17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520] +; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm4 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $6, %ymm2, %ymm4 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm6 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268] -; AVX512F-NEXT: vpternlogd $226, %zmm4, %zmm7, %zmm6 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm7 = [18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596] +; AVX512F-NEXT: vpternlogq $226, %zmm4, %zmm7, %zmm6 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4 @@ -155,12 +155,12 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512F-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm5, %zmm4 +; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm5, %zmm4 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 ; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512F-NEXT: vpternlogd $226, %zmm3, %zmm7, %zmm4 +; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm7, %zmm4 ; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 @@ -177,15 +177,15 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm4 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm5 = 
[4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160] -; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [17361641481138401520,17361641481138401520,17361641481138401520,17361641481138401520] +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm5, %ymm4 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $6, %ymm2, %ymm4 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm6 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268] -; AVX512VL-NEXT: vpternlogd $226, %ymm4, %ymm7, %ymm6 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm7 = [18229723555195321596,18229723555195321596,18229723555195321596,18229723555195321596] +; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm7, %ymm6 ; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm4 @@ -196,12 +196,12 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VL-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm5, %ymm4 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm5, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 -; AVX512VL-NEXT: vpternlogd $226, %ymm3, %ymm7, %ymm4 +; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm7, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 @@ -754,7 +754,7 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_rotate_v64i8: @@ -766,35 +766,35 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind { ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_rotate_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VBMI2-LABEL: splatconstant_rotate_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, 
%zmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatconstant_rotate_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VLVBMI2-NEXT: retq %shl = shl <64 x i8> %a, %lshr = lshr <64 x i8> %a, @@ -902,7 +902,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; @@ -915,7 +915,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -923,7 +923,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -931,7 +931,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VLBW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VLBW-NEXT: retq ; @@ -939,7 +939,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: retq ; @@ -947,7 +947,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm1 ; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm0 ; AVX512VLVBMI2-NEXT: 
vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: retq %shl = shl <64 x i8> %a, diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll index e6b5998d965f0..938fba0490b55 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -1160,9 +1160,9 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; ; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll index 6d19a81d9fd78..5a70e5d4a2b56 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -1265,9 +1265,9 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; ; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vpshab %xmm1, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll index 7a2dcd1c8ca8c..e248aafab5255 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -941,9 +941,9 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; ; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll index 5fe661c7e7778..77f5f2660af7e 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -1037,9 +1037,9 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; ; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index 12f971fb83b56..deb1514e42c4a 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -848,9 +848,9 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; ; 
XOPAVX1-LABEL: splatvar_modulo_shift_v16i8: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll index 76944994c87d1..c355eeaa42b66 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -962,9 +962,9 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; ; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8: ; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll index dd1022151214e..7159edc2bbdf4 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll @@ -230,7 +230,7 @@ define <64 x i8> @f2(ptr %p0) { ; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: f2: @@ -517,7 +517,7 @@ define <64 x i8> @f4(ptr %p0) { ; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 +; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: f4: diff --git a/llvm/test/Transforms/InstCombine/ctlz-cttz-shifts.ll b/llvm/test/Transforms/InstCombine/ctlz-cttz-shifts.ll deleted file mode 100644 index 86fef51872b19..0000000000000 --- a/llvm/test/Transforms/InstCombine/ctlz-cttz-shifts.ll +++ /dev/null @@ -1,234 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -passes=instcombine -S < %s | FileCheck %s - -declare i32 @llvm.ctlz.i32(i32, i1) -declare i32 @llvm.cttz.i32(i32, i1) -declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) -declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) - -define i32 @lshr_ctlz_true(i32) { -; CHECK-LABEL: define i32 @lshr_ctlz_true( -; CHECK-SAME: i32 [[TMP0:%.*]]) { -; CHECK-NEXT: [[CTLZ:%.*]] = add i32 [[TMP0]], 9 -; CHECK-NEXT: ret i32 [[CTLZ]] -; - %lshr = lshr i32 8387584, %0 - %ctlz = call i32 @llvm.ctlz.i32(i32 %lshr, i1 true) - ret i32 %ctlz -} - -define i32 @shl_nuw_ctlz_true(i32) { -; CHECK-LABEL: define i32 @shl_nuw_ctlz_true( -; CHECK-SAME: i32 [[TMP0:%.*]]) { -; CHECK-NEXT: [[CTLZ:%.*]] = sub i32 9, [[TMP0]] -; CHECK-NEXT: ret i32 [[CTLZ]] -; - %shl = shl nuw i32 8387584, %0 - %ctlz = call i32 @llvm.ctlz.i32(i32 %shl, i1 true) - ret i32 %ctlz -} - -define i32 @shl_nuw_nsw_ctlz_true(i32) { -; CHECK-LABEL: define i32 @shl_nuw_nsw_ctlz_true( -; CHECK-SAME: i32 
[[TMP0:%.*]]) { -; CHECK-NEXT: [[CTLZ:%.*]] = sub i32 9, [[TMP0]] -; CHECK-NEXT: ret i32 [[CTLZ]] -; - %shl = shl nuw nsw i32 8387584, %0 - %ctlz = call i32 @llvm.ctlz.i32(i32 %shl, i1 true) - ret i32 %ctlz -} - -define i32 @lshr_exact_cttz_true(i32) { -; CHECK-LABEL: define i32 @lshr_exact_cttz_true( -; CHECK-SAME: i32 [[TMP0:%.*]]) { -; CHECK-NEXT: [[CTTZ:%.*]] = sub i32 10, [[TMP0]] -; CHECK-NEXT: ret i32 [[CTTZ]] -; - %lshr = lshr exact i32 8387584, %0 - %cttz = call i32 @llvm.cttz.i32(i32 %lshr, i1 true) - ret i32 %cttz -} - -define i32 @shl_cttz_true(i32) { -; CHECK-LABEL: define i32 @shl_cttz_true( -; CHECK-SAME: i32 [[TMP0:%.*]]) { -; CHECK-NEXT: [[CTTZ:%.*]] = add i32 [[TMP0]], 10 -; CHECK-NEXT: ret i32 [[CTTZ]] -; - %shl = shl i32 8387584, %0 - %cttz = call i32 @llvm.cttz.i32(i32 %shl, i1 true) - ret i32 %cttz -} - -define <2 x i32> @vec2_lshr_ctlz_true(<2 x i32>) { -; CHECK-LABEL: define <2 x i32> @vec2_lshr_ctlz_true( -; CHECK-SAME: <2 x i32> [[TMP0:%.*]]) { -; CHECK-NEXT: [[CTLZ:%.*]] = add <2 x i32> [[TMP0]], -; CHECK-NEXT: ret <2 x i32> [[CTLZ]] -; - %div = lshr <2 x i32> , %0 - %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %div, i1 true) - ret <2 x i32> %ctlz -} - -define <2 x i32> @vec2_shl_nuw_ctlz_true(<2 x i32>) { -; CHECK-LABEL: define <2 x i32> @vec2_shl_nuw_ctlz_true( -; CHECK-SAME: <2 x i32> [[TMP0:%.*]]) { -; CHECK-NEXT: [[CTLZ:%.*]] = sub <2 x i32> , [[TMP0]] -; CHECK-NEXT: ret <2 x i32> [[CTLZ]] -; - %shl = shl nuw <2 x i32> , %0 - %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %shl, i1 true) - ret <2 x i32> %ctlz -} - -define <2 x i32> @vec2_shl_nuw_nsw_ctlz_true(<2 x i32>) { -; CHECK-LABEL: define <2 x i32> @vec2_shl_nuw_nsw_ctlz_true( -; CHECK-SAME: <2 x i32> [[TMP0:%.*]]) { -; CHECK-NEXT: [[CTLZ:%.*]] = sub <2 x i32> , [[TMP0]] -; CHECK-NEXT: ret <2 x i32> [[CTLZ]] -; - %shl = shl nuw nsw <2 x i32> , %0 - %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %shl, i1 true) - ret <2 x i32> %ctlz -} - -define <2 x i32> @vec2_lshr_exact_cttz_true(<2 x i32>) { -; CHECK-LABEL: define <2 x i32> @vec2_lshr_exact_cttz_true( -; CHECK-SAME: <2 x i32> [[TMP0:%.*]]) { -; CHECK-NEXT: [[CTTZ:%.*]] = sub <2 x i32> , [[TMP0]] -; CHECK-NEXT: ret <2 x i32> [[CTTZ]] -; - %lshr = lshr exact <2 x i32> , %0 - %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %lshr, i1 true) - ret <2 x i32> %cttz -} - -define <2 x i32> @vec2_shl_cttz_true(<2 x i32>) { -; CHECK-LABEL: define <2 x i32> @vec2_shl_cttz_true( -; CHECK-SAME: <2 x i32> [[TMP0:%.*]]) { -; CHECK-NEXT: [[CTTZ:%.*]] = add <2 x i32> [[TMP0]], -; CHECK-NEXT: ret <2 x i32> [[CTTZ]] -; - %shl = shl <2 x i32> , %0 - %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %shl, i1 true) - ret <2 x i32> %cttz -} - -; negative tests: - -define <2 x i32> @vec2_shl_nsw_ctlz_true_neg(<2 x i32>) { -; CHECK-LABEL: define <2 x i32> @vec2_shl_nsw_ctlz_true_neg( -; CHECK-SAME: <2 x i32> [[TMP0:%.*]]) { -; CHECK-NEXT: [[SHL:%.*]] = shl nsw <2 x i32> , [[TMP0]] -; CHECK-NEXT: [[CTLZ:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[SHL]], i1 true), !range [[RNG0:![0-9]+]] -; CHECK-NEXT: ret <2 x i32> [[CTLZ]] -; - %shl = shl nsw <2 x i32> , %0 - %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %shl, i1 true) - ret <2 x i32> %ctlz -} - -define <2 x i32> @vec2_lshr_ctlz_false_neg(<2 x i32>) { -; CHECK-LABEL: define <2 x i32> @vec2_lshr_ctlz_false_neg( -; CHECK-SAME: <2 x i32> [[TMP0:%.*]]) { -; CHECK-NEXT: [[DIV:%.*]] = lshr <2 x i32> , [[TMP0]] -; CHECK-NEXT: [[CTLZ:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[DIV]], i1 false), !range 
[[RNG1:![0-9]+]] -; CHECK-NEXT: ret <2 x i32> [[CTLZ]] -; - %div = lshr <2 x i32> , %0 - %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %div, i1 false) - ret <2 x i32> %ctlz -} - -define <2 x i32> @vec2_shl_ctlz_false_neg(<2 x i32>) { -; CHECK-LABEL: define <2 x i32> @vec2_shl_ctlz_false_neg( -; CHECK-SAME: <2 x i32> [[TMP0:%.*]]) { -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i32> , [[TMP0]] -; CHECK-NEXT: [[CTLZ:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[SHL]], i1 false), !range [[RNG2:![0-9]+]] -; CHECK-NEXT: ret <2 x i32> [[CTLZ]] -; - %shl = shl <2 x i32> , %0 - %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %shl, i1 false) - ret <2 x i32> %ctlz -} - -define <2 x i32> @vec2_lshr_cttz_false_neg(<2 x i32>) { -; CHECK-LABEL: define <2 x i32> @vec2_lshr_cttz_false_neg( -; CHECK-SAME: <2 x i32> [[TMP0:%.*]]) { -; CHECK-NEXT: [[LSHR:%.*]] = lshr <2 x i32> , [[TMP0]] -; CHECK-NEXT: [[CTTZ:%.*]] = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[LSHR]], i1 false), !range [[RNG2]] -; CHECK-NEXT: ret <2 x i32> [[CTTZ]] -; - %lshr = lshr <2 x i32> , %0 - %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %lshr, i1 false) - ret <2 x i32> %cttz -} - -define <2 x i32> @vec2_shl_cttz_false_neg(<2 x i32>) { -; CHECK-LABEL: define <2 x i32> @vec2_shl_cttz_false_neg( -; CHECK-SAME: <2 x i32> [[TMP0:%.*]]) { -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i32> , [[TMP0]] -; CHECK-NEXT: [[CTTZ:%.*]] = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[SHL]], i1 false), !range [[RNG3:![0-9]+]] -; CHECK-NEXT: ret <2 x i32> [[CTTZ]] -; - %shl = shl <2 x i32> , %0 - %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %shl, i1 false) - ret <2 x i32> %cttz -} - -define i32 @lshr_ctlz_faslse_neg(i32) { -; CHECK-LABEL: define i32 @lshr_ctlz_faslse_neg( -; CHECK-SAME: i32 [[TMP0:%.*]]) { -; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 8387584, [[TMP0]] -; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LSHR]], i1 false), !range [[RNG1]] -; CHECK-NEXT: ret i32 [[CTLZ]] -; - %lshr = lshr i32 8387584, %0 - %ctlz = call i32 @llvm.ctlz.i32(i32 %lshr, i1 false) - ret i32 %ctlz -} - -define i32 @shl_ctlz_false_neg(i32) { -; CHECK-LABEL: define i32 @shl_ctlz_false_neg( -; CHECK-SAME: i32 [[TMP0:%.*]]) { -; CHECK-NEXT: [[SHL:%.*]] = shl i32 8387584, [[TMP0]] -; CHECK-NEXT: [[CTLZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[SHL]], i1 false), !range [[RNG2]] -; CHECK-NEXT: ret i32 [[CTLZ]] -; - %shl = shl i32 8387584, %0 - %ctlz = call i32 @llvm.ctlz.i32(i32 %shl, i1 false) - ret i32 %ctlz -} - -define i32 @lshr_cttz_false_neg(i32) { -; CHECK-LABEL: define i32 @lshr_cttz_false_neg( -; CHECK-SAME: i32 [[TMP0:%.*]]) { -; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 8387584, [[TMP0]] -; CHECK-NEXT: [[CTTZ:%.*]] = call i32 @llvm.cttz.i32(i32 [[LSHR]], i1 false), !range [[RNG2]] -; CHECK-NEXT: ret i32 [[CTTZ]] -; - %lshr = lshr i32 8387584, %0 - %cttz = call i32 @llvm.cttz.i32(i32 %lshr, i1 false) - ret i32 %cttz -} - -define i32 @shl_cttz_false_neg(i32) { -; CHECK-LABEL: define i32 @shl_cttz_false_neg( -; CHECK-SAME: i32 [[TMP0:%.*]]) { -; CHECK-NEXT: [[SHL:%.*]] = shl i32 8387584, [[TMP0]] -; CHECK-NEXT: [[CTTZ:%.*]] = call i32 @llvm.cttz.i32(i32 [[SHL]], i1 false), !range [[RNG4:![0-9]+]] -; CHECK-NEXT: ret i32 [[CTTZ]] -; - %shl = shl i32 8387584, %0 - %cttz = call i32 @llvm.cttz.i32(i32 %shl, i1 false) - ret i32 %cttz -} -;. -; CHECK: [[RNG0]] = !{i32 1, i32 33} -; CHECK: [[RNG1]] = !{i32 9, i32 33} -; CHECK: [[RNG2]] = !{i32 0, i32 33} -; CHECK: [[RNG3]] = !{i32 3, i32 33} -; CHECK: [[RNG4]] = !{i32 10, i32 33} -;. 
diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll index 99c784d15eb30..7d266b7b246ae 100644 --- a/llvm/test/Transforms/InstCombine/icmp-gep.ll +++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll @@ -397,52 +397,6 @@ define i1 @test61_as1(ptr addrspace(1) %foo, i16 %i, i16 %j) { ; Don't transform non-inbounds GEPs. } -define i1 @test60_extra_use(ptr %foo, i64 %i, i64 %j) { -; CHECK-LABEL: @test60_extra_use( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[FOO:%.*]], i64 [[I:%.*]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i16, ptr [[FOO]], i64 [[J:%.*]] -; CHECK-NEXT: call void @use(ptr [[GEP1]]) -; CHECK-NEXT: call void @use(ptr [[GEP2]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ult ptr [[GEP1]], [[GEP2]] -; CHECK-NEXT: ret i1 [[CMP]] -; - %gep1 = getelementptr inbounds i32, ptr %foo, i64 %i - %gep2 = getelementptr inbounds i16, ptr %foo, i64 %j - call void @use(ptr %gep1) - call void @use(ptr %gep2) - %cmp = icmp ult ptr %gep1, %gep2 - ret i1 %cmp -} - -define i1 @test60_extra_use_const_operands_inbounds(ptr %foo, i64 %i, i64 %j) { -; CHECK-LABEL: @test60_extra_use_const_operands_inbounds( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[FOO:%.*]], i64 1 -; CHECK-NEXT: call void @use(ptr nonnull [[GEP1]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[J:%.*]], 2 -; CHECK-NEXT: ret i1 [[CMP]] -; - %gep1 = getelementptr inbounds i32, ptr %foo, i64 1 - %gep2 = getelementptr inbounds i16, ptr %foo, i64 %j - call void @use(ptr %gep1) - %cmp = icmp eq ptr %gep1, %gep2 - ret i1 %cmp -} - -define i1 @test60_extra_use_const_operands_no_inbounds(ptr %foo, i64 %i, i64 %j) { -; CHECK-LABEL: @test60_extra_use_const_operands_no_inbounds( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[FOO:%.*]], i64 1 -; CHECK-NEXT: call void @use(ptr [[GEP1]]) -; CHECK-NEXT: [[GEP2_IDX_MASK:%.*]] = and i64 [[J:%.*]], 9223372036854775807 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[GEP2_IDX_MASK]], 2 -; CHECK-NEXT: ret i1 [[CMP]] -; - %gep1 = getelementptr i32, ptr %foo, i64 1 - %gep2 = getelementptr i16, ptr %foo, i64 %j - call void @use(ptr %gep1) - %cmp = icmp eq ptr %gep1, %gep2 - ret i1 %cmp -} - define i1 @test_scalable_same(ptr %x) { ; CHECK-LABEL: @test_scalable_same( ; CHECK-NEXT: ret i1 false diff --git a/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll index 7f4603881f23c..574f3e23a4ecf 100644 --- a/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll @@ -101,7 +101,7 @@ define i1 @both(i8 %bits0, i8 %bits1) { ; CHECK-LABEL: @both( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[BITS0:%.*]] ; CHECK-NEXT: [[T2:%.*]] = shl nsw i8 -1, [[BITS1:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[T0]], [[T2]] +; CHECK-NEXT: [[R:%.*]] = icmp uge i8 [[T2]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = shl i8 -1, %bits0 diff --git a/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll index c7a45c5cdc11a..5f0af994f0eb8 100644 --- 
a/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll @@ -101,7 +101,7 @@ define i1 @both(i8 %bits0, i8 %bits1) { ; CHECK-LABEL: @both( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[BITS0:%.*]] ; CHECK-NEXT: [[T2:%.*]] = shl nsw i8 -1, [[BITS1:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[T0]], [[T2]] +; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[T2]], [[T0]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = shl i8 -1, %bits0 diff --git a/llvm/test/Transforms/InstCombine/not.ll b/llvm/test/Transforms/InstCombine/not.ll index 3b0e5b4412fbe..847625af8b9c8 100644 --- a/llvm/test/Transforms/InstCombine/not.ll +++ b/llvm/test/Transforms/InstCombine/not.ll @@ -39,7 +39,7 @@ define i1 @invert_fcmp(float %X, float %Y) { define i1 @not_not_cmp(i32 %a, i32 %b) { ; CHECK-LABEL: @not_not_cmp( -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: ret i1 [[CMP]] ; %nota = xor i32 %a, -1 @@ -50,7 +50,7 @@ define i1 @not_not_cmp(i32 %a, i32 %b) { define <2 x i1> @not_not_cmp_vector(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: @not_not_cmp_vector( -; CHECK-NEXT: [[CMP:%.*]] = icmp ult <2 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt <2 x i32> [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %nota = xor <2 x i32> %a, <i32 -1, i32 -1> @@ -727,8 +727,9 @@ define i8 @bitcast_to_scalar_sext_bool_use2(<4 x i1> %b) { define i1 @invert_both_cmp_operands_add(i32 %a, i32 %b) { ; CHECK-LABEL: @invert_both_cmp_operands_add( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = sub i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], -1 +; CHECK-NEXT: [[NOT_A:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[NOT_A]], [[B:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[ADD]], 0 ; CHECK-NEXT: ret i1 [[CMP]] ; entry: @@ -741,8 +742,9 @@ entry: define i1 @invert_both_cmp_operands_sub(i32 %a, i32 %b) { ; CHECK-LABEL: @invert_both_cmp_operands_sub( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP0]], -43 +; CHECK-NEXT: [[NOT_A:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[ADD:%.*]] = sub i32 [[NOT_A]], [[B:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD]], 42 ; CHECK-NEXT: ret i1 [[CMP]] ; entry: @@ -755,9 +757,12 @@ entry: define i1 @invert_both_cmp_operands_complex(i1 %x, i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: @invert_both_cmp_operands_complex( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = sub i32 [[A:%.*]], [[C:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[X:%.*]], i32 [[TMP0]], i32 [[B:%.*]] -; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[TMP1]], [[C]] +; CHECK-NEXT: [[NOT_A:%.*]] = xor i32 [[A:%.*]], -1 +; CHECK-NEXT: [[NOT_B:%.*]] = xor i32 [[B:%.*]], -1 +; CHECK-NEXT: [[NOT_C:%.*]] = xor i32 [[C:%.*]], -1 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[NOT_A]], [[C]] +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[X:%.*]], i32 [[ADD]], i32 [[NOT_B]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 [[SELECT]], [[NOT_C]] ; CHECK-NEXT: ret i1 [[CMP]] ; entry: diff --git a/llvm/test/Transforms/InstSimplify/returned.ll b/llvm/test/Transforms/InstSimplify/returned.ll index 2da1052e7d4d8..94a98ac6cb05d 100644 --- a/llvm/test/Transforms/InstSimplify/returned.ll +++ b/llvm/test/Transforms/InstSimplify/returned.ll @@ -25,31 +25,6 @@ define i1 @gep3()
{ ret i1 %equal } -define <8 x i1> @returned_vec_arg_casted(<2 x i32> %a) { -; CHECK-LABEL: @returned_vec_arg_casted( -; CHECK-NEXT: [[X:%.*]] = call <8 x i8> @passthru_8i8v_from_2i32v(<2 x i32> [[A:%.*]]) -; CHECK-NEXT: [[C:%.*]] = icmp slt <8 x i8> [[X]], zeroinitializer -; CHECK-NEXT: ret <8 x i1> [[C]] -; - %x = call <8 x i8> @passthru_8i8v_from_2i32v(<2 x i32> %a) - %C = icmp slt <8 x i8> %x, zeroinitializer - ret <8 x i1> %C -} - -define <8 x i1> @returned_vec_arg_casted2(<2 x i32> %a) { -; CHECK-LABEL: @returned_vec_arg_casted2( -; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[A:%.*]], -; CHECK-NEXT: [[X:%.*]] = call <8 x i8> @passthru_8i8v_from_2i32v(<2 x i32> [[OR]]) -; CHECK-NEXT: [[C:%.*]] = icmp ne <8 x i8> [[X]], zeroinitializer -; CHECK-NEXT: ret <8 x i1> [[C]] -; - %or = or <2 x i32> %a, - %x = call <8 x i8> @passthru_8i8v_from_2i32v(<2 x i32> %or) - %C = icmp ne <8 x i8> %x, zeroinitializer - ret <8 x i1> %C -} - -declare <8 x i8> @passthru_8i8v_from_2i32v(<2 x i32> returned) declare ptr @func1(ptr returned) nounwind readnone willreturn declare ptr @func2(ptr returned) nounwind readnone willreturn diff --git a/llvm/test/Transforms/LoopUnroll/loop-branch-folding.ll b/llvm/test/Transforms/LoopUnroll/loop-branch-folding.ll deleted file mode 100644 index f0a6c0a954f6a..0000000000000 --- a/llvm/test/Transforms/LoopUnroll/loop-branch-folding.ll +++ /dev/null @@ -1,936 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt < %s -S -passes=simplifycfg | FileCheck %s --check-prefixes=CHECK-CFG -; RUN: opt < %s -S -passes=simplifycfg,loop-unroll --unroll-max-upperbound=17 | FileCheck %s --check-prefixes=CHECK-UNROLL - -; This test is designed to check: -; We can still unroll a loop with 'pragma unroll' if the loop count (trip count) was destroyed by a previous optimization. -; For example, in the following test, the loop condition "Dim < 16" was 'merged' with "Dim == Dims" when folding branches -; in simplifycfg. But if the user marks the loop with "#pragma unroll", we can still successfully unroll it under -; unroll-max-upperbound.
-; -; __device__ void func(int Idx, int *Arr[], int Dims, int *Out) { -; #pragma unroll -; for (int Dim = 0; Dim < 16; ++Dim) { -; if (Dim == Dims) { -; break; -; } -; int divmod = Arr[Dim][Idx]; -; Idx = divmod + 1; -; -; for (int arg = 0; arg < 4; arg++) { -; Out[arg] += Arr[Dim][arg]; -; bar(); -; } -; } -; } - -define void @func(i32 noundef %Idx, ptr noundef %Arr, i32 noundef %Dims, ptr noundef %Out) { -; CHECK-CFG-LABEL: define void @func( -; CHECK-CFG-SAME: i32 noundef [[IDX:%.*]], ptr noundef [[ARR:%.*]], i32 noundef [[DIMS:%.*]], ptr noundef [[OUT:%.*]]) { -; CHECK-CFG-NEXT: entry: -; CHECK-CFG-NEXT: br label [[FOR_COND:%.*]] -; CHECK-CFG: for.cond: -; CHECK-CFG-NEXT: [[DIM_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC16:%.*]], [[FOR_COND_CLEANUP6:%.*]] ] -; CHECK-CFG-NEXT: [[IDX_ADDR_0:%.*]] = phi i32 [ [[IDX]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_COND_CLEANUP6]] ] -; CHECK-CFG-NEXT: [[CMP:%.*]] = icmp sge i32 [[DIM_0]], 16 -; CHECK-CFG-NEXT: [[CMP1:%.*]] = icmp eq i32 [[DIM_0]], [[DIMS]] -; CHECK-CFG-NEXT: [[OR_COND:%.*]] = or i1 [[CMP]], [[CMP1]] -; CHECK-CFG-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] -; CHECK-CFG: if.end: -; CHECK-CFG-NEXT: [[IDXPROM:%.*]] = sext i32 [[DIM_0]] to i64 -; CHECK-CFG-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 [[IDXPROM]] -; CHECK-CFG-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -; CHECK-CFG-NEXT: [[IDXPROM2:%.*]] = sext i32 [[IDX_ADDR_0]] to i64 -; CHECK-CFG-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[IDXPROM2]] -; CHECK-CFG-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 -; CHECK-CFG-NEXT: [[ADD]] = add nsw i32 [[TMP1]], 1 -; CHECK-CFG-NEXT: br label [[FOR_COND4:%.*]] -; CHECK-CFG: for.cond4: -; CHECK-CFG-NEXT: [[ARG_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[INC:%.*]], [[FOR_BODY7:%.*]] ] -; CHECK-CFG-NEXT: [[CMP5:%.*]] = icmp slt i32 [[ARG_0]], 4 -; CHECK-CFG-NEXT: br i1 [[CMP5]], label [[FOR_BODY7]], label [[FOR_COND_CLEANUP6]] -; CHECK-CFG: for.cond.cleanup6: -; CHECK-CFG-NEXT: [[INC16]] = add nsw i32 [[DIM_0]], 1 -; CHECK-CFG-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK-CFG: for.body7: -; CHECK-CFG-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8 -; CHECK-CFG-NEXT: [[IDXPROM10:%.*]] = sext i32 [[ARG_0]] to i64 -; CHECK-CFG-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[IDXPROM10]] -; CHECK-CFG-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4 -; CHECK-CFG-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 [[IDXPROM10]] -; CHECK-CFG-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 -; CHECK-CFG-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP4]], [[TMP3]] -; CHECK-CFG-NEXT: store i32 [[ADD14]], ptr [[ARRAYIDX13]], align 4 -; CHECK-CFG-NEXT: call void @_Z3barv() -; CHECK-CFG-NEXT: [[INC]] = add nsw i32 [[ARG_0]], 1 -; CHECK-CFG-NEXT: br label [[FOR_COND4]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK-CFG: cleanup: -; CHECK-CFG-NEXT: ret void -; -; CHECK-UNROLL-LABEL: define void @func( -; CHECK-UNROLL-SAME: i32 noundef [[IDX:%.*]], ptr noundef [[ARR:%.*]], i32 noundef [[DIMS:%.*]], ptr noundef [[OUT:%.*]]) { -; CHECK-UNROLL-NEXT: entry: -; CHECK-UNROLL-NEXT: br label [[FOR_COND:%.*]] -; CHECK-UNROLL: for.cond: -; CHECK-UNROLL-NEXT: [[CMP1:%.*]] = icmp eq i32 0, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] -; CHECK-UNROLL: if.end: -; CHECK-UNROLL-NEXT: br label [[FOR_COND4:%.*]] -; CHECK-UNROLL: for.cond4: -; 
CHECK-UNROLL-NEXT: br label [[FOR_BODY7:%.*]] -; CHECK-UNROLL: for.cond.cleanup6: -; CHECK-UNROLL-NEXT: [[CMP1_1:%.*]] = icmp eq i32 1, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1_1]], label [[CLEANUP]], label [[IF_END_1:%.*]] -; CHECK-UNROLL: if.end.1: -; CHECK-UNROLL-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 1 -; CHECK-UNROLL-NEXT: br label [[FOR_COND4_1:%.*]] -; CHECK-UNROLL: for.cond4.1: -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_12:%.*]] -; CHECK-UNROLL: for.body7.12: -; CHECK-UNROLL-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX_1]], align 8 -; CHECK-UNROLL-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 -; CHECK-UNROLL-NEXT: [[TMP2:%.*]] = load i32, ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_11:%.*]] = add nsw i32 [[TMP2]], [[TMP1]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_11]], ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_1:%.*]] -; CHECK-UNROLL: for.body7.1.1: -; CHECK-UNROLL-NEXT: [[TMP3:%.*]] = load ptr, ptr [[ARRAYIDX_1]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_1:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX11_1_1]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_1:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX13_1_1]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_1_1:%.*]] = add nsw i32 [[TMP5]], [[TMP4]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_1]], ptr [[ARRAYIDX13_1_1]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_1:%.*]] -; CHECK-UNROLL: for.body7.2.1: -; CHECK-UNROLL-NEXT: [[TMP6:%.*]] = load ptr, ptr [[ARRAYIDX_1]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_1:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX11_2_1]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_1:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX13_2_1]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_2_1:%.*]] = add nsw i32 [[TMP8]], [[TMP7]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_1]], ptr [[ARRAYIDX13_2_1]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_1:%.*]] -; CHECK-UNROLL: for.body7.3.1: -; CHECK-UNROLL-NEXT: [[TMP9:%.*]] = load ptr, ptr [[ARRAYIDX_1]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_1:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX11_3_1]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_1:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX13_3_1]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_3_1:%.*]] = add nsw i32 [[TMP11]], [[TMP10]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_1]], ptr [[ARRAYIDX13_3_1]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4:%.*]], label [[FOR_COND_CLEANUP6_1:%.*]] -; CHECK-UNROLL: for.cond.cleanup6.1: -; CHECK-UNROLL-NEXT: [[CMP1_2:%.*]] = icmp eq i32 2, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1_2]], label [[CLEANUP]], label [[IF_END_2:%.*]] -; CHECK-UNROLL: if.end.2: -; CHECK-UNROLL-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 2 -; CHECK-UNROLL-NEXT: br label [[FOR_COND4_2:%.*]] -; CHECK-UNROLL: for.cond4.2: -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_24:%.*]] -; 
CHECK-UNROLL: for.body7.24: -; CHECK-UNROLL-NEXT: [[TMP12:%.*]] = load ptr, ptr [[ARRAYIDX_2]], align 8 -; CHECK-UNROLL-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 -; CHECK-UNROLL-NEXT: [[TMP14:%.*]] = load i32, ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_23:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_23]], ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_2:%.*]] -; CHECK-UNROLL: for.body7.1.2: -; CHECK-UNROLL-NEXT: [[TMP15:%.*]] = load ptr, ptr [[ARRAYIDX_2]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_2:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX11_1_2]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_2:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX13_1_2]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_1_2:%.*]] = add nsw i32 [[TMP17]], [[TMP16]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_2]], ptr [[ARRAYIDX13_1_2]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_2:%.*]] -; CHECK-UNROLL: for.body7.2.2: -; CHECK-UNROLL-NEXT: [[TMP18:%.*]] = load ptr, ptr [[ARRAYIDX_2]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_2:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX11_2_2]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_2:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX13_2_2]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_2_2:%.*]] = add nsw i32 [[TMP20]], [[TMP19]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_2]], ptr [[ARRAYIDX13_2_2]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_2:%.*]] -; CHECK-UNROLL: for.body7.3.2: -; CHECK-UNROLL-NEXT: [[TMP21:%.*]] = load ptr, ptr [[ARRAYIDX_2]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_2:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX11_3_2]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_2:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP23:%.*]] = load i32, ptr [[ARRAYIDX13_3_2]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_3_2:%.*]] = add nsw i32 [[TMP23]], [[TMP22]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_2]], ptr [[ARRAYIDX13_3_2]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_2:%.*]] -; CHECK-UNROLL: for.cond.cleanup6.2: -; CHECK-UNROLL-NEXT: [[CMP1_3:%.*]] = icmp eq i32 3, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1_3]], label [[CLEANUP]], label [[IF_END_3:%.*]] -; CHECK-UNROLL: if.end.3: -; CHECK-UNROLL-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 3 -; CHECK-UNROLL-NEXT: br label [[FOR_COND4_3:%.*]] -; CHECK-UNROLL: for.cond4.3: -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_36:%.*]] -; CHECK-UNROLL: for.body7.36: -; CHECK-UNROLL-NEXT: [[TMP24:%.*]] = load ptr, ptr [[ARRAYIDX_3]], align 8 -; CHECK-UNROLL-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4 -; CHECK-UNROLL-NEXT: [[TMP26:%.*]] = load i32, ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_35:%.*]] = add nsw i32 [[TMP26]], [[TMP25]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_35]], ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label 
[[FOR_BODY7_1_3:%.*]] -; CHECK-UNROLL: for.body7.1.3: -; CHECK-UNROLL-NEXT: [[TMP27:%.*]] = load ptr, ptr [[ARRAYIDX_3]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_3:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX11_1_3]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_3:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX13_1_3]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_1_3:%.*]] = add nsw i32 [[TMP29]], [[TMP28]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_3]], ptr [[ARRAYIDX13_1_3]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_3:%.*]] -; CHECK-UNROLL: for.body7.2.3: -; CHECK-UNROLL-NEXT: [[TMP30:%.*]] = load ptr, ptr [[ARRAYIDX_3]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_3:%.*]] = getelementptr inbounds i32, ptr [[TMP30]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX11_2_3]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_3:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX13_2_3]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_2_3:%.*]] = add nsw i32 [[TMP32]], [[TMP31]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_3]], ptr [[ARRAYIDX13_2_3]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_3:%.*]] -; CHECK-UNROLL: for.body7.3.3: -; CHECK-UNROLL-NEXT: [[TMP33:%.*]] = load ptr, ptr [[ARRAYIDX_3]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_3:%.*]] = getelementptr inbounds i32, ptr [[TMP33]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX11_3_3]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_3:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX13_3_3]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_3_3:%.*]] = add nsw i32 [[TMP35]], [[TMP34]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_3]], ptr [[ARRAYIDX13_3_3]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_3:%.*]] -; CHECK-UNROLL: for.cond.cleanup6.3: -; CHECK-UNROLL-NEXT: [[CMP1_4:%.*]] = icmp eq i32 4, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1_4]], label [[CLEANUP]], label [[IF_END_4:%.*]] -; CHECK-UNROLL: if.end.4: -; CHECK-UNROLL-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 4 -; CHECK-UNROLL-NEXT: br label [[FOR_COND4_4:%.*]] -; CHECK-UNROLL: for.cond4.4: -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_48:%.*]] -; CHECK-UNROLL: for.body7.48: -; CHECK-UNROLL-NEXT: [[TMP36:%.*]] = load ptr, ptr [[ARRAYIDX_4]], align 8 -; CHECK-UNROLL-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP36]], align 4 -; CHECK-UNROLL-NEXT: [[TMP38:%.*]] = load i32, ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_47:%.*]] = add nsw i32 [[TMP38]], [[TMP37]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_47]], ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_4:%.*]] -; CHECK-UNROLL: for.body7.1.4: -; CHECK-UNROLL-NEXT: [[TMP39:%.*]] = load ptr, ptr [[ARRAYIDX_4]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_4:%.*]] = getelementptr inbounds i32, ptr [[TMP39]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP40:%.*]] = load i32, ptr [[ARRAYIDX11_1_4]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_4:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP41:%.*]] = load i32, ptr [[ARRAYIDX13_1_4]], 
align 4 -; CHECK-UNROLL-NEXT: [[ADD14_1_4:%.*]] = add nsw i32 [[TMP41]], [[TMP40]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_4]], ptr [[ARRAYIDX13_1_4]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_4:%.*]] -; CHECK-UNROLL: for.body7.2.4: -; CHECK-UNROLL-NEXT: [[TMP42:%.*]] = load ptr, ptr [[ARRAYIDX_4]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_4:%.*]] = getelementptr inbounds i32, ptr [[TMP42]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX11_2_4]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_4:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP44:%.*]] = load i32, ptr [[ARRAYIDX13_2_4]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_2_4:%.*]] = add nsw i32 [[TMP44]], [[TMP43]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_4]], ptr [[ARRAYIDX13_2_4]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_4:%.*]] -; CHECK-UNROLL: for.body7.3.4: -; CHECK-UNROLL-NEXT: [[TMP45:%.*]] = load ptr, ptr [[ARRAYIDX_4]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_4:%.*]] = getelementptr inbounds i32, ptr [[TMP45]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP46:%.*]] = load i32, ptr [[ARRAYIDX11_3_4]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_4:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP47:%.*]] = load i32, ptr [[ARRAYIDX13_3_4]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_3_4:%.*]] = add nsw i32 [[TMP47]], [[TMP46]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_4]], ptr [[ARRAYIDX13_3_4]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_4:%.*]] -; CHECK-UNROLL: for.cond.cleanup6.4: -; CHECK-UNROLL-NEXT: [[CMP1_5:%.*]] = icmp eq i32 5, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1_5]], label [[CLEANUP]], label [[IF_END_5:%.*]] -; CHECK-UNROLL: if.end.5: -; CHECK-UNROLL-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 5 -; CHECK-UNROLL-NEXT: br label [[FOR_COND4_5:%.*]] -; CHECK-UNROLL: for.cond4.5: -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_5:%.*]] -; CHECK-UNROLL: for.body7.5: -; CHECK-UNROLL-NEXT: [[TMP48:%.*]] = load ptr, ptr [[ARRAYIDX_5]], align 8 -; CHECK-UNROLL-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP48]], align 4 -; CHECK-UNROLL-NEXT: [[TMP50:%.*]] = load i32, ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_5:%.*]] = add nsw i32 [[TMP50]], [[TMP49]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_5]], ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_5:%.*]] -; CHECK-UNROLL: for.body7.1.5: -; CHECK-UNROLL-NEXT: [[TMP51:%.*]] = load ptr, ptr [[ARRAYIDX_5]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_5:%.*]] = getelementptr inbounds i32, ptr [[TMP51]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP52:%.*]] = load i32, ptr [[ARRAYIDX11_1_5]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_5:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP53:%.*]] = load i32, ptr [[ARRAYIDX13_1_5]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_1_5:%.*]] = add nsw i32 [[TMP53]], [[TMP52]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_5]], ptr [[ARRAYIDX13_1_5]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_5:%.*]] -; CHECK-UNROLL: for.body7.2.5: -; CHECK-UNROLL-NEXT: [[TMP54:%.*]] = load ptr, ptr [[ARRAYIDX_5]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_5:%.*]] = getelementptr inbounds i32, ptr [[TMP54]], i64 2 -; 
CHECK-UNROLL-NEXT: [[TMP55:%.*]] = load i32, ptr [[ARRAYIDX11_2_5]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_5:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP56:%.*]] = load i32, ptr [[ARRAYIDX13_2_5]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_2_5:%.*]] = add nsw i32 [[TMP56]], [[TMP55]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_5]], ptr [[ARRAYIDX13_2_5]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_5:%.*]] -; CHECK-UNROLL: for.body7.3.5: -; CHECK-UNROLL-NEXT: [[TMP57:%.*]] = load ptr, ptr [[ARRAYIDX_5]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_5:%.*]] = getelementptr inbounds i32, ptr [[TMP57]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP58:%.*]] = load i32, ptr [[ARRAYIDX11_3_5]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_5:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP59:%.*]] = load i32, ptr [[ARRAYIDX13_3_5]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_3_5:%.*]] = add nsw i32 [[TMP59]], [[TMP58]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_5]], ptr [[ARRAYIDX13_3_5]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_5:%.*]] -; CHECK-UNROLL: for.cond.cleanup6.5: -; CHECK-UNROLL-NEXT: [[CMP1_6:%.*]] = icmp eq i32 6, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1_6]], label [[CLEANUP]], label [[IF_END_6:%.*]] -; CHECK-UNROLL: if.end.6: -; CHECK-UNROLL-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 6 -; CHECK-UNROLL-NEXT: br label [[FOR_COND4_6:%.*]] -; CHECK-UNROLL: for.cond4.6: -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_6:%.*]] -; CHECK-UNROLL: for.body7.6: -; CHECK-UNROLL-NEXT: [[TMP60:%.*]] = load ptr, ptr [[ARRAYIDX_6]], align 8 -; CHECK-UNROLL-NEXT: [[TMP61:%.*]] = load i32, ptr [[TMP60]], align 4 -; CHECK-UNROLL-NEXT: [[TMP62:%.*]] = load i32, ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_6:%.*]] = add nsw i32 [[TMP62]], [[TMP61]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_6]], ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_6:%.*]] -; CHECK-UNROLL: for.body7.1.6: -; CHECK-UNROLL-NEXT: [[TMP63:%.*]] = load ptr, ptr [[ARRAYIDX_6]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_6:%.*]] = getelementptr inbounds i32, ptr [[TMP63]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP64:%.*]] = load i32, ptr [[ARRAYIDX11_1_6]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_6:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP65:%.*]] = load i32, ptr [[ARRAYIDX13_1_6]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_1_6:%.*]] = add nsw i32 [[TMP65]], [[TMP64]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_6]], ptr [[ARRAYIDX13_1_6]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_6:%.*]] -; CHECK-UNROLL: for.body7.2.6: -; CHECK-UNROLL-NEXT: [[TMP66:%.*]] = load ptr, ptr [[ARRAYIDX_6]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_6:%.*]] = getelementptr inbounds i32, ptr [[TMP66]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP67:%.*]] = load i32, ptr [[ARRAYIDX11_2_6]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_6:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP68:%.*]] = load i32, ptr [[ARRAYIDX13_2_6]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_2_6:%.*]] = add nsw i32 [[TMP68]], [[TMP67]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_6]], ptr [[ARRAYIDX13_2_6]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label 
[[FOR_BODY7_3_6:%.*]] -; CHECK-UNROLL: for.body7.3.6: -; CHECK-UNROLL-NEXT: [[TMP69:%.*]] = load ptr, ptr [[ARRAYIDX_6]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_6:%.*]] = getelementptr inbounds i32, ptr [[TMP69]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP70:%.*]] = load i32, ptr [[ARRAYIDX11_3_6]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_6:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP71:%.*]] = load i32, ptr [[ARRAYIDX13_3_6]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_3_6:%.*]] = add nsw i32 [[TMP71]], [[TMP70]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_6]], ptr [[ARRAYIDX13_3_6]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_6:%.*]] -; CHECK-UNROLL: for.cond.cleanup6.6: -; CHECK-UNROLL-NEXT: [[CMP1_7:%.*]] = icmp eq i32 7, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1_7]], label [[CLEANUP]], label [[IF_END_7:%.*]] -; CHECK-UNROLL: if.end.7: -; CHECK-UNROLL-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 7 -; CHECK-UNROLL-NEXT: br label [[FOR_COND4_7:%.*]] -; CHECK-UNROLL: for.cond4.7: -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_7:%.*]] -; CHECK-UNROLL: for.body7.7: -; CHECK-UNROLL-NEXT: [[TMP72:%.*]] = load ptr, ptr [[ARRAYIDX_7]], align 8 -; CHECK-UNROLL-NEXT: [[TMP73:%.*]] = load i32, ptr [[TMP72]], align 4 -; CHECK-UNROLL-NEXT: [[TMP74:%.*]] = load i32, ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_7:%.*]] = add nsw i32 [[TMP74]], [[TMP73]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_7]], ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_7:%.*]] -; CHECK-UNROLL: for.body7.1.7: -; CHECK-UNROLL-NEXT: [[TMP75:%.*]] = load ptr, ptr [[ARRAYIDX_7]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_7:%.*]] = getelementptr inbounds i32, ptr [[TMP75]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP76:%.*]] = load i32, ptr [[ARRAYIDX11_1_7]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_7:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP77:%.*]] = load i32, ptr [[ARRAYIDX13_1_7]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_1_7:%.*]] = add nsw i32 [[TMP77]], [[TMP76]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_7]], ptr [[ARRAYIDX13_1_7]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_7:%.*]] -; CHECK-UNROLL: for.body7.2.7: -; CHECK-UNROLL-NEXT: [[TMP78:%.*]] = load ptr, ptr [[ARRAYIDX_7]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_7:%.*]] = getelementptr inbounds i32, ptr [[TMP78]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP79:%.*]] = load i32, ptr [[ARRAYIDX11_2_7]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_7:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP80:%.*]] = load i32, ptr [[ARRAYIDX13_2_7]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_2_7:%.*]] = add nsw i32 [[TMP80]], [[TMP79]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_7]], ptr [[ARRAYIDX13_2_7]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_7:%.*]] -; CHECK-UNROLL: for.body7.3.7: -; CHECK-UNROLL-NEXT: [[TMP81:%.*]] = load ptr, ptr [[ARRAYIDX_7]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_7:%.*]] = getelementptr inbounds i32, ptr [[TMP81]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP82:%.*]] = load i32, ptr [[ARRAYIDX11_3_7]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_7:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP83:%.*]] = load i32, ptr [[ARRAYIDX13_3_7]], align 
4 -; CHECK-UNROLL-NEXT: [[ADD14_3_7:%.*]] = add nsw i32 [[TMP83]], [[TMP82]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_7]], ptr [[ARRAYIDX13_3_7]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_7:%.*]] -; CHECK-UNROLL: for.cond.cleanup6.7: -; CHECK-UNROLL-NEXT: [[CMP1_8:%.*]] = icmp eq i32 8, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1_8]], label [[CLEANUP]], label [[IF_END_8:%.*]] -; CHECK-UNROLL: if.end.8: -; CHECK-UNROLL-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 8 -; CHECK-UNROLL-NEXT: br label [[FOR_COND4_8:%.*]] -; CHECK-UNROLL: for.cond4.8: -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_8:%.*]] -; CHECK-UNROLL: for.body7.8: -; CHECK-UNROLL-NEXT: [[TMP84:%.*]] = load ptr, ptr [[ARRAYIDX_8]], align 8 -; CHECK-UNROLL-NEXT: [[TMP85:%.*]] = load i32, ptr [[TMP84]], align 4 -; CHECK-UNROLL-NEXT: [[TMP86:%.*]] = load i32, ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_8:%.*]] = add nsw i32 [[TMP86]], [[TMP85]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_8]], ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_8:%.*]] -; CHECK-UNROLL: for.body7.1.8: -; CHECK-UNROLL-NEXT: [[TMP87:%.*]] = load ptr, ptr [[ARRAYIDX_8]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_8:%.*]] = getelementptr inbounds i32, ptr [[TMP87]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP88:%.*]] = load i32, ptr [[ARRAYIDX11_1_8]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_8:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP89:%.*]] = load i32, ptr [[ARRAYIDX13_1_8]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_1_8:%.*]] = add nsw i32 [[TMP89]], [[TMP88]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_8]], ptr [[ARRAYIDX13_1_8]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_8:%.*]] -; CHECK-UNROLL: for.body7.2.8: -; CHECK-UNROLL-NEXT: [[TMP90:%.*]] = load ptr, ptr [[ARRAYIDX_8]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_8:%.*]] = getelementptr inbounds i32, ptr [[TMP90]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP91:%.*]] = load i32, ptr [[ARRAYIDX11_2_8]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_8:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP92:%.*]] = load i32, ptr [[ARRAYIDX13_2_8]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_2_8:%.*]] = add nsw i32 [[TMP92]], [[TMP91]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_8]], ptr [[ARRAYIDX13_2_8]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_8:%.*]] -; CHECK-UNROLL: for.body7.3.8: -; CHECK-UNROLL-NEXT: [[TMP93:%.*]] = load ptr, ptr [[ARRAYIDX_8]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_8:%.*]] = getelementptr inbounds i32, ptr [[TMP93]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP94:%.*]] = load i32, ptr [[ARRAYIDX11_3_8]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_8:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP95:%.*]] = load i32, ptr [[ARRAYIDX13_3_8]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_3_8:%.*]] = add nsw i32 [[TMP95]], [[TMP94]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_8]], ptr [[ARRAYIDX13_3_8]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_8:%.*]] -; CHECK-UNROLL: for.cond.cleanup6.8: -; CHECK-UNROLL-NEXT: [[CMP1_9:%.*]] = icmp eq i32 9, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1_9]], label [[CLEANUP]], label [[IF_END_9:%.*]] -; 
CHECK-UNROLL: if.end.9: -; CHECK-UNROLL-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 9 -; CHECK-UNROLL-NEXT: br label [[FOR_COND4_9:%.*]] -; CHECK-UNROLL: for.cond4.9: -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_9:%.*]] -; CHECK-UNROLL: for.body7.9: -; CHECK-UNROLL-NEXT: [[TMP96:%.*]] = load ptr, ptr [[ARRAYIDX_9]], align 8 -; CHECK-UNROLL-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP96]], align 4 -; CHECK-UNROLL-NEXT: [[TMP98:%.*]] = load i32, ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_9:%.*]] = add nsw i32 [[TMP98]], [[TMP97]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_9]], ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_9:%.*]] -; CHECK-UNROLL: for.body7.1.9: -; CHECK-UNROLL-NEXT: [[TMP99:%.*]] = load ptr, ptr [[ARRAYIDX_9]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_9:%.*]] = getelementptr inbounds i32, ptr [[TMP99]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP100:%.*]] = load i32, ptr [[ARRAYIDX11_1_9]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_9:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP101:%.*]] = load i32, ptr [[ARRAYIDX13_1_9]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_1_9:%.*]] = add nsw i32 [[TMP101]], [[TMP100]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_9]], ptr [[ARRAYIDX13_1_9]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_9:%.*]] -; CHECK-UNROLL: for.body7.2.9: -; CHECK-UNROLL-NEXT: [[TMP102:%.*]] = load ptr, ptr [[ARRAYIDX_9]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_9:%.*]] = getelementptr inbounds i32, ptr [[TMP102]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP103:%.*]] = load i32, ptr [[ARRAYIDX11_2_9]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_9:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP104:%.*]] = load i32, ptr [[ARRAYIDX13_2_9]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_2_9:%.*]] = add nsw i32 [[TMP104]], [[TMP103]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_9]], ptr [[ARRAYIDX13_2_9]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_9:%.*]] -; CHECK-UNROLL: for.body7.3.9: -; CHECK-UNROLL-NEXT: [[TMP105:%.*]] = load ptr, ptr [[ARRAYIDX_9]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_9:%.*]] = getelementptr inbounds i32, ptr [[TMP105]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP106:%.*]] = load i32, ptr [[ARRAYIDX11_3_9]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_9:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP107:%.*]] = load i32, ptr [[ARRAYIDX13_3_9]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_3_9:%.*]] = add nsw i32 [[TMP107]], [[TMP106]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_9]], ptr [[ARRAYIDX13_3_9]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_9:%.*]] -; CHECK-UNROLL: for.cond.cleanup6.9: -; CHECK-UNROLL-NEXT: [[CMP1_10:%.*]] = icmp eq i32 10, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1_10]], label [[CLEANUP]], label [[IF_END_10:%.*]] -; CHECK-UNROLL: if.end.10: -; CHECK-UNROLL-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 10 -; CHECK-UNROLL-NEXT: br label [[FOR_COND4_10:%.*]] -; CHECK-UNROLL: for.cond4.10: -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_10:%.*]] -; CHECK-UNROLL: for.body7.10: -; CHECK-UNROLL-NEXT: [[TMP108:%.*]] = load ptr, ptr [[ARRAYIDX_10]], align 8 -; CHECK-UNROLL-NEXT: [[TMP109:%.*]] = load i32, ptr [[TMP108]], align 4 -; CHECK-UNROLL-NEXT: 
[[TMP110:%.*]] = load i32, ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_10:%.*]] = add nsw i32 [[TMP110]], [[TMP109]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_10]], ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_10:%.*]] -; CHECK-UNROLL: for.body7.1.10: -; CHECK-UNROLL-NEXT: [[TMP111:%.*]] = load ptr, ptr [[ARRAYIDX_10]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_10:%.*]] = getelementptr inbounds i32, ptr [[TMP111]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP112:%.*]] = load i32, ptr [[ARRAYIDX11_1_10]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_10:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP113:%.*]] = load i32, ptr [[ARRAYIDX13_1_10]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_1_10:%.*]] = add nsw i32 [[TMP113]], [[TMP112]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_10]], ptr [[ARRAYIDX13_1_10]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_10:%.*]] -; CHECK-UNROLL: for.body7.2.10: -; CHECK-UNROLL-NEXT: [[TMP114:%.*]] = load ptr, ptr [[ARRAYIDX_10]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_10:%.*]] = getelementptr inbounds i32, ptr [[TMP114]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP115:%.*]] = load i32, ptr [[ARRAYIDX11_2_10]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_10:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP116:%.*]] = load i32, ptr [[ARRAYIDX13_2_10]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_2_10:%.*]] = add nsw i32 [[TMP116]], [[TMP115]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_10]], ptr [[ARRAYIDX13_2_10]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_10:%.*]] -; CHECK-UNROLL: for.body7.3.10: -; CHECK-UNROLL-NEXT: [[TMP117:%.*]] = load ptr, ptr [[ARRAYIDX_10]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_10:%.*]] = getelementptr inbounds i32, ptr [[TMP117]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP118:%.*]] = load i32, ptr [[ARRAYIDX11_3_10]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_10:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP119:%.*]] = load i32, ptr [[ARRAYIDX13_3_10]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_3_10:%.*]] = add nsw i32 [[TMP119]], [[TMP118]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_10]], ptr [[ARRAYIDX13_3_10]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_10:%.*]] -; CHECK-UNROLL: for.cond.cleanup6.10: -; CHECK-UNROLL-NEXT: [[CMP1_11:%.*]] = icmp eq i32 11, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1_11]], label [[CLEANUP]], label [[IF_END_11:%.*]] -; CHECK-UNROLL: if.end.11: -; CHECK-UNROLL-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 11 -; CHECK-UNROLL-NEXT: br label [[FOR_COND4_11:%.*]] -; CHECK-UNROLL: for.cond4.11: -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_11:%.*]] -; CHECK-UNROLL: for.body7.11: -; CHECK-UNROLL-NEXT: [[TMP120:%.*]] = load ptr, ptr [[ARRAYIDX_11]], align 8 -; CHECK-UNROLL-NEXT: [[TMP121:%.*]] = load i32, ptr [[TMP120]], align 4 -; CHECK-UNROLL-NEXT: [[TMP122:%.*]] = load i32, ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_119:%.*]] = add nsw i32 [[TMP122]], [[TMP121]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_119]], ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_11:%.*]] -; CHECK-UNROLL: for.body7.1.11: -; CHECK-UNROLL-NEXT: [[TMP123:%.*]] = load ptr, ptr [[ARRAYIDX_11]], align 8 -; 
CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_11:%.*]] = getelementptr inbounds i32, ptr [[TMP123]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP124:%.*]] = load i32, ptr [[ARRAYIDX11_1_11]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_11:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP125:%.*]] = load i32, ptr [[ARRAYIDX13_1_11]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_1_11:%.*]] = add nsw i32 [[TMP125]], [[TMP124]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_11]], ptr [[ARRAYIDX13_1_11]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_11:%.*]] -; CHECK-UNROLL: for.body7.2.11: -; CHECK-UNROLL-NEXT: [[TMP126:%.*]] = load ptr, ptr [[ARRAYIDX_11]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_11:%.*]] = getelementptr inbounds i32, ptr [[TMP126]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP127:%.*]] = load i32, ptr [[ARRAYIDX11_2_11]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_11:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP128:%.*]] = load i32, ptr [[ARRAYIDX13_2_11]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_2_11:%.*]] = add nsw i32 [[TMP128]], [[TMP127]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_11]], ptr [[ARRAYIDX13_2_11]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_11:%.*]] -; CHECK-UNROLL: for.body7.3.11: -; CHECK-UNROLL-NEXT: [[TMP129:%.*]] = load ptr, ptr [[ARRAYIDX_11]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_11:%.*]] = getelementptr inbounds i32, ptr [[TMP129]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP130:%.*]] = load i32, ptr [[ARRAYIDX11_3_11]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_11:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP131:%.*]] = load i32, ptr [[ARRAYIDX13_3_11]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_3_11:%.*]] = add nsw i32 [[TMP131]], [[TMP130]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_11]], ptr [[ARRAYIDX13_3_11]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_11:%.*]] -; CHECK-UNROLL: for.cond.cleanup6.11: -; CHECK-UNROLL-NEXT: [[CMP1_12:%.*]] = icmp eq i32 12, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1_12]], label [[CLEANUP]], label [[IF_END_12:%.*]] -; CHECK-UNROLL: if.end.12: -; CHECK-UNROLL-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 12 -; CHECK-UNROLL-NEXT: br label [[FOR_COND4_12:%.*]] -; CHECK-UNROLL: for.cond4.12: -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1210:%.*]] -; CHECK-UNROLL: for.body7.1210: -; CHECK-UNROLL-NEXT: [[TMP132:%.*]] = load ptr, ptr [[ARRAYIDX_12]], align 8 -; CHECK-UNROLL-NEXT: [[TMP133:%.*]] = load i32, ptr [[TMP132]], align 4 -; CHECK-UNROLL-NEXT: [[TMP134:%.*]] = load i32, ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_12:%.*]] = add nsw i32 [[TMP134]], [[TMP133]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_12]], ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_12:%.*]] -; CHECK-UNROLL: for.body7.1.12: -; CHECK-UNROLL-NEXT: [[TMP135:%.*]] = load ptr, ptr [[ARRAYIDX_12]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_12:%.*]] = getelementptr inbounds i32, ptr [[TMP135]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP136:%.*]] = load i32, ptr [[ARRAYIDX11_1_12]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_12:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP137:%.*]] = load i32, ptr [[ARRAYIDX13_1_12]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_1_12:%.*]] = add nsw 
i32 [[TMP137]], [[TMP136]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_12]], ptr [[ARRAYIDX13_1_12]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_12:%.*]] -; CHECK-UNROLL: for.body7.2.12: -; CHECK-UNROLL-NEXT: [[TMP138:%.*]] = load ptr, ptr [[ARRAYIDX_12]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_12:%.*]] = getelementptr inbounds i32, ptr [[TMP138]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP139:%.*]] = load i32, ptr [[ARRAYIDX11_2_12]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_12:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP140:%.*]] = load i32, ptr [[ARRAYIDX13_2_12]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_2_12:%.*]] = add nsw i32 [[TMP140]], [[TMP139]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_12]], ptr [[ARRAYIDX13_2_12]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_12:%.*]] -; CHECK-UNROLL: for.body7.3.12: -; CHECK-UNROLL-NEXT: [[TMP141:%.*]] = load ptr, ptr [[ARRAYIDX_12]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_12:%.*]] = getelementptr inbounds i32, ptr [[TMP141]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP142:%.*]] = load i32, ptr [[ARRAYIDX11_3_12]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_12:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP143:%.*]] = load i32, ptr [[ARRAYIDX13_3_12]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_3_12:%.*]] = add nsw i32 [[TMP143]], [[TMP142]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_12]], ptr [[ARRAYIDX13_3_12]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_12:%.*]] -; CHECK-UNROLL: for.cond.cleanup6.12: -; CHECK-UNROLL-NEXT: [[CMP1_13:%.*]] = icmp eq i32 13, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1_13]], label [[CLEANUP]], label [[IF_END_13:%.*]] -; CHECK-UNROLL: if.end.13: -; CHECK-UNROLL-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 13 -; CHECK-UNROLL-NEXT: br label [[FOR_COND4_13:%.*]] -; CHECK-UNROLL: for.cond4.13: -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_13:%.*]] -; CHECK-UNROLL: for.body7.13: -; CHECK-UNROLL-NEXT: [[TMP144:%.*]] = load ptr, ptr [[ARRAYIDX_13]], align 8 -; CHECK-UNROLL-NEXT: [[TMP145:%.*]] = load i32, ptr [[TMP144]], align 4 -; CHECK-UNROLL-NEXT: [[TMP146:%.*]] = load i32, ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_13:%.*]] = add nsw i32 [[TMP146]], [[TMP145]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_13]], ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_13:%.*]] -; CHECK-UNROLL: for.body7.1.13: -; CHECK-UNROLL-NEXT: [[TMP147:%.*]] = load ptr, ptr [[ARRAYIDX_13]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_13:%.*]] = getelementptr inbounds i32, ptr [[TMP147]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP148:%.*]] = load i32, ptr [[ARRAYIDX11_1_13]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_13:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP149:%.*]] = load i32, ptr [[ARRAYIDX13_1_13]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_1_13:%.*]] = add nsw i32 [[TMP149]], [[TMP148]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_13]], ptr [[ARRAYIDX13_1_13]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_13:%.*]] -; CHECK-UNROLL: for.body7.2.13: -; CHECK-UNROLL-NEXT: [[TMP150:%.*]] = load ptr, ptr [[ARRAYIDX_13]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_13:%.*]] = getelementptr inbounds i32, ptr [[TMP150]], 
i64 2 -; CHECK-UNROLL-NEXT: [[TMP151:%.*]] = load i32, ptr [[ARRAYIDX11_2_13]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_13:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP152:%.*]] = load i32, ptr [[ARRAYIDX13_2_13]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_2_13:%.*]] = add nsw i32 [[TMP152]], [[TMP151]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_13]], ptr [[ARRAYIDX13_2_13]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_13:%.*]] -; CHECK-UNROLL: for.body7.3.13: -; CHECK-UNROLL-NEXT: [[TMP153:%.*]] = load ptr, ptr [[ARRAYIDX_13]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_13:%.*]] = getelementptr inbounds i32, ptr [[TMP153]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP154:%.*]] = load i32, ptr [[ARRAYIDX11_3_13]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_13:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP155:%.*]] = load i32, ptr [[ARRAYIDX13_3_13]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_3_13:%.*]] = add nsw i32 [[TMP155]], [[TMP154]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_13]], ptr [[ARRAYIDX13_3_13]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_13:%.*]] -; CHECK-UNROLL: for.cond.cleanup6.13: -; CHECK-UNROLL-NEXT: [[CMP1_14:%.*]] = icmp eq i32 14, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1_14]], label [[CLEANUP]], label [[IF_END_14:%.*]] -; CHECK-UNROLL: if.end.14: -; CHECK-UNROLL-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 14 -; CHECK-UNROLL-NEXT: br label [[FOR_COND4_14:%.*]] -; CHECK-UNROLL: for.cond4.14: -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_14:%.*]] -; CHECK-UNROLL: for.body7.14: -; CHECK-UNROLL-NEXT: [[TMP156:%.*]] = load ptr, ptr [[ARRAYIDX_14]], align 8 -; CHECK-UNROLL-NEXT: [[TMP157:%.*]] = load i32, ptr [[TMP156]], align 4 -; CHECK-UNROLL-NEXT: [[TMP158:%.*]] = load i32, ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_14:%.*]] = add nsw i32 [[TMP158]], [[TMP157]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_14]], ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_14:%.*]] -; CHECK-UNROLL: for.body7.1.14: -; CHECK-UNROLL-NEXT: [[TMP159:%.*]] = load ptr, ptr [[ARRAYIDX_14]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_14:%.*]] = getelementptr inbounds i32, ptr [[TMP159]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP160:%.*]] = load i32, ptr [[ARRAYIDX11_1_14]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_14:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP161:%.*]] = load i32, ptr [[ARRAYIDX13_1_14]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_1_14:%.*]] = add nsw i32 [[TMP161]], [[TMP160]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_14]], ptr [[ARRAYIDX13_1_14]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_14:%.*]] -; CHECK-UNROLL: for.body7.2.14: -; CHECK-UNROLL-NEXT: [[TMP162:%.*]] = load ptr, ptr [[ARRAYIDX_14]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_14:%.*]] = getelementptr inbounds i32, ptr [[TMP162]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP163:%.*]] = load i32, ptr [[ARRAYIDX11_2_14]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_14:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP164:%.*]] = load i32, ptr [[ARRAYIDX13_2_14]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_2_14:%.*]] = add nsw i32 [[TMP164]], [[TMP163]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_14]], ptr 
[[ARRAYIDX13_2_14]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_14:%.*]] -; CHECK-UNROLL: for.body7.3.14: -; CHECK-UNROLL-NEXT: [[TMP165:%.*]] = load ptr, ptr [[ARRAYIDX_14]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_14:%.*]] = getelementptr inbounds i32, ptr [[TMP165]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP166:%.*]] = load i32, ptr [[ARRAYIDX11_3_14]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_14:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP167:%.*]] = load i32, ptr [[ARRAYIDX13_3_14]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_3_14:%.*]] = add nsw i32 [[TMP167]], [[TMP166]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_14]], ptr [[ARRAYIDX13_3_14]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_14:%.*]] -; CHECK-UNROLL: for.cond.cleanup6.14: -; CHECK-UNROLL-NEXT: [[CMP1_15:%.*]] = icmp eq i32 15, [[DIMS]] -; CHECK-UNROLL-NEXT: br i1 [[CMP1_15]], label [[CLEANUP]], label [[IF_END_15:%.*]] -; CHECK-UNROLL: if.end.15: -; CHECK-UNROLL-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 15 -; CHECK-UNROLL-NEXT: br label [[FOR_COND4_15:%.*]] -; CHECK-UNROLL: for.cond4.15: -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_15:%.*]] -; CHECK-UNROLL: for.body7.15: -; CHECK-UNROLL-NEXT: [[TMP168:%.*]] = load ptr, ptr [[ARRAYIDX_15]], align 8 -; CHECK-UNROLL-NEXT: [[TMP169:%.*]] = load i32, ptr [[TMP168]], align 4 -; CHECK-UNROLL-NEXT: [[TMP170:%.*]] = load i32, ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_15:%.*]] = add nsw i32 [[TMP170]], [[TMP169]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_15]], ptr [[OUT]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_15:%.*]] -; CHECK-UNROLL: for.body7.1.15: -; CHECK-UNROLL-NEXT: [[TMP171:%.*]] = load ptr, ptr [[ARRAYIDX_15]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_15:%.*]] = getelementptr inbounds i32, ptr [[TMP171]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP172:%.*]] = load i32, ptr [[ARRAYIDX11_1_15]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_15:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1 -; CHECK-UNROLL-NEXT: [[TMP173:%.*]] = load i32, ptr [[ARRAYIDX13_1_15]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_1_15:%.*]] = add nsw i32 [[TMP173]], [[TMP172]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_15]], ptr [[ARRAYIDX13_1_15]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_15:%.*]] -; CHECK-UNROLL: for.body7.2.15: -; CHECK-UNROLL-NEXT: [[TMP174:%.*]] = load ptr, ptr [[ARRAYIDX_15]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_15:%.*]] = getelementptr inbounds i32, ptr [[TMP174]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP175:%.*]] = load i32, ptr [[ARRAYIDX11_2_15]], align 4 -; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_15:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2 -; CHECK-UNROLL-NEXT: [[TMP176:%.*]] = load i32, ptr [[ARRAYIDX13_2_15]], align 4 -; CHECK-UNROLL-NEXT: [[ADD14_2_15:%.*]] = add nsw i32 [[TMP176]], [[TMP175]] -; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_15]], ptr [[ARRAYIDX13_2_15]], align 4 -; CHECK-UNROLL-NEXT: call void @_Z3barv() -; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_15:%.*]] -; CHECK-UNROLL: for.body7.3.15: -; CHECK-UNROLL-NEXT: [[TMP177:%.*]] = load ptr, ptr [[ARRAYIDX_15]], align 8 -; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_15:%.*]] = getelementptr inbounds i32, ptr [[TMP177]], i64 3 -; CHECK-UNROLL-NEXT: [[TMP178:%.*]] = load i32, ptr [[ARRAYIDX11_3_15]], 
align 4
-; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_15:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3
-; CHECK-UNROLL-NEXT: [[TMP179:%.*]] = load i32, ptr [[ARRAYIDX13_3_15]], align 4
-; CHECK-UNROLL-NEXT: [[ADD14_3_15:%.*]] = add nsw i32 [[TMP179]], [[TMP178]]
-; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_15]], ptr [[ARRAYIDX13_3_15]], align 4
-; CHECK-UNROLL-NEXT: call void @_Z3barv()
-; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_15:%.*]]
-; CHECK-UNROLL: for.cond.cleanup6.15:
-; CHECK-UNROLL-NEXT: br i1 true, label [[CLEANUP]], label [[IF_END_16:%.*]]
-; CHECK-UNROLL: if.end.16:
-; CHECK-UNROLL-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds ptr, ptr [[ARR]], i64 16
-; CHECK-UNROLL-NEXT: br label [[FOR_COND4_16:%.*]]
-; CHECK-UNROLL: for.cond4.16:
-; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_16:%.*]]
-; CHECK-UNROLL: for.body7.16:
-; CHECK-UNROLL-NEXT: [[TMP180:%.*]] = load ptr, ptr [[ARRAYIDX_16]], align 8
-; CHECK-UNROLL-NEXT: [[TMP181:%.*]] = load i32, ptr [[TMP180]], align 4
-; CHECK-UNROLL-NEXT: [[TMP182:%.*]] = load i32, ptr [[OUT]], align 4
-; CHECK-UNROLL-NEXT: [[ADD14_16:%.*]] = add nsw i32 [[TMP182]], [[TMP181]]
-; CHECK-UNROLL-NEXT: store i32 [[ADD14_16]], ptr [[OUT]], align 4
-; CHECK-UNROLL-NEXT: call void @_Z3barv()
-; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1_16:%.*]]
-; CHECK-UNROLL: for.body7.1.16:
-; CHECK-UNROLL-NEXT: [[TMP183:%.*]] = load ptr, ptr [[ARRAYIDX_16]], align 8
-; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1_16:%.*]] = getelementptr inbounds i32, ptr [[TMP183]], i64 1
-; CHECK-UNROLL-NEXT: [[TMP184:%.*]] = load i32, ptr [[ARRAYIDX11_1_16]], align 4
-; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1_16:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1
-; CHECK-UNROLL-NEXT: [[TMP185:%.*]] = load i32, ptr [[ARRAYIDX13_1_16]], align 4
-; CHECK-UNROLL-NEXT: [[ADD14_1_16:%.*]] = add nsw i32 [[TMP185]], [[TMP184]]
-; CHECK-UNROLL-NEXT: store i32 [[ADD14_1_16]], ptr [[ARRAYIDX13_1_16]], align 4
-; CHECK-UNROLL-NEXT: call void @_Z3barv()
-; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2_16:%.*]]
-; CHECK-UNROLL: for.body7.2.16:
-; CHECK-UNROLL-NEXT: [[TMP186:%.*]] = load ptr, ptr [[ARRAYIDX_16]], align 8
-; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2_16:%.*]] = getelementptr inbounds i32, ptr [[TMP186]], i64 2
-; CHECK-UNROLL-NEXT: [[TMP187:%.*]] = load i32, ptr [[ARRAYIDX11_2_16]], align 4
-; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2_16:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2
-; CHECK-UNROLL-NEXT: [[TMP188:%.*]] = load i32, ptr [[ARRAYIDX13_2_16]], align 4
-; CHECK-UNROLL-NEXT: [[ADD14_2_16:%.*]] = add nsw i32 [[TMP188]], [[TMP187]]
-; CHECK-UNROLL-NEXT: store i32 [[ADD14_2_16]], ptr [[ARRAYIDX13_2_16]], align 4
-; CHECK-UNROLL-NEXT: call void @_Z3barv()
-; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3_16:%.*]]
-; CHECK-UNROLL: for.body7.3.16:
-; CHECK-UNROLL-NEXT: [[TMP189:%.*]] = load ptr, ptr [[ARRAYIDX_16]], align 8
-; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3_16:%.*]] = getelementptr inbounds i32, ptr [[TMP189]], i64 3
-; CHECK-UNROLL-NEXT: [[TMP190:%.*]] = load i32, ptr [[ARRAYIDX11_3_16]], align 4
-; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3_16:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3
-; CHECK-UNROLL-NEXT: [[TMP191:%.*]] = load i32, ptr [[ARRAYIDX13_3_16]], align 4
-; CHECK-UNROLL-NEXT: [[ADD14_3_16:%.*]] = add nsw i32 [[TMP191]], [[TMP190]]
-; CHECK-UNROLL-NEXT: store i32 [[ADD14_3_16]], ptr [[ARRAYIDX13_3_16]], align 4
-; CHECK-UNROLL-NEXT: call void @_Z3barv()
-; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6_16:%.*]]
-; CHECK-UNROLL: for.cond.cleanup6.16:
-; CHECK-UNROLL-NEXT: unreachable
-; CHECK-UNROLL: for.body7:
-; CHECK-UNROLL-NEXT: [[TMP192:%.*]] = load ptr, ptr [[ARR]], align 8
-; CHECK-UNROLL-NEXT: [[TMP193:%.*]] = load i32, ptr [[TMP192]], align 4
-; CHECK-UNROLL-NEXT: [[TMP194:%.*]] = load i32, ptr [[OUT]], align 4
-; CHECK-UNROLL-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP194]], [[TMP193]]
-; CHECK-UNROLL-NEXT: store i32 [[ADD14]], ptr [[OUT]], align 4
-; CHECK-UNROLL-NEXT: call void @_Z3barv()
-; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_1:%.*]]
-; CHECK-UNROLL: for.body7.1:
-; CHECK-UNROLL-NEXT: [[TMP195:%.*]] = load ptr, ptr [[ARR]], align 8
-; CHECK-UNROLL-NEXT: [[ARRAYIDX11_1:%.*]] = getelementptr inbounds i32, ptr [[TMP195]], i64 1
-; CHECK-UNROLL-NEXT: [[TMP196:%.*]] = load i32, ptr [[ARRAYIDX11_1]], align 4
-; CHECK-UNROLL-NEXT: [[ARRAYIDX13_1:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 1
-; CHECK-UNROLL-NEXT: [[TMP197:%.*]] = load i32, ptr [[ARRAYIDX13_1]], align 4
-; CHECK-UNROLL-NEXT: [[ADD14_1:%.*]] = add nsw i32 [[TMP197]], [[TMP196]]
-; CHECK-UNROLL-NEXT: store i32 [[ADD14_1]], ptr [[ARRAYIDX13_1]], align 4
-; CHECK-UNROLL-NEXT: call void @_Z3barv()
-; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_2:%.*]]
-; CHECK-UNROLL: for.body7.2:
-; CHECK-UNROLL-NEXT: [[TMP198:%.*]] = load ptr, ptr [[ARR]], align 8
-; CHECK-UNROLL-NEXT: [[ARRAYIDX11_2:%.*]] = getelementptr inbounds i32, ptr [[TMP198]], i64 2
-; CHECK-UNROLL-NEXT: [[TMP199:%.*]] = load i32, ptr [[ARRAYIDX11_2]], align 4
-; CHECK-UNROLL-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 2
-; CHECK-UNROLL-NEXT: [[TMP200:%.*]] = load i32, ptr [[ARRAYIDX13_2]], align 4
-; CHECK-UNROLL-NEXT: [[ADD14_2:%.*]] = add nsw i32 [[TMP200]], [[TMP199]]
-; CHECK-UNROLL-NEXT: store i32 [[ADD14_2]], ptr [[ARRAYIDX13_2]], align 4
-; CHECK-UNROLL-NEXT: call void @_Z3barv()
-; CHECK-UNROLL-NEXT: br label [[FOR_BODY7_3:%.*]]
-; CHECK-UNROLL: for.body7.3:
-; CHECK-UNROLL-NEXT: [[TMP201:%.*]] = load ptr, ptr [[ARR]], align 8
-; CHECK-UNROLL-NEXT: [[ARRAYIDX11_3:%.*]] = getelementptr inbounds i32, ptr [[TMP201]], i64 3
-; CHECK-UNROLL-NEXT: [[TMP202:%.*]] = load i32, ptr [[ARRAYIDX11_3]], align 4
-; CHECK-UNROLL-NEXT: [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 3
-; CHECK-UNROLL-NEXT: [[TMP203:%.*]] = load i32, ptr [[ARRAYIDX13_3]], align 4
-; CHECK-UNROLL-NEXT: [[ADD14_3:%.*]] = add nsw i32 [[TMP203]], [[TMP202]]
-; CHECK-UNROLL-NEXT: store i32 [[ADD14_3]], ptr [[ARRAYIDX13_3]], align 4
-; CHECK-UNROLL-NEXT: call void @_Z3barv()
-; CHECK-UNROLL-NEXT: br i1 false, label [[FOR_BODY7_4]], label [[FOR_COND_CLEANUP6:%.*]]
-; CHECK-UNROLL: for.body7.4:
-; CHECK-UNROLL-NEXT: [[ARRAYIDX_LCSSA:%.*]] = phi ptr [ [[ARR]], [[FOR_BODY7_3]] ], [ [[ARRAYIDX_1]], [[FOR_BODY7_3_1]] ], [ [[ARRAYIDX_2]], [[FOR_BODY7_3_2]] ], [ [[ARRAYIDX_3]], [[FOR_BODY7_3_3]] ], [ [[ARRAYIDX_4]], [[FOR_BODY7_3_4]] ], [ [[ARRAYIDX_5]], [[FOR_BODY7_3_5]] ], [ [[ARRAYIDX_6]], [[FOR_BODY7_3_6]] ], [ [[ARRAYIDX_7]], [[FOR_BODY7_3_7]] ], [ [[ARRAYIDX_8]], [[FOR_BODY7_3_8]] ], [ [[ARRAYIDX_9]], [[FOR_BODY7_3_9]] ], [ [[ARRAYIDX_10]], [[FOR_BODY7_3_10]] ], [ [[ARRAYIDX_11]], [[FOR_BODY7_3_11]] ], [ [[ARRAYIDX_12]], [[FOR_BODY7_3_12]] ], [ [[ARRAYIDX_13]], [[FOR_BODY7_3_13]] ], [ [[ARRAYIDX_14]], [[FOR_BODY7_3_14]] ], [ [[ARRAYIDX_15]], [[FOR_BODY7_3_15]] ], [ [[ARRAYIDX_16]], [[FOR_BODY7_3_16]] ]
-; CHECK-UNROLL-NEXT: [[TMP204:%.*]] = load ptr, ptr [[ARRAYIDX_LCSSA]], align 8
-; CHECK-UNROLL-NEXT: [[ARRAYIDX11_4:%.*]] = getelementptr inbounds i32, ptr [[TMP204]], i64 4
-; CHECK-UNROLL-NEXT: [[TMP205:%.*]] = load i32, ptr [[ARRAYIDX11_4]], align 4
-; CHECK-UNROLL-NEXT: [[ARRAYIDX13_4:%.*]] = getelementptr inbounds i32, ptr [[OUT]], i64 4
-; CHECK-UNROLL-NEXT: [[TMP206:%.*]] = load i32, ptr [[ARRAYIDX13_4]], align 4
-; CHECK-UNROLL-NEXT: [[ADD14_4:%.*]] = add nsw i32 [[TMP206]], [[TMP205]]
-; CHECK-UNROLL-NEXT: store i32 [[ADD14_4]], ptr [[ARRAYIDX13_4]], align 4
-; CHECK-UNROLL-NEXT: call void @_Z3barv()
-; CHECK-UNROLL-NEXT: unreachable
-; CHECK-UNROLL: cleanup:
-; CHECK-UNROLL-NEXT: ret void
-;
-entry:
- br label %for.cond
-
-for.cond: ; preds = %for.cond.cleanup6, %entry
- %Dim.0 = phi i32 [ 0, %entry ], [ %inc16, %for.cond.cleanup6 ]
- %Idx.addr.0 = phi i32 [ %Idx, %entry ], [ %add, %for.cond.cleanup6 ]
- %cmp = icmp slt i32 %Dim.0, 16
- br i1 %cmp, label %for.body, label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond
- br label %cleanup
-
-for.body: ; preds = %for.cond
- %cmp1 = icmp eq i32 %Dim.0, %Dims
- br i1 %cmp1, label %if.then, label %if.end
-
-if.then: ; preds = %for.body
- br label %cleanup
-
-if.end: ; preds = %for.body
- %idxprom = sext i32 %Dim.0 to i64
- %arrayidx = getelementptr inbounds ptr, ptr %Arr, i64 %idxprom
- %0 = load ptr, ptr %arrayidx, align 8
- %idxprom2 = sext i32 %Idx.addr.0 to i64
- %arrayidx3 = getelementptr inbounds i32, ptr %0, i64 %idxprom2
- %1 = load i32, ptr %arrayidx3, align 4
- %add = add nsw i32 %1, 1
- br label %for.cond4
-
-for.cond4: ; preds = %for.body7, %if.end
- %arg.0 = phi i32 [ 0, %if.end ], [ %inc, %for.body7 ]
- %cmp5 = icmp slt i32 %arg.0, 4
- br i1 %cmp5, label %for.body7, label %for.cond.cleanup6
-
-for.cond.cleanup6: ; preds = %for.cond4
- %inc16 = add nsw i32 %Dim.0, 1
- br label %for.cond, !llvm.loop !0
-
-for.body7: ; preds = %for.cond4
- %2 = load ptr, ptr %arrayidx, align 8
- %idxprom10 = sext i32 %arg.0 to i64
- %arrayidx11 = getelementptr inbounds i32, ptr %2, i64 %idxprom10
- %3 = load i32, ptr %arrayidx11, align 4
- %arrayidx13 = getelementptr inbounds i32, ptr %Out, i64 %idxprom10
- %4 = load i32, ptr %arrayidx13, align 4
- %add14 = add nsw i32 %4, %3
- store i32 %add14, ptr %arrayidx13, align 4
- call void @_Z3barv()
- %inc = add nsw i32 %arg.0, 1
- br label %for.cond4, !llvm.loop !3
-
-cleanup: ; preds = %if.then, %for.cond.cleanup
- ret void
-}
-
- declare void @_Z3barv()
-
-!0 = distinct !{!0, !1, !2}
-!1 = !{!"llvm.loop.mustprogress"}
-!2 = !{!"llvm.loop.unroll.enable"}
-!3 = distinct !{!3, !1}
-;.
-; CHECK-CFG: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK-CFG: [[META1]] = !{!"llvm.loop.mustprogress"}
-; CHECK-CFG: [[META2]] = !{!"llvm.loop.unroll.enable"}
-; CHECK-CFG: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
-;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
index 8c50d86489c9d..6b5d69d100dde 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
@@ -18,16 +18,16 @@ define void @f1(ptr %A) #0 {
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), ptr [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: store shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
index 763b3e0bc8293..98d55ae15c077 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/gather-do-not-vectorize-addressing.ll
@@ -54,8 +54,6 @@ define dso_local double @test(ptr nocapture noundef readonly %data, ptr nocaptur
 ; SVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
 ; SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; SVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; SVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
 ; SVE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SVE: vector.body:
 ; SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -68,6 +66,8 @@ define dso_local double @test(ptr nocapture noundef readonly %data, ptr nocaptur
 ; SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DATA:%.*]], [[TMP7]]
 ; SVE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( [[TMP8]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison)
 ; SVE-NEXT: [[TMP9]] = fadd [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
+; SVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; SVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
 ; SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SVE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SVE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 8e2efd86c1f26..28962dfba8924 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -19,8 +19,6 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
-; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFNONE: vector.body:
 ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -29,6 +27,8 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT: [[TMP5:%.*]] = call @foo_vector( [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer))
 ; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 8
+; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; TFNONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -61,8 +61,6 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TFCOMMON-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
-; TFCOMMON-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; TFCOMMON-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; TFCOMMON-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFCOMMON: vector.body:
 ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -72,6 +70,8 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]])
 ; TFCOMMON-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP6]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]])
+; TFCOMMON-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; TFCOMMON-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFCOMMON-NEXT: [[TMP10:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
@@ -111,8 +111,6 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
-; TFNONE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFNONE: vector.body:
 ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -124,6 +122,8 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], zeroinitializer, [[TMP6]]
 ; TFNONE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
 ; TFNONE-NEXT: store [[PREDPHI]], ptr [[TMP8]], align 8
+; TFNONE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
 ; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
 ; TFNONE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -162,8 +162,6 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TFCOMMON-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
-; TFCOMMON-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; TFCOMMON-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
 ; TFCOMMON-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFCOMMON: vector.body:
 ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -179,6 +177,8 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
 ; TFCOMMON-NEXT: [[TMP12:%.*]] = or [[TMP7]], [[TMP10]]
 ; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP11]], i32 8, [[TMP12]])
+; TFCOMMON-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; TFCOMMON-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
 ; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFCOMMON-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
@@ -229,8 +229,6 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
-; TFNONE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFNONE: vector.body:
 ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -243,6 +241,8 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT: [[PREDPHI:%.*]] = select [[TMP6]], [[TMP7]], [[TMP8]]
 ; TFNONE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
 ; TFNONE-NEXT: store [[PREDPHI]], ptr [[TMP9]], align 8
+; TFNONE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
 ; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; TFNONE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -284,8 +284,6 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TFCOMMON-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
-; TFCOMMON-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; TFCOMMON-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
 ; TFCOMMON-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFCOMMON: vector.body:
 ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -302,6 +300,8 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 {
 ; TFCOMMON-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
 ; TFCOMMON-NEXT: [[TMP13:%.*]] = or [[TMP8]], [[TMP10]]
 ; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP12]], i32 8, [[TMP13]])
+; TFCOMMON-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; TFCOMMON-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
 ; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]]
 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFCOMMON-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
@@ -355,8 +355,6 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
-; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFNONE: vector.body:
 ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -365,6 +363,8 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]])
 ; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 8
+; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; TFNONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -413,8 +413,6 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFFALLBACK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
-; TFFALLBACK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; TFFALLBACK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFFALLBACK: vector.body:
 ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -423,6 +421,8 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFFALLBACK-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]])
 ; TFFALLBACK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; TFFALLBACK-NEXT: store [[TMP5]], ptr [[TMP6]], align 8
+; TFFALLBACK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; TFFALLBACK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; TFFALLBACK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFFALLBACK-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -475,8 +475,6 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
-; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFNONE: vector.body:
 ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -485,6 +483,8 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFNONE-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]])
 ; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 8
+; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; TFNONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -517,8 +517,6 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFALWAYS-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TFALWAYS-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
-; TFALWAYS-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; TFALWAYS-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; TFALWAYS-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFALWAYS: vector.body:
 ; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -528,6 +526,8 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFALWAYS-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]])
 ; TFALWAYS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP6]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]])
+; TFALWAYS-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; TFALWAYS-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
 ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFALWAYS-NEXT: [[TMP10:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
@@ -547,8 +547,6 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
-; TFFALLBACK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; TFFALLBACK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFFALLBACK: vector.body:
 ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -558,6 +556,8 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 {
 ; TFFALLBACK-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]])
 ; TFFALLBACK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP6]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]])
+; TFFALLBACK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; TFFALLBACK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
 ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFFALLBACK-NEXT: [[TMP10:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
@@ -601,8 +601,6 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double [[M:%.*]], i64 0
 ; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; TFNONE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; TFNONE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFNONE: vector.body:
 ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -615,6 +613,8 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFNONE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
 ; TFNONE-NEXT: store [[TMP7]], ptr [[TMP8]], align 8
 ; TFNONE-NEXT: [[TMP9]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], [[TMP5]])
+; TFNONE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; TFNONE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
 ; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; TFNONE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TFNONE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -654,8 +654,6 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFALWAYS-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double [[M:%.*]], i64 0
 ; TFALWAYS-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; TFALWAYS-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; TFALWAYS-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2
 ; TFALWAYS-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFALWAYS: vector.body:
 ; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -670,6 +668,8 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]])
 ; TFALWAYS-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], shufflevector ( insertelement ( poison, double -0.000000e+00, i64 0), poison, zeroinitializer)
 ; TFALWAYS-NEXT: [[TMP11]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], [[TMP10]])
+; TFALWAYS-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; TFALWAYS-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2
 ; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]]
 ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFALWAYS-NEXT: [[TMP14:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
@@ -691,8 +691,6 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
 ; TFFALLBACK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double [[M:%.*]], i64 0
 ; TFFALLBACK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; TFFALLBACK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; TFFALLBACK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2
 ; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TFFALLBACK: vector.body:
 ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -707,6 +705,8 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub
 ; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]])
 ; TFFALLBACK-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], shufflevector ( insertelement ( poison, double -0.000000e+00, i64 0), poison, zeroinitializer)
 ; TFFALLBACK-NEXT: [[TMP11]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], [[TMP10]])
+; TFFALLBACK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; TFFALLBACK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2
 ; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]]
 ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
 ; TFFALLBACK-NEXT: [[TMP14:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll
index 2fbbb7bb35c8a..ca3c935669e97 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll
@@ -27,8 +27,6 @@ define void @foo() {
 ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]]
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[OUTER_LOOP_LATCH4:%.*]] ]
@@ -51,6 +49,8 @@ define void @foo() {
 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0( [[VEC_PHI5]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer))
 ; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq [[TMP16]], shufflevector ( insertelement ( poison, i64 1024, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll
index 9bb9417398526..3f6d1b70f891c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr60831-sve-inv-store-crash.ll
@@ -17,28 +17,28 @@ define void @test_invar_gep(ptr %dst) #0 {
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64()
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = add zeroinitializer, [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP9:%.*]] = add [[DOTSPLAT]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], 4
-; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP16]], 1
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[TMP9]], i32 [[TMP17]]
-; CHECK-NEXT: store i64 [[TMP18]], ptr [[TMP14]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP5:%.*]] = add zeroinitializer, [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP7:%.*]] = add [[DOTSPLAT]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = sub i32 [[TMP14]], 1
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP7]], i32 [[TMP15]]
+; CHECK-NEXT: store i64 [[TMP16]], ptr [[TMP12]], align 1
+; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
index e070d87f83327..db97dc6796738 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
@@ -26,31 +26,31 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0
 ; CHECK-NEXT: [[IND_END:%.*]] = add i32 [[IDX]], [[N_VEC]]
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[IDX]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.experimental.stepvector.nxv2i32()
-; CHECK-NEXT: [[TMP9:%.*]] = add [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = mul [[TMP9]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add [[DOTSPLAT]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 2
-; CHECK-NEXT: [[TMP13:%.*]] = mul i32 1, [[TMP12]]
-; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP13]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv2i32()
+; CHECK-NEXT: [[TMP7:%.*]] = add [[TMP6]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add [[DOTSPLAT]], [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = mul i32 1, [[TMP10]]
+; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP11]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector [[DOTSPLATINSERT1]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[IDX]], [[INDEX]]
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[A:%.*]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[A:%.*]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[TMP14]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP15]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[TMP12]]
 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP16]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP17]], align 8
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i64, ptr [[B:%.*]], i32 [[TMP14]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr double, ptr [[TMP18]], i32 0
-; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP19]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP7]]
+; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP17]], align 8
+; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP19]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT2]]
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -58,7 +58,7 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0
 ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], 2
 ; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement [[TMP15]], i32 [[TMP23]]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement [[TMP13]], i32 [[TMP23]]
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[L_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
index 43220d582c4c8..52feff81914a3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reduction-inloop-cond.ll
@@ -14,24 +14,24 @@ define float @cond_fadd(ptr noalias nocapture readonly %a, ptr noalias nocapture
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[COND:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = fcmp une [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 2.000000e+00, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, [[TMP9]], poison)
-; CHECK-NEXT: [[TMP12:%.*]] = select fast [[TMP9]], [[WIDE_MASKED_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP12]])
-; CHECK-NEXT: [[TMP14]] = fadd fast float [[TMP13]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[COND:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = fcmp une [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 2.000000e+00, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, [[TMP7]], poison)
+; CHECK-NEXT: [[TMP10:%.*]] = select fast [[TMP7]], [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP10]])
+; CHECK-NEXT: [[TMP12]] = fadd fast float [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
@@ -39,7 +39,7 @@ define float @cond_fadd(ptr noalias nocapture readonly %a, ptr noalias nocapture
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -57,9 +57,9 @@ define float @cond_fadd(ptr noalias nocapture readonly %a, ptr noalias nocapture
 ; CHECK-NEXT: [[RES]] = phi float [ [[FADD]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
 ; CHECK-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: for.end:
-; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret float [[RES_LCSSA]]
 ;
 entry:
@@ -101,25 +101,25 @@ define float @cond_cmp_sel(ptr noalias %a, ptr noalias %cond, i64 %N) {
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[RDX_MINMAX_SELECT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[COND:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = fcmp une [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP11]], i32 4, [[TMP9]], poison)
-; CHECK-NEXT: [[TMP12:%.*]] = select fast [[TMP9]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float 0x7FF0000000000000, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32( [[TMP12]])
-; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP13]], [[VEC_PHI]]
-; CHECK-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP13]], float [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[COND:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = fcmp une [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP9]], i32 4, [[TMP7]], poison)
+; CHECK-NEXT: [[TMP10:%.*]] = select fast [[TMP7]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float 0x7FF0000000000000, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32( [[TMP10]])
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt float [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT]] = select fast i1 [[RDX_MINMAX_CMP]], float [[TMP11]], float [[VEC_PHI]]
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index c474a420ebcfe..2e994838ff241 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -42,8 +42,6 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED: vector.body:
 ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -53,6 +51,8 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
 ; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
 ; CHECK-UNORDERED-NEXT: [[TMP7]] = fadd [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
 ; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
 ; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -89,8 +89,6 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED: vector.body:
 ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -100,6 +98,8 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
 ; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
 ; CHECK-ORDERED-NEXT: [[TMP7]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[WIDE_LOAD]])
+; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
 ; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
 ; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-ORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -142,8 +142,6 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
-; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8
 ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED-TF: vector.body:
 ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -155,6 +153,8 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison)
 ; CHECK-ORDERED-TF-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP13]])
+; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 8
 ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP16]]
 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP9]])
 ; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
@@ -230,8 +230,6 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 32
 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED: vector.body:
 ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -277,6 +275,8 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-UNORDERED-NEXT: [[TMP35]] = fadd [[WIDE_LOAD4]], [[VEC_PHI1]]
 ; CHECK-UNORDERED-NEXT: [[TMP36]] = fadd [[WIDE_LOAD5]], [[VEC_PHI2]]
 ; CHECK-UNORDERED-NEXT: [[TMP37]] = fadd [[WIDE_LOAD6]], [[VEC_PHI3]]
+; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 32
 ; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP39]]
 ; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -316,8 +316,6 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 32
 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED: vector.body:
 ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -360,6 +358,8 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP34]], [[WIDE_LOAD1]])
 ; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP35]], [[WIDE_LOAD2]])
 ; CHECK-ORDERED-NEXT: [[TMP37]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP36]], [[WIDE_LOAD3]])
+; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = mul i64 [[TMP38]], 32
 ; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP39]]
 ; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-ORDERED-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -429,8 +429,6 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]])
-; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 32
 ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED-TF: vector.body:
 ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -481,6 +479,8 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP64]], [[TMP65]])
 ; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = select [[ACTIVE_LANE_MASK8]], [[WIDE_MASKED_LOAD11]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT: [[TMP68]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP66]], [[TMP67]])
+; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 32
 ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP70]]
 ; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = mul i64 [[TMP71]], 8
@@ -591,8 +591,6 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
 ; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[A2]], i32 0
 ; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = insertelement shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer), float [[A1]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED: vector.body:
 ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -608,6 +606,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1
 ; CHECK-UNORDERED-NEXT: [[TMP14]] = fadd [[TMP12]], [[VEC_PHI1]]
 ; CHECK-UNORDERED-NEXT: [[TMP15]] = fadd [[TMP13]], [[VEC_PHI]]
+; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
 ; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]]
 ; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -661,8 +661,6 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-ORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED: vector.body:
 ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -678,6 +676,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1
 ; CHECK-ORDERED-NEXT: [[TMP12]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP11]])
 ; CHECK-ORDERED-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP10]])
+; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
 ; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
 ; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -736,8 +736,6 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP2]], [[TMP9]]
 ; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]])
-; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
 ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED-TF: vector.body:
 ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -757,6 +755,8 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP18]])
 ; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP16]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], [[TMP20]])
+; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
 ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]]
 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP12]])
 ; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
@@ -867,8 +867,6 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]]
 ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED: vector.body:
 ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -882,6 +880,8 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP9]], align 4
 ; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = fadd [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-UNORDERED-NEXT: [[TMP11]] = fadd [[VEC_PHI]], [[TMP10]]
+; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
 ; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-UNORDERED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -929,8 +929,6 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]]
 ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED: vector.body:
 ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -944,6 +942,8 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP9]], align 4
 ; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = fadd [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-ORDERED-NEXT: [[TMP11]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP10]])
+; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
 ; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-ORDERED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -997,8 +997,6 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], [[TMP7]]
 ; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
-; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4
 ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED-TF: vector.body:
 ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1014,6 +1012,8 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-ORDERED-TF-NEXT: [[TMP16:%.*]] = fadd [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
 ; CHECK-ORDERED-TF-NEXT: [[TMP17:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP16]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer)
 ; CHECK-ORDERED-TF-NEXT: [[TMP18]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP17]])
+; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 4
 ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]]
 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP10]])
 ; CHECK-ORDERED-TF-NEXT: [[TMP21:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
@@ -1110,8 +1110,6 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-UNORDERED: vector.body:
 ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1127,6 +1125,8 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
 ; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]]
 ; CHECK-UNORDERED-NEXT: [[TMP11]] = fadd [[VEC_PHI]], [[PREDPHI]]
+; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64
@llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 ; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] ; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -1171,8 +1171,6 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1188,6 +1186,8 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer), [[WIDE_MASKED_LOAD]] ; CHECK-ORDERED-NEXT: [[TMP11]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[PREDPHI]]) +; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4 ; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] ; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -1238,8 +1238,6 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4 ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1260,6 +1258,8 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = or [[TMP15]], [[TMP18]] ; CHECK-ORDERED-TF-NEXT: [[TMP20:%.*]] = select [[TMP19]], [[PREDPHI]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP21]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP20]]) +; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4 ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP23]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-ORDERED-TF-NEXT: [[TMP24:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -1358,8 +1358,6 @@ define float 
@fadd_multiple(ptr noalias nocapture %a, ptr noalias nocapture %b, ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1373,6 +1371,8 @@ define float @fadd_multiple(ptr noalias nocapture %a, ptr noalias nocapture %b, ; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i32 0 ; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP10]] = fadd [[TMP7]], [[WIDE_LOAD1]] +; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 ; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]] ; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -1498,8 +1498,6 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-UNORDERED-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 32 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1563,6 +1561,8 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-UNORDERED-NEXT: [[TMP49]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD4]], [[WIDE_LOAD8]], [[VEC_PHI1]]) ; CHECK-UNORDERED-NEXT: [[TMP50]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD5]], [[WIDE_LOAD9]], [[VEC_PHI2]]) ; CHECK-UNORDERED-NEXT: [[TMP51]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD6]], [[WIDE_LOAD10]], [[VEC_PHI3]]) +; CHECK-UNORDERED-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 32 ; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP53]] ; CHECK-UNORDERED-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-UNORDERED-NEXT: br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] @@ -1604,8 +1604,6 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP57:%.*]] = mul i64 [[TMP56]], 32 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1670,6 +1668,8 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[TMP53:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP52]], [[TMP49]]) ; CHECK-ORDERED-NEXT: 
[[TMP54:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP53]], [[TMP50]]) ; CHECK-ORDERED-NEXT: [[TMP55]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP54]], [[TMP51]]) +; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP57:%.*]] = mul i64 [[TMP56]], 32 ; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP57]] ; CHECK-ORDERED-NEXT: [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP58]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -1741,8 +1741,6 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 32 ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1815,6 +1813,8 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP82]], [[TMP83]]) ; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP78]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP86]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], [[TMP85]]) +; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 32 ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP88]] ; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = mul i64 [[TMP89]], 8 @@ -1912,8 +1912,6 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-UNORDERED-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-UNORDERED-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 32 ; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-UNORDERED: vector.body: ; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1977,6 +1975,8 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-UNORDERED-NEXT: [[TMP49]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD4]], [[WIDE_LOAD8]], [[VEC_PHI1]]) ; CHECK-UNORDERED-NEXT: [[TMP50]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD5]], [[WIDE_LOAD9]], [[VEC_PHI2]]) ; CHECK-UNORDERED-NEXT: [[TMP51]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD6]], [[WIDE_LOAD10]], [[VEC_PHI3]]) +; CHECK-UNORDERED-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UNORDERED-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 32 ; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP53]] ; CHECK-UNORDERED-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; 
CHECK-UNORDERED-NEXT: br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -2018,8 +2018,6 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-NEXT: [[TMP57:%.*]] = mul i64 [[TMP56]], 32 ; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED: vector.body: ; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -2084,6 +2082,8 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[TMP53:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP52]], [[TMP49]]) ; CHECK-ORDERED-NEXT: [[TMP54:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP53]], [[TMP50]]) ; CHECK-ORDERED-NEXT: [[TMP55]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP54]], [[TMP51]]) +; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-NEXT: [[TMP57:%.*]] = mul i64 [[TMP56]], 32 ; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP57]] ; CHECK-ORDERED-NEXT: [[TMP58:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-ORDERED-NEXT: br i1 [[TMP58]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -2155,8 +2155,6 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) -; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 32 ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-ORDERED-TF: vector.body: ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -2229,6 +2227,8 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP84:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP82]], [[TMP83]]) ; CHECK-ORDERED-TF-NEXT: [[TMP85:%.*]] = select nnan [[ACTIVE_LANE_MASK8]], [[TMP78]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP86]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP84]], [[TMP85]]) +; CHECK-ORDERED-TF-NEXT: [[TMP87:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-ORDERED-TF-NEXT: [[TMP88:%.*]] = mul i64 [[TMP87]], 32 ; CHECK-ORDERED-TF-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP88]] ; CHECK-ORDERED-TF-NEXT: [[TMP89:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP90:%.*]] = mul i64 [[TMP89]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll index 95e9992f55cd1..b3de02799f513 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll @@ -15,19 +15,19 @@ define 
void @cond_inv_load_i32i32i16(ptr noalias nocapture %a, ptr noalias nocap
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[INV:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 2, <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i16> poison)
-; CHECK-NEXT: [[TMP7:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP7]], ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP6]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 2, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x i16> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP5]], ptr [[TMP6]], i32 4, <vscale x 4 x i1> [[TMP4]])
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -94,18 +94,18 @@ define void @cond_inv_load_f64f64f64(ptr noalias nocapture %a, ptr noalias nocap
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[INV:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x double>, ptr [[TMP5]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <vscale x 4 x double> [[WIDE_LOAD]], shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 4.000000e-01, i64 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 4 x i1> [[TMP6]], <vscale x 4 x double> poison)
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> [[WIDE_MASKED_GATHER]], ptr [[TMP7]], i32 8, <vscale x 4 x i1> [[TMP6]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[COND:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x double>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fcmp ogt <vscale x 4 x double> [[WIDE_LOAD]], shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 4.000000e-01, i64 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 8, <vscale x 4 x i1> [[TMP4]], <vscale x 4 x double> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f64.p0(<vscale x 4 x double> [[WIDE_MASKED_GATHER]], ptr [[TMP5]], i32 8, <vscale x 4 x i1> [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
@@ -168,24 +168,24 @@ define void @invariant_load_cond(ptr noalias nocapture %a, ptr nocapture readonl
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]]
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 42
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 42
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP3]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP9]], ptr [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP7]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP6]], i32 4, <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP5]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP7]], ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP5]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
index 982915fe5cc53..04af618014a79 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
@@ -19,8 +19,6 @@ define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) {
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -44,6 +42,8 @@ define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) {
; CHECK-NEXT: [[TMP17]] = and i64 [[TMP16]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> [[WIDE_LOAD3]])
; CHECK-NEXT: [[TMP19]] = and i64 [[TMP18]], [[VEC_PHI2]]
+; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
index f48933a76709d..f84e7c0ea182b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
@@ -19,8 +19,6 @@ define i64 @int_reduction_add(ptr %a, i64 %N) {
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -42,6 +40,8 @@ define i64 @int_reduction_add(ptr %a, i64 %N) {
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x i64>, ptr [[TMP15]], align 8
; CHECK-NEXT: [[TMP16]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP17]] = add <vscale x 2 x i64> [[WIDE_LOAD3]], [[VEC_PHI2]]
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
index fd3d8d09fb39d..23fd5fe5e908e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
@@ -19,29 +19,29 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP14]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP16]]
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[TMP18]], <vscale x 4 x float> [[WIDE_LOAD2]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP14]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP15]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT: [[TMP17]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[TMP16]], <vscale x 4 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -52,7 +52,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP19]], [[VEC_EPILOG_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ]
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i64 [[N]], 2
; CHECK-NEXT: [[N_VEC4:%.*]] = sub i64 [[N]], [[N_MOD_VF3]]
@@ -73,7 +73,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-NEXT: br i1 [[CMP_N5]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[ITER_CHECK]] ], [ [[TMP19]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi float [ 0xFFFFFFFFE0000000, [[ITER_CHECK]] ], [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -85,7 +85,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) {
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: for.end:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[ADD_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
index d5f41aa440e5b..6982b3f0943ce 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -35,26 +35,26 @@ define void @main_vf_vscale_x_16(ptr %A) #0 {
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP5]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 32
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-;
CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 16 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 1 -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 -; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP16]], align 1 -; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 16 -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 [[TMP18]] -; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP19]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0 +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 16 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[TMP16]] +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 32 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -72,16 +72,16 @@ define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP23]], 8 ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 1024, [[TMP24]] ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 1024, [[N_MOD_VF2]] -; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[INDEX5]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP28]], i32 0 -; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP29]], align 1 -; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], [[TMP26]] +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX5]], 0 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr 
[[TMP26]], i32 0 +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP27]], align 1 +; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 8 +; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], [[TMP29]] ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: @@ -113,26 +113,26 @@ define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-VF8-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-VF8-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-VF8-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 ; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-VF8: vector.body: ; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-VF8-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-VF8-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-VF8-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 -; CHECK-VF8-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 -; CHECK-VF8-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 -; CHECK-VF8-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] -; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP6]] -; CHECK-VF8-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] -; CHECK-VF8-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0 -; CHECK-VF8-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP14]], align 1 -; CHECK-VF8-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-VF8-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 16 -; CHECK-VF8-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[TMP16]] -; CHECK-VF8-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP17]], align 1 -; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-VF8-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF8-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; CHECK-VF8-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-VF8-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-VF8-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-VF8-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-VF8-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] +; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 +; CHECK-VF8-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP12]], align 1 +; CHECK-VF8-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 16 +; CHECK-VF8-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-VF8-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP15]], align 1 +; CHECK-VF8-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 32 +; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] ; CHECK-VF8-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 
[[N_VEC]] ; CHECK-VF8-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-VF8: middle.block: @@ -213,26 +213,26 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) { ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0 -; CHECK-NEXT: store shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer), ptr [[TMP14]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 [[TMP16]] -; CHECK-NEXT: store shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer), ptr [[TMP17]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer), ptr [[TMP12]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer), ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: @@ -282,26 +282,26 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) { ; CHECK-VF8-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-VF8-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-VF8-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; CHECK-VF8-NEXT: br 
label [[VECTOR_BODY:%.*]] ; CHECK-VF8: vector.body: ; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-VF8-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-VF8-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-VF8-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 -; CHECK-VF8-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 -; CHECK-VF8-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1 -; CHECK-VF8-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]] -; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]] -; CHECK-VF8-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] -; CHECK-VF8-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0 -; CHECK-VF8-NEXT: store shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer), ptr [[TMP14]], align 1 -; CHECK-VF8-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-VF8-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 -; CHECK-VF8-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 [[TMP16]] -; CHECK-VF8-NEXT: store shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer), ptr [[TMP17]], align 1 -; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-VF8-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF8-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 +; CHECK-VF8-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-VF8-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-VF8-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-VF8-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-VF8-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] +; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 +; CHECK-VF8-NEXT: store shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer), ptr [[TMP12]], align 1 +; CHECK-VF8-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2 +; CHECK-VF8-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-VF8-NEXT: store shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer), ptr [[TMP15]], align 1 +; CHECK-VF8-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 +; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] ; CHECK-VF8-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF8-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-VF8: middle.block: @@ -372,26 +372,26 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 10000, [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 10000, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 32 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[N_VEC]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = call i64 
@llvm.vscale.i64() -; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 16 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]] -; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 -; CHECK-NEXT: store zeroinitializer, ptr [[TMP13]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 16 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP15]] -; CHECK-NEXT: store zeroinitializer, ptr [[TMP16]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; CHECK-NEXT: store zeroinitializer, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP13]] +; CHECK-NEXT: store zeroinitializer, ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 32 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: @@ -412,16 +412,16 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i64 10000, [[TMP21]] ; CHECK-NEXT: [[N_VEC4:%.*]] = sub i64 10000, [[N_MOD_VF3]] ; CHECK-NEXT: [[IND_END6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC4]] -; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 8 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX10]], 0 -; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP24]] -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[NEXT_GEP11]], i32 0 -; CHECK-NEXT: store zeroinitializer, ptr [[TMP25]], align 1 -; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX10]], [[TMP23]] +; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX10]], 0 +; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[NEXT_GEP11]], i32 0 +; CHECK-NEXT: store zeroinitializer, ptr [[TMP23]], align 1 +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 8 +; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX10]], [[TMP25]] ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC4]] ; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: 
vec.epilog.middle.block: @@ -455,26 +455,26 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-VF8-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 ; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 10000, [[TMP3]] ; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 10000, [[N_MOD_VF]] -; CHECK-VF8-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-VF8-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 ; CHECK-VF8-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[N_VEC]] ; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-VF8: vector.body: ; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-VF8-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-VF8-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] -; CHECK-VF8-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-VF8-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 -; CHECK-VF8-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 0 -; CHECK-VF8-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP9]] -; CHECK-VF8-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP10]] -; CHECK-VF8-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 -; CHECK-VF8-NEXT: store zeroinitializer, ptr [[TMP11]], align 1 -; CHECK-VF8-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-VF8-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 -; CHECK-VF8-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP13]] -; CHECK-VF8-NEXT: store zeroinitializer, ptr [[TMP14]], align 1 -; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-VF8-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF8-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; CHECK-VF8-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 +; CHECK-VF8-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-VF8-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[TMP7]] +; CHECK-VF8-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP8]] +; CHECK-VF8-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; CHECK-VF8-NEXT: store zeroinitializer, ptr [[TMP9]], align 1 +; CHECK-VF8-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 +; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP11]] +; CHECK-VF8-NEXT: store zeroinitializer, ptr [[TMP12]], align 1 +; CHECK-VF8-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-VF8-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 32 +; CHECK-VF8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] ; CHECK-VF8-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-VF8-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-VF8: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll index 061b59f5442cd..995950e46f533 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll @@ -31,29 +31,29 @@ define void @fneg(ptr nocapture noundef writeonly %d, ptr nocapture noundef read ; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 16 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP7]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP9:%.*]] = mul 
i64 [[TMP8]], 16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds half, ptr [[S]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds half, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 2 -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds half, ptr [[TMP11]], i64 [[TMP14]] -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP15]], align 2 -; CHECK-NEXT: [[TMP16:%.*]] = fneg [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP17:%.*]] = fneg [[WIDE_LOAD3]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds half, ptr [[D]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds half, ptr [[TMP18]], i32 0 -; CHECK-NEXT: store [[TMP16]], ptr [[TMP19]], align 2 -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 8 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds half, ptr [[TMP18]], i64 [[TMP21]] -; CHECK-NEXT: store [[TMP17]], ptr [[TMP22]], align 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds half, ptr [[S]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds half, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 2 +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds half, ptr [[TMP9]], i64 [[TMP12]] +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP13]], align 2 +; CHECK-NEXT: [[TMP14:%.*]] = fneg [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP15:%.*]] = fneg [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds half, ptr [[D]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds half, ptr [[TMP16]], i32 0 +; CHECK-NEXT: store [[TMP14]], ptr [[TMP17]], align 2 +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds half, ptr [[TMP16]], i64 [[TMP19]] +; CHECK-NEXT: store [[TMP15]], ptr [[TMP20]], align 2 +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]] ; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll index 6747cc6fde17e..3a867cf401b33 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll @@ -13,18 +13,18 @@ define void @gather_nxv4i32_ind64(ptr noalias nocapture readonly %a, ptr noalias ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: 
[[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], [[WIDE_LOAD]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store [[WIDE_MASKED_GATHER]], ptr [[TMP7]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], [[WIDE_LOAD]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP4]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] +; CHECK-NEXT: store [[WIDE_MASKED_GATHER]], ptr [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -80,19 +80,19 @@ define void @scatter_nxv4i32_ind32(ptr noalias nocapture %a, ptr noalias nocaptu ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], [[TMP7]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0( [[WIDE_LOAD]], [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], [[TMP5]] +; CHECK-NEXT: call void 
@llvm.masked.scatter.nxv4f32.nxv4p0( [[WIDE_LOAD]], [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: @@ -149,16 +149,16 @@ define void @scatter_inv_nxv4i32(ptr noalias nocapture %inv, ptr noalias nocaptu ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[INV:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), [[BROADCAST_SPLAT]], i32 4, [[TMP6]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), [[BROADCAST_SPLAT]], i32 4, [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: @@ -219,17 +219,17 @@ define void @gather_inv_nxv4i32(ptr noalias nocapture %a, ptr noalias nocapture ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[INV:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], i32 4, [[TMP6]], poison) -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_GATHER]], ptr [[TMP5]], i32 
4, [[TMP6]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], i32 4, [[TMP4]], poison) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_GATHER]], ptr [[TMP3]], i32 4, [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: @@ -292,31 +292,31 @@ define void @gather_nxv4i32_ind64_stride2(ptr noalias nocapture %a, ptr noalias ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -8 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] -; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 3 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP8:%.*]] = shl [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP9:%.*]] = shl [[STEP_ADD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[B]], [[TMP9]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store [[WIDE_MASKED_GATHER]], ptr [[TMP12]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[TMP13]], 2 -; CHECK-NEXT: 
[[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP14]] -; CHECK-NEXT: store [[WIDE_MASKED_GATHER2]], ptr [[TMP15]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = shl [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP7:%.*]] = shl [[STEP_ADD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[B]], [[TMP7]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: store [[WIDE_MASKED_GATHER]], ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 2 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP12]] +; CHECK-NEXT: store [[WIDE_MASKED_GATHER2]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 3 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll index bfb27b5f7a83b..01657493e9d4a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll @@ -11,15 +11,11 @@ target triple = "aarch64-unknown-linux-gnu" define void @induction_i7(ptr %dst) #0 { ; CHECK-LABEL: @induction_i7( ; CHECK: vector.ph: -; CHECK: %ind.end = trunc i64 %n.vec to i7 -; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i8() +; CHECK: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i8() ; CHECK: [[TMP5:%.*]] = trunc [[TMP4]] to ; CHECK-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i7 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; CHECK: = shufflevecto -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] @@ -29,6 +25,8 @@ define void @induction_i7(ptr %dst) #0 { ; CHECK-NEXT: [[EXT:%.+]] = zext [[TMP11]] to ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0 ; CHECK-NEXT: store [[EXT]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: 
[[TMP16:%.*]] = mul i64 [[TMP15]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], ; @@ -59,15 +57,11 @@ for.end: ; preds = %for.body define void @induction_i3_zext(ptr %dst) #0 { ; CHECK-LABEL: @induction_i3_zext( ; CHECK: vector.ph: -; CHECK: %ind.end = trunc i64 %n.vec to i3 ; CHECK: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i8() ; CHECK: [[TMP5:%.*]] = trunc [[TMP4]] to ; CHECK-NEXT: [[TMP6:%.*]] = add [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = mul [[TMP6]], shufflevector ( insertelement ( poison, i3 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; CHECK: = shufflevecto -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ] @@ -76,6 +70,8 @@ define void @induction_i3_zext(ptr %dst) #0 { ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0 ; CHECK-NEXT: store [[TMP10]], ptr [[TMP13]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll index c8f17a5d66598..739010b5b631f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll @@ -23,26 +23,26 @@ define void @cond_ind64(ptr noalias nocapture %a, ptr noalias nocapture readonly ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 2 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = trunc [[VEC_IND]] to -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP10]], i32 4, [[TMP9]], poison) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( 
[[WIDE_MASKED_LOAD]], ptr [[TMP11]], i32 4, [[TMP9]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = trunc [[VEC_IND]] to +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP8]], i32 4, [[TMP7]], poison) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_LOAD]], ptr [[TMP10]], i32 4, [[TMP7]]) +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[TMP12]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -56,14 +56,14 @@ define void @cond_ind64(ptr noalias nocapture %a, ptr noalias nocapture readonly ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_08]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_08]] -; CHECK-NEXT: store i32 [[TMP13]], ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index 7796b80eb4e63..0b5058cff8d5c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -30,8 +30,6 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -48,6 +46,8 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 { ; 
CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 -1 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[TMP4]], [[TMP5]]) ; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -116,8 +116,6 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -136,6 +134,8 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 { ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 -1 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[TMP8]], [[TMP10]]) ; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 @@ -207,8 +207,6 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -228,6 +226,8 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 { ; CHECK-NEXT: [[TMP12:%.*]] = trunc [[TMP11]] to ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, [[TMP7]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP12]], [[TMP13]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 @@ -282,8 +282,6 @@ define i32 @test_struct_load6(%struct.ST6* %S) #1 { ; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP2]], i64 0 ; CHECK-NEXT: 
[[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -307,6 +305,8 @@ define i32 @test_struct_load6(%struct.ST6* %S) #1 { ; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP11]], [[WIDE_MASKED_GATHER4]] ; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP12]], [[WIDE_MASKED_GATHER5]] ; CHECK-NEXT: [[TMP14]] = sub [[TMP10]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -387,8 +387,6 @@ define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i32 [[TMP1]], -4 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[DOTNEG]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[TMP18]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -418,6 +416,8 @@ define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly ; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP11]]) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[REVERSE2]], [[REVERSE3]]) ; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP17]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[TMP18]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -475,8 +475,6 @@ define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalia ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[TMP1]] ; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -489,6 +487,8 @@ define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalia ; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[INDEX]], 9223372036854775804 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]] ; CHECK-NEXT: store [[TMP4]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -561,8 +561,6 @@ define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noali 
; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP6]], i64 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP9]] ; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -575,6 +573,8 @@ define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noali ; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[INDEX]], 9223372036854775804 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP13]] ; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] @@ -648,8 +648,6 @@ define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noal ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -661,6 +659,8 @@ define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noal ; CHECK-NEXT: [[TMP5:%.*]] = sub nsw [[WIDE_MASKED_GATHER]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0( [[TMP2]], [[TMP3]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0( [[TMP5]], [[TMP4]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -709,8 +709,6 @@ define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -730,6 +728,8 @@ define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 -1 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[TMP4]], [[TMP7]]) ; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; 
CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -802,8 +802,6 @@ define void @int_float_struct(%struct.IntFloat* nocapture readonly %p) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -817,6 +815,8 @@ define void @int_float_struct(%struct.IntFloat* nocapture readonly %p) #0 { ; CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to ; CHECK-NEXT: [[TMP4]] = add [[TMP1]], [[VEC_PHI1]] ; CHECK-NEXT: [[TMP5]] = fadd fast [[VEC_PHI]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] @@ -899,8 +899,6 @@ define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -913,6 +911,8 @@ define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP13]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -984,8 +984,6 @@ define i32 @PR27626_1(%pair.i32 *%p, i64 %n) #1 { ; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 2 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1002,6 +1000,8 @@ define i32 @PR27626_1(%pair.i32 *%p, i64 %n) #1 { ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC1]]) 
; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 ; CHECK-NEXT: [[TMP15]] = add [[TMP14]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1081,8 +1081,6 @@ define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1095,6 +1093,8 @@ define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP13]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1169,8 +1169,6 @@ define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 2 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[TMP17]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1188,6 +1186,8 @@ define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC1]]) ; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 ; CHECK-NEXT: [[TMP16]] = add [[TMP15]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[TMP17]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1279,8 +1279,6 @@ define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 
[[TMP14]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1293,6 +1291,8 @@ define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 -1 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT4]]) ; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1376,8 +1376,6 @@ define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1390,6 +1388,8 @@ define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT2]], [[TMP15]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT4]], [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1483,8 +1483,6 @@ define void @PR34743(i16* %a, i32* %b, i64 %n) #1 { ; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 3 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP15]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP28:%.*]] = shl nuw nsw i64 [[TMP27]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -1503,7 +1501,9 @@ define void @PR34743(i16* %a, i32* %b, i64 %n) #1 { ; CHECK-NEXT: [[TMP24:%.*]] = mul nsw [[TMP22]], [[TMP19]] ; CHECK-NEXT: [[TMP25:%.*]] = mul nsw [[TMP24]], [[TMP23]] ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP25]], ptr [[TMP26]], align 4, !alias.scope [[META37:![0-9]+]], !noalias [[META34]] +; CHECK-NEXT: store [[TMP25]], ptr [[TMP26]], align 4, !alias.scope 
[[META37:![0-9]+]], !noalias [[META34]] +; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP28:%.*]] = shl nuw nsw i64 [[TMP27]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP28]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index dd912e1571918..56e2fe6e15b70 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -42,8 +42,6 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4 ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -66,6 +64,8 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP14]], [[TMP15]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) ; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4 ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP20]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -122,8 +122,6 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -148,6 +146,8 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i8( [[TMP14]], [[TMP15]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.experimental.vector.interleave2.nxv32i1( [[TMP10]], [[TMP10]]) ; 
PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP20]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] @@ -234,8 +234,6 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -249,6 +247,8 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext nneg [[TMP11]] to ; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP12]] ; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP13]], i32 1, [[TMP10]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP15]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -297,8 +297,6 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -314,6 +312,8 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP11]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP12]], i32 1, [[TMP13]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() +; 
PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
@@ -398,8 +398,6 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV3]], i64 0
; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = shl i32 [[TMP15]], 4
; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
; SCALAR_TAIL_FOLDING: vector.body:
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -414,6 +412,8 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP12]] to <vscale x 16 x i64>
; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP11]])
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = shl i32 [[TMP15]], 4
; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP16]]
; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -470,8 +470,6 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV3]], i64 0
; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = shl i32 [[TMP16]], 4
; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
; PREDICATED_TAIL_FOLDING: vector.body:
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -489,6 +487,8 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 2, i64 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP15]])
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = shl i32 [[TMP16]], 4
; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP17]]
; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]])
; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[DOTSPLAT]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
index a885b013bb5e3..6a562e8b92bb7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll
@@ -15,21 +15,21 @@ define void @inv_store_i16(ptr noalias %dst, ptr noalias readonly %src, i64 %N)
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[SRC:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP8]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
-; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP10]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP11]]
-; CHECK-NEXT: store i16 [[TMP12]], ptr [[DST:%.*]], align 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[SRC:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP6]], align 2
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = sub i32 [[TMP8]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_LOAD]], i32 [[TMP9]]
+; CHECK-NEXT: store i16 [[TMP10]], ptr [[DST:%.*]], align 2
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
@@ -45,7 +45,7 @@ define void @inv_store_i16(ptr noalias %dst, ptr noalias readonly %src, i64 %N)
; CHECK-NEXT: store i16 [[LD]], ptr [[DST]], align 2
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_INC24]], label [[FOR_BODY14]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_INC24]], label [[FOR_BODY14]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.inc24:
; CHECK-NEXT: ret void
;
@@ -80,18 +80,18 @@ define void @cond_inv_store_i32(ptr noalias %dst, ptr noalias readonly %src, i64
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[DST:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP9]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP7]])
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
index d12a1036ed536..b378603bbc52f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
@@ -23,8 +23,6 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) {
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START_1:%.*]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[N_VEC]], 8
; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, ptr [[START_2]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 4
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START_1]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
@@ -62,6 +60,8 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) {
; CHECK-NEXT: [[TMP31:%.*]] = mul i64 [[TMP30]], 2
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i64 [[TMP31]]
; CHECK-NEXT: store <vscale x 2 x i64> zeroinitializer, ptr [[TMP32]], align 8
+; CHECK-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP34:%.*]] = mul i64 [[TMP33]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP34]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP13]]
; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
index 602ccb678c968..997aca89b825d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
@@ -4,18 +4,14 @@ target triple = "aarch64-unknown-linux-gnu"
define void @trip7_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapture noundef readonly %src) #0 {
; CHECK-LABEL: @trip7_i64(
-; CHECK: = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: = mul i64
-; CHECK: = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: = mul i64
-; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 2
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ]
; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
; CHECK: {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0(ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
; CHECK: call void @llvm.masked.store.nxv2i64.p0(<vscale x 2 x i64> {{%.*}}, ptr {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 7)
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NOT:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
index 006171f3b0682..79bda0a4a5d80 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
@@ -33,34 +33,34 @@ define void @multiple_exits_unique_exit_block(ptr %A, ptr %B, i32 %N) #0 {
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[N_MOD_VF]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 8
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 0
-; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 1
-; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP13]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP18]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP21]], align 4
-; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP23]]
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP24]], align 4
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP13]]
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP18]]
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0
-; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP27]], align 4
-; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP29:%.*]] = mul i64
[[TMP28]], 4 -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]] -; CHECK-NEXT: store [[WIDE_LOAD3]], ptr [[TMP30]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP19]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP21]] +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP22]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0 +; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP25]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP27]] +; CHECK-NEXT: store [[WIDE_LOAD3]], ptr [[TMP28]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP30]] ; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -131,34 +131,34 @@ define i32 @multiple_exits_multiple_exit_blocks(ptr %A, ptr %B, i32 %N) #0 { ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 8 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 4 -; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 0 -; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 1 -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[INDEX]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP13]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP21]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP23]] -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP24]], align 4 -; 
CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP13]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP18]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i32 0 -; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP27]], align 4 -; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 4 -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]] -; CHECK-NEXT: store [[WIDE_LOAD3]], ptr [[TMP30]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP12]] +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP19]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP21]] +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP22]], align 4 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0 +; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP25]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 4 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP27]] +; CHECK-NEXT: store [[WIDE_LOAD3]], ptr [[TMP28]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP30]] ; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll index 1d5c60bd7374e..166d771482441 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll @@ -47,8 +47,6 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], [[TMP16]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP49]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -92,6 +90,8 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr ; CHECK-NEXT: 
[[TMP47:%.*]] = mul i64 [[TMP46]], 2 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i64, ptr [[TMP39]], i64 [[TMP47]] ; CHECK-NEXT: store [[TMP36]], ptr [[TMP48]], align 8 +; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP49]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP50]] ; CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll index c7e2f63a1c79c..2ca5fbebed877 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -59,8 +59,6 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -69,6 +67,8 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll index 920046a32caca..275a26bae1f2e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll @@ -17,23 +17,23 @@ define void @trip1025_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapt ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[SRC:%.*]], i64 
[[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP10:%.*]] = shl nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = add nsw [[WIDE_MASKED_LOAD1]], [[TMP10]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP13]], ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[SRC:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = shl nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DST:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP11:%.*]] = add nsw [[WIDE_MASKED_LOAD1]], [[TMP8]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP11]], ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; CHECK-NEXT: [[TMP14:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[TMP14]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll index 4b4bc682de166..c0e736399f293 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll @@ -16,8 +16,6 @@ define void @cannot_overflow_i32_induction_var(ptr noalias %dst, ptr readonly %s ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -27,6 +25,8 @@ define void @cannot_overflow_i32_induction_var(ptr noalias %dst, ptr readonly %s ; CHECK-NEXT: [[TMP1:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 42, i64 0), poison, 
zeroinitializer) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP1]], ptr [[TMP2]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) ; CHECK-NEXT: [[TMP5:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 @@ -80,8 +80,6 @@ define void @can_overflow_i64_induction_var(ptr noalias %dst, ptr readonly %src, ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP1]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -91,6 +89,8 @@ define void @can_overflow_i64_induction_var(ptr noalias %dst, ptr readonly %src, ; CHECK-NEXT: [[TMP4:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 42, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP4]], ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP2]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll index 2d8b8d86e237d..7e0f18b9efae1 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll @@ -26,8 +26,6 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -39,6 +37,8 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = add [[VEC_PHI]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: [[TMP14]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call 
@llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -83,8 +83,6 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) -; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 ; CHECK-IN-LOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-IN-LOOP: vector.body: ; CHECK-IN-LOOP-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -97,6 +95,8 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-IN-LOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) ; CHECK-IN-LOOP-NEXT: [[TMP15]] = add i32 [[TMP14]], [[VEC_PHI]] +; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 ; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]] ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -158,8 +158,6 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -171,6 +169,8 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP13]]) +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -214,8 +214,6 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) -; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = call 
i64 @llvm.vscale.i64() -; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-IN-LOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-IN-LOOP: vector.body: ; CHECK-IN-LOOP-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -227,6 +225,8 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-IN-LOOP-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-IN-LOOP-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[TMP13]]) +; CHECK-IN-LOOP-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IN-LOOP-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -287,8 +287,6 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) -; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -308,6 +306,8 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-NEXT: [[TMP19:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], zeroinitializer ; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP17]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP20]] = select [[ACTIVE_LANE_MASK]], [[PREDPHI]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP22]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP23:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -359,8 +359,6 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-IN-LOOP-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] ; CHECK-IN-LOOP-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) -; CHECK-IN-LOOP-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-IN-LOOP-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 ; CHECK-IN-LOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-IN-LOOP: vector.body: ; CHECK-IN-LOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -378,6 +376,8 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 { ; CHECK-IN-LOOP-NEXT: [[TMP17:%.*]] = select [[TMP15]], [[WIDE_MASKED_LOAD1]], zeroinitializer ; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = call i32 
@llvm.vector.reduce.xor.nxv4i32( [[TMP17]]) ; CHECK-IN-LOOP-NEXT: [[TMP19]] = xor i32 [[TMP18]], [[VEC_PHI]] +; CHECK-IN-LOOP-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-IN-LOOP-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 ; CHECK-IN-LOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]] ; CHECK-IN-LOOP-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-IN-LOOP-NEXT: [[TMP22:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll index 8f2e20757f26e..caeb37bce2f18 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -53,8 +53,6 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VECTOR_BODY]] ] @@ -96,6 +94,8 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 12 ; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK9]]) +; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 16 ; CHECK-NEXT: [[INDEX_NEXT10]] = add i64 [[INDEX6]], [[TMP62]] ; CHECK-NEXT: [[TMP63:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP64:%.*]] = mul i64 [[TMP63]], 4 @@ -195,8 +195,6 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP83:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP84:%.*]] = mul i64 [[TMP83]], 16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY]] ] @@ -264,6 +262,8 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[TMP81:%.*]] = mul i64 [[TMP80]], 12 ; CHECK-NEXT: [[TMP82:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP81]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP82]], i32 4, [[TMP72]]) +; CHECK-NEXT: [[TMP83:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP84:%.*]] = mul i64 [[TMP83]], 16 ; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX6]], [[TMP84]] ; CHECK-NEXT: [[TMP85:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP86:%.*]] = mul i64 [[TMP85]], 
4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index 9e88d65c65cfd..292bdd5c8be85 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -26,8 +26,6 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -36,6 +34,8 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -151,8 +151,6 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -164,6 +162,8 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_LOAD]], ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -237,8 +237,6 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[TMP18:%.*]] = mul i64 4, [[TMP17]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP18]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = call i64 
@llvm.vscale.i64() -; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -248,6 +246,8 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[DST:%.*]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP20]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP22]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP12]]) ; CHECK-NEXT: [[TMP23:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -309,8 +309,6 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -323,6 +321,8 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST:%.*]], [[WIDE_MASKED_LOAD]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_GATHER]], [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -388,8 +388,6 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) # ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -401,6 +399,8 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) # ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], 
ptr [[TMP13]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -465,8 +465,6 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[SRC:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -485,6 +483,8 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK-NEXT: [[TMP18:%.*]] = or [[TMP15]], [[TMP16]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[PREDPHI]], ptr [[TMP19]], i32 4, [[TMP18]]) +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP21]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP22:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -565,8 +565,6 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[DST:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -576,6 +574,8 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n) ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -635,8 +635,6 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) 
#0 { ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] @@ -650,6 +648,8 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP15:%.*]] = fdiv [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP15]], ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 ; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP17]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP18:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -715,8 +715,6 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) -; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ] @@ -731,6 +729,8 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD2]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP16:%.*]] = udiv [[WIDE_MASKED_LOAD]], [[TMP15]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP16]], ptr [[TMP14]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4 ; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP18]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]]) ; CHECK-NEXT: [[TMP19:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) @@ -788,8 +788,6 @@ define void @simple_memset_trip1024(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] @@ -797,6 +795,8 @@ define void @simple_memset_trip1024(i32 %val, ptr %ptr, i64 %n) #0 { ; 
CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 ; CHECK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll index 2cd52bfa87b99..b36a10b46734c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll @@ -23,8 +23,6 @@ define void @vector_reverse_f64(i64 %N, ptr noalias %a, ptr noalias %b) #0{ ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 4 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP31:%.*]] = shl i64 [[TMP30]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -58,6 +56,8 @@ define void @vector_reverse_f64(i64 %N, ptr noalias %a, ptr noalias %b) #0{ ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i64 [[TMP26]] ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, ptr [[TMP28]], i64 [[TMP27]] ; CHECK-NEXT: store [[TMP18]], ptr [[TMP29]], align 8 +; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP31:%.*]] = shl i64 [[TMP30]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP31]] ; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -125,8 +125,6 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 4 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP34:%.*]] = shl i64 [[TMP33]], 4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -160,6 +158,8 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[TMP22]], i64 [[TMP29]] ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i64, ptr [[TMP31]], i64 [[TMP30]] ; CHECK-NEXT: store [[TMP21]], ptr [[TMP32]], align 8 +; CHECK-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP34:%.*]] = shl i64 [[TMP33]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP34]] ; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll index e12e82b5a7f5b..31cf2c4e2db4c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll
@@ -8,8 +8,6 @@ define void @test_big_little_params(ptr readonly %a, ptr readonly %b, ptr noalia
; CHECK-SAME: (ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1025)
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -21,6 +19,8 @@ define void @test_big_little_params(ptr readonly %a, ptr readonly %b, ptr noalia
; CHECK-NEXT: [[TMP2:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]], [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]]
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP2]], ptr [[TMP3]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 1025)
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
@@ -53,8 +53,6 @@ define void @test_little_big_params(ptr readonly %a, ptr readonly %b, ptr noalia
; CHECK-SAME: (ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025)
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -66,6 +64,8 @@ define void @test_little_big_params(ptr readonly %a, ptr readonly %b, ptr noalia
; CHECK-NEXT: [[TMP2:%.*]] = call @bar_vector( [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]], [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]]
; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP2]], ptr [[TMP3]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025)
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
index 1bffc0d37e59f..62c778aca51d0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -49,8 +49,6 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START_1:%.*]], i64 [[TMP4]]
; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[START_2:%.*]], i64 [[N_VEC]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64
[[TMP20]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] @@ -77,6 +75,8 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP18]], align 1 ; CHECK-NEXT: [[TMP19:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: store [[TMP19]], ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -142,8 +142,6 @@ define void @pointer_induction(ptr noalias %start, i64 %N) { ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[N_VEC]] -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] @@ -164,6 +162,8 @@ define void @pointer_induction(ptr noalias %start, i64 %N) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 1 ; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: store [[TMP15]], ptr [[TMP14]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX2]], [[TMP17]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll index c4386c3ce5d11..9cd2781ae2359 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -27,8 +27,6 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]] ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[N_VEC]], 3 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP26:%.*]] = shl nuw nsw i64 [[TMP25]], 3 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -63,6 +61,8 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt ; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i64 [[TMP22]], 2 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP23]] ; CHECK-NEXT: store [[TMP20]], ptr [[TMP24]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP26:%.*]] = shl nuw nsw i64 [[TMP25]], 3 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]] ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP27]], label 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index c4386c3ce5d11..9cd2781ae2359 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -27,8 +27,6 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt
 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]]
 ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[N_VEC]], 3
 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP26:%.*]] = shl nuw nsw i64 [[TMP25]], 3
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -63,6 +61,8 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt
 ; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i64 [[TMP22]], 2
 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP23]]
 ; CHECK-NEXT: store [[TMP20]], ptr [[TMP24]], align 4
+; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP26:%.*]] = shl nuw nsw i64 [[TMP25]], 3
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]]
 ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -143,8 +143,6 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no
 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N_VEC]], 2
 ; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 3
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -164,6 +162,8 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no
 ; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2
 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[NEXT_GEP5]], i64 [[TMP13]]
 ; CHECK-NEXT: store [[TMP11]], ptr [[TMP14]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 3
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -234,8 +234,6 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N_VEC]], 3
 ; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 1
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
@@ -252,6 +250,8 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 8
 ; CHECK-NEXT: [[TMP11]] = add [[WIDE_LOAD]], [[VEC_PHI]]
 ; CHECK-NEXT: store [[TMP8]], ptr [[NEXT_GEP]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP6]]
 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -310,8 +310,6 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(ptr %
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 1
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[PTR:%.*]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
@@ -324,6 +322,8 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(ptr %
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne [[TMP3]], zeroinitializer
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement [[TMP3]], i64 0
 ; CHECK-NEXT: call void @llvm.masked.store.nxv2i16.p0( zeroinitializer, ptr [[TMP5]], i32 2, [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP1]]
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
index 5264621896cb9..814d3ba53050e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
@@ -24,8 +24,6 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; NONE-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
 ; NONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0
 ; NONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; NONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; NONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
 ; NONE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; NONE: vector.body:
 ; NONE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
@@ -33,6 +31,8 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; NONE-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]]
 ; NONE-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0
 ; NONE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 4
+; NONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; NONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
 ; NONE-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP8]]
 ; NONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
 ; NONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -71,8 +71,6 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; DATA-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0
 ; DATA-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; DATA-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; DATA-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; DATA-NEXT: br label [[VECTOR_BODY:%.*]]
 ; DATA: vector.body:
 ; DATA-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
@@ -81,6 +79,8 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
 ; DATA-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
 ; DATA-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]])
+; DATA-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; DATA-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; DATA-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
 ; DATA-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
 ; DATA-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -121,8 +121,6 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
 ; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0
 ; DATA_NO_LANEMASK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector [[BROADCAST_SPLATINSERT4]], poison, zeroinitializer
-; DATA_NO_LANEMASK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_NO_LANEMASK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
 ; DATA_NO_LANEMASK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; DATA_NO_LANEMASK: vector.body:
 ; DATA_NO_LANEMASK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VECTOR_BODY]] ]
@@ -136,6 +134,8 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_NO_LANEMASK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
 ; DATA_NO_LANEMASK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
 ; DATA_NO_LANEMASK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT5]], ptr [[TMP14]], i32 4, [[TMP12]])
+; DATA_NO_LANEMASK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; DATA_NO_LANEMASK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
 ; DATA_NO_LANEMASK-NEXT: [[INDEX_NEXT6]] = add i64 [[INDEX1]], [[TMP16]]
 ; DATA_NO_LANEMASK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
 ; DATA_NO_LANEMASK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -174,8 +174,6 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_AND_CONTROL-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; DATA_AND_CONTROL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0
 ; DATA_AND_CONTROL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; DATA_AND_CONTROL-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_AND_CONTROL-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; DATA_AND_CONTROL-NEXT: br label [[VECTOR_BODY:%.*]]
 ; DATA_AND_CONTROL: vector.body:
 ; DATA_AND_CONTROL-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
@@ -184,6 +182,8 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_AND_CONTROL-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP9]]
 ; DATA_AND_CONTROL-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
 ; DATA_AND_CONTROL-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]])
+; DATA_AND_CONTROL-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; DATA_AND_CONTROL-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; DATA_AND_CONTROL-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
 ; DATA_AND_CONTROL-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT2]], i64 [[UMAX]])
 ; DATA_AND_CONTROL-NEXT: [[TMP14:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
@@ -225,8 +225,6 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; DATA_AND_CONTROL_NO_RT_CHECK: vector.body:
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
@@ -235,6 +233,8 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]]
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]])
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
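All of the tail-folded prefixes above (DATA, DATA_AND_CONTROL, DATA_AND_CONTROL_NO_RT_CHECK) hinge on @llvm.get.active.lane.mask feeding a masked store, which is what lets the scalar epilogue be dropped. A self-contained sketch of one such masked step, with hypothetical names:

declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, i32 immarg, <vscale x 4 x i1>)

define void @masked_step(ptr %p, <vscale x 4 x i32> %v, i64 %index, i64 %limit) {
  ; lane i is enabled while %index + i < %limit, so the tail needs no scalar loop
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index, i64 %limit)
  %gep = getelementptr i32, ptr %p, i64 %index
  call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %v, ptr %gep, i32 4, <vscale x 4 x i1> %mask)
  ret void
}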
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
index dec3c286345ad..f49b3714062d7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll
@@ -32,18 +32,18 @@ define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocaptur
 ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1
-; CHECK-NEXT: [[TMP10:%.*]] = zext [[WIDE_LOAD]] to
-; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to )
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]]
-; CHECK-NEXT: store [[TMP11]], ptr [[TMP12]], align 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = zext [[WIDE_LOAD]] to
+; CHECK-NEXT: [[TMP9:%.*]] = add [[TMP8]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to )
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]]
+; CHECK-NEXT: store [[TMP9]], ptr [[TMP10]], align 2
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
@@ -115,18 +115,18 @@ define void @sext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocaptur
 ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1
-; CHECK-NEXT: [[TMP10:%.*]] = sext [[WIDE_LOAD]] to
-; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to )
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]]
-; CHECK-NEXT: store [[TMP11]], ptr [[TMP12]], align 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = sext [[WIDE_LOAD]] to
+; CHECK-NEXT: [[TMP9:%.*]] = add [[TMP8]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to )
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]]
+; CHECK-NEXT: store [[TMP9]], ptr [[TMP10]], align 2
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
index 19b007164eb34..51465dbf13e2f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll
@@ -13,8 +13,6 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6
 ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP1]])
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -24,6 +22,8 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6
 ; CHECK-NEXT: [[TMP4:%.*]] = call @foo_uniform( [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
 ; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP4]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP2]])
 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0
@@ -57,8 +57,6 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3
 ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP1]])
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -68,6 +66,8 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3
 ; CHECK-NEXT: [[TMP4:%.*]] = call @bar_uniform( [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]]
 ; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP4]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP2]])
 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll
index 876d58131bd7a..cd133371f66ce 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(foo|bar|baz|quux)" --version 2
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call" --version 2
 ; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=NEON
 ; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=SVE_OR_NEON
 ; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -S -prefer-predicate-over-epilogue=predicate-dont-vectorize | FileCheck %s --check-prefixes=SVE_TF
@@ -10,18 +10,15 @@ target triple = "aarch64-unknown-linux-gnu"
 define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) {
 ; NEON-LABEL: define void @test_linear8
 ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP2:%.*]])
 ; NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR0:[0-9]+]]
 ;
 ; SVE_OR_NEON-LABEL: define void @test_linear8
 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; SVE_OR_NEON: [[TMP13:%.*]] = call @vec_foo_linear8_nomask_sve(ptr [[TMP12:%.*]])
-; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2:[0-9]+]]
+; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]]
 ;
 ; SVE_TF-LABEL: define void @test_linear8
 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
-; SVE_TF: [[TMP19:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP18:%.*]], [[ACTIVE_LANE_MASK:%.*]])
-; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]]
+; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]]
 ;
 entry:
   br label %for.body
@@ -43,17 +40,15 @@ for.cond.cleanup:
 define void @test_vector_linear4(ptr noalias %a, ptr readnone %b, ptr readonly %c, i64 %n) {
 ; NEON-LABEL: define void @test_vector_linear4
 ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) {
-; NEON: [[TMP5:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP4:%.*]])
 ; NEON: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]]
 ;
 ; SVE_OR_NEON-LABEL: define void @test_vector_linear4
 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_baz_vector_linear4_nomask_sve( [[WIDE_LOAD:%.*]], ptr [[TMP14:%.*]])
-; SVE_OR_NEON: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]]
+; SVE_OR_NEON: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR2:[0-9]+]]
 ;
 ; SVE_TF-LABEL: define void @test_vector_linear4
 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_TF: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]]
+; SVE_TF: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR2:[0-9]+]]
 ;
 entry:
   br label %for.body
@@ -81,11 +76,11 @@ define void @test_linear8_bad_stride(ptr noalias %a, ptr readnone %b, i64 %n) {
 ;
 ; SVE_OR_NEON-LABEL: define void @test_linear8_bad_stride
 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]]
+; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]]
 ;
 ; SVE_TF-LABEL: define void @test_linear8_bad_stride
 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]]
+; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]]
 ;
 entry:
   br label %for.body
@@ -107,17 +102,15 @@ for.cond.cleanup:
 define void @test_linear16_wide_stride(ptr noalias %a, ptr readnone %b, i64 %n) {
 ; NEON-LABEL: define void @test_linear16_wide_stride
 ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) {
-; NEON: [[TMP4:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP3:%.*]])
 ; NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2]]
 ;
 ; SVE_OR_NEON-LABEL: define void @test_linear16_wide_stride
 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[TMP14:%.*]] = call @vec_foo_linear16_nomask_sve(ptr [[TMP13:%.*]])
-; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4]]
+; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR3]]
 ;
 ; SVE_TF-LABEL: define void @test_linear16_wide_stride
 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR5]]
+; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR3]]
 ;
 entry:
   br label %for.body
@@ -140,18 +133,15 @@ for.cond.cleanup:
 define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly %c, i64 %n) {
 ; NEON-LABEL: define void @test_linear4_linear8
 ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) {
-; NEON: [[TMP5:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP3:%.*]], ptr [[TMP4:%.*]])
 ; NEON: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]]
 ;
 ; SVE_OR_NEON-LABEL: define void @test_linear4_linear8
 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP13:%.*]], ptr [[TMP14:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer))
-; SVE_OR_NEON: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]]
+; SVE_OR_NEON: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]]
 ;
 ; SVE_TF-LABEL: define void @test_linear4_linear8
 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_TF: [[TMP21:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP19:%.*]], ptr [[TMP20:%.*]], [[ACTIVE_LANE_MASK:%.*]])
-; SVE_TF: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR6:[0-9]+]]
+; SVE_TF: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]]
 ;
 entry:
   br label %for.body
@@ -174,17 +164,15 @@ for.cond.cleanup:
 define void @test_linear3_non_ptr(ptr noalias %a, i64 %n) {
 ; NEON-LABEL: define void @test_linear3_non_ptr
 ; NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP2:%.*]])
 ; NEON: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR4:[0-9]+]]
 ;
 ; SVE_OR_NEON-LABEL: define void @test_linear3_non_ptr
 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[TMP13:%.*]] = call @vec_bar_linear3_nomask_sve(i32 [[TMP12:%.*]])
-; SVE_OR_NEON: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR6:[0-9]+]]
+; SVE_OR_NEON: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR5:[0-9]+]]
 ;
 ; SVE_TF-LABEL: define void @test_linear3_non_ptr
 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR7:[0-9]+]]
+; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR5:[0-9]+]]
 ;
 entry:
   br label %for.body
@@ -207,17 +195,15 @@ for.cond.cleanup:
 define void @test_linearn5_non_ptr_neg_stride(ptr noalias %a, i64 %n) {
 ; NEON-LABEL: define void @test_linearn5_non_ptr_neg_stride
 ; NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) {
-; NEON: [[TMP3:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP2:%.*]])
 ; NEON: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR5:[0-9]+]]
 ;
 ; SVE_OR_NEON-LABEL: define void @test_linearn5_non_ptr_neg_stride
 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: [[TMP13:%.*]] = call @vec_bar_linearn5_nomask_sve(i32 [[TMP12:%.*]])
-; SVE_OR_NEON: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR7:[0-9]+]]
+; SVE_OR_NEON: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR6:[0-9]+]]
 ;
 ; SVE_TF-LABEL: define void @test_linearn5_non_ptr_neg_stride
 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR8:[0-9]+]]
+; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR6:[0-9]+]]
 ;
 entry:
   br label %for.body
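The vec_foo_*/vec_bar_* calls that disappear from these checks are vector variants advertised to the vectorizer through the "vector-function-abi-variant" call-site attribute. As a hedged reminder of how such a mapping is declared, with illustrative names and a 4-lane unmasked AdvSIMD mangling rather than this test's actual declarations:

declare i32 @bar(i32)
declare <4 x i32> @vec_bar(<4 x i32>)

define i32 @scalar_call_with_hint(i32 %x) {
  ; the mangling reads: _ZGV, AdvSIMD ('n'), unmasked ('N'), VF=4, one vector arg ('v')
  %r = call i32 @bar(i32 %x) #0
  ret i32 %r
}

attributes #0 = { "vector-function-abi-variant"="_ZGVnN4v_bar(vec_bar)" }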
b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
index 4a2f9d07ed91c..7d095206c6062 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
@@ -16,8 +16,6 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
 ; WIDE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; WIDE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; WIDE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
-; WIDE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; WIDE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; WIDE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; WIDE: vector.body:
 ; WIDE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -27,6 +25,8 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
 ; WIDE-NEXT: [[TMP6:%.*]] = call @foo_vector( [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer))
 ; WIDE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
 ; WIDE-NEXT: store [[TMP6]], ptr [[TMP7]], align 4
+; WIDE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; WIDE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
 ; WIDE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; WIDE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll b/llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll
index 4966ddd299492..4c7a7d31c5e72 100644
--- a/llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/Hexagon/maximum-vf-crash.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=hexagon -hexagon-autohvx -passes=loop-vectorize -S < %s 2>&1 | FileCheck %s
+; RUN: opt -march=hexagon -hexagon-autohvx -passes=loop-vectorize -S < %s 2>&1 | FileCheck %s
 
 ; Check that we don't crash.
 
@@ -6,6 +6,7 @@
 ; CHECK: vector.body
 
 target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
 
 ; Function Attrs: optsize
 define i32 @f() #0 {
diff --git a/llvm/test/Transforms/LoopVectorize/Hexagon/minimum-vf.ll b/llvm/test/Transforms/LoopVectorize/Hexagon/minimum-vf.ll
index 1ac556a2fda39..4ff7ad9ab4390 100644
--- a/llvm/test/Transforms/LoopVectorize/Hexagon/minimum-vf.ll
+++ b/llvm/test/Transforms/LoopVectorize/Hexagon/minimum-vf.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=hexagon -passes=loop-vectorize -hexagon-autohvx -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -march=hexagon -passes=loop-vectorize -hexagon-autohvx -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ; Check that TTI::getMinimumVF works. The calculated MaxVF was based on the
@@ -6,6 +6,7 @@
 ; CHECK: LV: Overriding calculated MaxVF({{[0-9]+}}) with target's minimum: 64
 
 target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
 
 %s.0 = type { ptr, i32, i32, i32, i32 }
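Both Hexagon tests trade -mtriple on the RUN line for -march plus an explicit target triple inside the module, so the triple now travels with the IR itself. A minimal test skeleton in the same style; the loop body and check below are illustrative, not taken from either test:

; RUN: opt -march=hexagon -hexagon-autohvx -passes=loop-vectorize -S < %s | FileCheck %s
; CHECK: define void @f

target triple = "hexagon"

define void @f(ptr %p) {
entry:
  br label %loop
loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %gep = getelementptr i8, ptr %p, i32 %i
  store i8 0, ptr %gep
  %i.next = add nuw nsw i32 %i, 1
  %cmp = icmp ult i32 %i.next, 1024
  br i1 %cmp, label %loop, label %exit
exit:
  ret void
}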
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
index 2242ab56f9539..4b93ea30cf252 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/defaults.ll
@@ -24,18 +24,18 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v) {
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store [[TMP9]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
@@ -85,22 +85,22 @@ define i64 @vector_add_reduce(ptr noalias nocapture %a) {
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9]] = add [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7]] = add [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP9]])
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP7]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
index a0a6d44f146a2..a6ab8d1a1f35d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -21,18 +21,18 @@ define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = udiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store [[TMP9]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = udiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
@@ -125,18 +125,18 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = sdiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store [[TMP9]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = sdiv [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
@@ -229,18 +229,18 @@ define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = urem [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store [[TMP9]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = urem [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
@@ -333,18 +333,18 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = srem [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: store [[TMP9]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = srem [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
@@ -437,22 +437,22 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = select [[TMP9]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP11:%.*]] = udiv [[WIDE_LOAD]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[TMP11]], [[WIDE_LOAD]]
-; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP9:%.*]] = udiv [[WIDE_LOAD]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_LOAD]]
+; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
@@ -571,22 +571,22 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = select [[TMP9]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP11:%.*]] = sdiv [[WIDE_LOAD]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[TMP11]], [[WIDE_LOAD]]
-; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP9:%.*]] = sdiv [[WIDE_LOAD]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_LOAD]]
+; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
@@ -703,21 +703,21 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP10:%.*]] = udiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP11:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[TMP10]], [[WIDE_LOAD]]
-; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP8:%.*]] = udiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP9:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP8]], [[WIDE_LOAD]]
+; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
@@ -830,21 +830,21 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP10:%.*]] = sdiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP11:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[TMP10]], [[WIDE_LOAD]]
-; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP8]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 42, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP8:%.*]] = sdiv [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 27, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP9:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP8]], [[WIDE_LOAD]]
+; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: middle.block:
@@ -957,22 +957,22 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 1
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 -128, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP10:%.*]] = select [[TMP9]], shufflevector ( insertelement ( poison, i8 -1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP11:%.*]] = sdiv [[WIDE_LOAD]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = xor [[TMP9]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[TMP11]], [[WIDE_LOAD]]
-; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP8]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 -128, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], shufflevector ( insertelement ( poison, i8 -1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP9:%.*]] = sdiv [[WIDE_LOAD]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_LOAD]]
+; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 16
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: middle.block:
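The predicated_udiv/predicated_sdiv checks above all encode the safe-divisor idiom: lanes whose predicate is false get a dummy divisor of 1 so the division cannot trap, and a final select discards their results. A fixed-width sketch of the same idiom; the function here is hypothetical, and the tests use scalable types instead:

define <4 x i64> @safe_udiv_sketch(<4 x i64> %x, <4 x i64> %d) {
  ; mask of lanes with a usable (non-zero) divisor
  %ok = icmp ne <4 x i64> %d, zeroinitializer
  ; substitute 1 where the divisor is zero so udiv cannot trap
  %d.safe = select <4 x i1> %ok, <4 x i64> %d, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
  %q = udiv <4 x i64> %x, %d.safe
  ; keep the original lane value where the division was masked off
  %r = select <4 x i1> %ok, <4 x i64> %q, <4 x i64> %x
  ret <4 x i64> %r
}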
[[SCALAR_PH]] ; OUTLOOP: scalar.ph: @@ -75,20 +75,20 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; INLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 8 ; INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]] ; INLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] -; INLOOP-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() -; INLOOP-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 8 ; INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; INLOOP: vector.body: ; INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; INLOOP-NEXT: [[TMP6:%.*]] = add i32 [[INDEX]], 0 -; INLOOP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP6]] -; INLOOP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i32 0 -; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 2 -; INLOOP-NEXT: [[TMP9:%.*]] = sext [[WIDE_LOAD]] to -; INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP9]]) -; INLOOP-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]] -; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] +; INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; INLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 +; INLOOP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]] +; INLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 +; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 2 +; INLOOP-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD]] to +; INLOOP-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP7]]) +; INLOOP-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]] +; INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vscale.i32() +; INLOOP-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 8 +; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP11]] ; INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; INLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; INLOOP: middle.block: @@ -96,7 +96,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; INLOOP: scalar.ph: ; INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; INLOOP-NEXT: br label [[FOR_BODY:%.*]] ; INLOOP: for.body: ; INLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -109,7 +109,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) { ; INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] ; INLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; INLOOP: for.cond.cleanup.loopexit: -; INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; INLOOP-NEXT: br label [[FOR_COND_CLEANUP]] ; INLOOP: for.cond.cleanup: ; INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ] diff 
--git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index b6138d3d92610..29ef7364b8212 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -257,8 +257,6 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -278,6 +276,8 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP17]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) ; CHECK-NEXT: [[TMP18:%.*]] = add [[WIDE_MASKED_GATHER2]], shufflevector ( insertelement ( poison, i64 3, i64 0), poison, zeroinitializer) ; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP18]], [[TMP17]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -362,8 +362,6 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -408,6 +406,8 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP32]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) ; CHECK-NEXT: [[TMP33:%.*]] = add [[WIDE_MASKED_GATHER7]], shufflevector ( insertelement ( poison, i64 8, i64 0), poison, zeroinitializer) ; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[TMP33]], [[TMP32]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP35]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll index b0aefae41a1fc..072cef66c00f4 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll @@ -15,7 +15,6 @@ define void @load_store(ptr %p) { ; LMUL1-NEXT: [[TMP1:%.*]] = call i64 
@llvm.vscale.i64() ; LMUL1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] ; LMUL1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; LMUL1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; LMUL1-NEXT: br label [[VECTOR_BODY:%.*]] ; LMUL1: vector.body: ; LMUL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -25,6 +24,7 @@ define void @load_store(ptr %p) { ; LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 ; LMUL1-NEXT: [[TMP5:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) ; LMUL1-NEXT: store [[TMP5]], ptr [[TMP4]], align 8 +; LMUL1-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; LMUL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; LMUL1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; LMUL1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -57,8 +57,6 @@ define void @load_store(ptr %p) { ; LMUL2-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 ; LMUL2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; LMUL2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; LMUL2-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; LMUL2-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; LMUL2-NEXT: br label [[VECTOR_BODY:%.*]] ; LMUL2: vector.body: ; LMUL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -68,6 +66,8 @@ define void @load_store(ptr %p) { ; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; LMUL2-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) ; LMUL2-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; LMUL2-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; LMUL2-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; LMUL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; LMUL2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; LMUL2-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -100,8 +100,6 @@ define void @load_store(ptr %p) { ; LMUL4-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; LMUL4-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; LMUL4-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; LMUL4-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; LMUL4-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; LMUL4-NEXT: br label [[VECTOR_BODY:%.*]] ; LMUL4: vector.body: ; LMUL4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -111,6 +109,8 @@ define void @load_store(ptr %p) { ; LMUL4-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; LMUL4-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) ; LMUL4-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; LMUL4-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; LMUL4-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; LMUL4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; LMUL4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; LMUL4-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -143,8 +143,6 @@ define void @load_store(ptr %p) { ; LMUL8-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 ; LMUL8-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; LMUL8-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; LMUL8-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; LMUL8-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8 ; 
LMUL8-NEXT: br label [[VECTOR_BODY:%.*]] ; LMUL8: vector.body: ; LMUL8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -154,6 +152,8 @@ define void @load_store(ptr %p) { ; LMUL8-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; LMUL8-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) ; LMUL8-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; LMUL8-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; LMUL8-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8 ; LMUL8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; LMUL8-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; LMUL8-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll index 0c5394cb95a61..ace267d72dea0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -16,23 +16,23 @@ define void @trip5_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP7]], i64 5) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP9]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP10:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP10]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP13]], ptr [[TMP12]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP5]], i64 5) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP7]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 +; 
CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP10]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP8]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP11]], ptr [[TMP10]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]] ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll index e7976b1463369..34b06972dab06 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll @@ -20,35 +20,35 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) { ; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 ; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLENUNK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; VLENUNK-NEXT: [[TMP7:%.*]] = add [[TMP6]], zeroinitializer -; VLENUNK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; VLENUNK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] -; VLENUNK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 -; VLENUNK-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]] -; VLENUNK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 +; VLENUNK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; VLENUNK-NEXT: [[TMP5:%.*]] = add [[TMP4]], zeroinitializer +; VLENUNK-NEXT: [[TMP6:%.*]] = mul [[TMP5]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; VLENUNK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] +; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 +; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] +; VLENUNK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 ; VLENUNK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 ; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; VLENUNK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; VLENUNK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; VLENUNK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0 -; VLENUNK-NEXT: [[TMP13:%.*]] = icmp ult [[VEC_IND]], shufflevector ( insertelement ( poison, i64 512, i64 0), poison, zeroinitializer) -; VLENUNK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP12]] -; VLENUNK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP14]], i32 0 -; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP15]], i32 4, 
[[TMP13]], poison) -; VLENUNK-NEXT: [[TMP16:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; VLENUNK-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], zeroinitializer, [[WIDE_MASKED_LOAD]] -; VLENUNK-NEXT: [[TMP17:%.*]] = add [[PREDPHI]], [[BROADCAST_SPLAT]] -; VLENUNK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP12]] -; VLENUNK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0 -; VLENUNK-NEXT: store [[TMP17]], ptr [[TMP19]], align 4 -; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; VLENUNK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0 +; VLENUNK-NEXT: [[TMP11:%.*]] = icmp ult [[VEC_IND]], shufflevector ( insertelement ( poison, i64 512, i64 0), poison, zeroinitializer) +; VLENUNK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]] +; VLENUNK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, [[TMP11]], poison) +; VLENUNK-NEXT: [[TMP14:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; VLENUNK-NEXT: [[PREDPHI:%.*]] = select [[TMP14]], zeroinitializer, [[WIDE_MASKED_LOAD]] +; VLENUNK-NEXT: [[TMP15:%.*]] = add [[PREDPHI]], [[BROADCAST_SPLAT]] +; VLENUNK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP10]] +; VLENUNK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 +; VLENUNK-NEXT: store [[TMP15]], ptr [[TMP17]], align 4 +; VLENUNK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4 +; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] ; VLENUNK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; VLENUNK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLENUNK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll index 358e35bfd57ef..72e3bca79a5a5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -48,8 +48,6 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[TMP10:%.*]] = mul i64 16, [[TMP9]] ; RV32-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 ; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; RV32-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; RV32-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 ; RV32-NEXT: br label [[VECTOR_BODY:%.*]] ; RV32: vector.body: ; RV32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -64,6 +62,8 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[TMP16:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP15]] ; RV32-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] ; RV32-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[TMP16]], [[TMP17]], i32 8, [[TMP12]]), !alias.scope !5, !noalias !7 +; RV32-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; RV32-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 ; RV32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] ; RV32-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], 
[[DOTSPLAT]] ; RV32-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -130,8 +130,6 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64-NEXT: [[TMP10:%.*]] = mul i64 16, [[TMP9]] ; RV64-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 ; RV64-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; RV64-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; RV64-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 ; RV64-NEXT: br label [[VECTOR_BODY:%.*]] ; RV64: vector.body: ; RV64-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -146,6 +144,8 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64-NEXT: [[TMP16:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP15]] ; RV64-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] ; RV64-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( [[TMP16]], [[TMP17]], i32 8, [[TMP12]]), !alias.scope !5, !noalias !7 +; RV64-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; RV64-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2 ; RV64-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] ; RV64-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; RV64-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll index 77b0ae2f846a3..90c64e33bcef6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll @@ -19,20 +19,20 @@ define void @test(ptr %p) { ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 32 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP6]], 200 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0 -; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP11]], align 32 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], 200 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -142,20 +142,20 @@ define void @trivial_due_max_vscale(ptr %p) { ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 32 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP6]], 8192 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0 -; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP11]], align 32 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], 8192 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: @@ -208,20 +208,20 @@ define void @no_high_lmul_or_interleave(ptr %p) { ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 200, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 200, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[TMP7]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 32 -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP6]], 1024 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0 -; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP11]], align 32 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], 1024 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP7]] 
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP9]], align 32 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll index d2af6c6919a56..f7bc4bd35f377 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll @@ -26,8 +26,6 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) { ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 ; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -37,6 +35,8 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) { ; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; VLENUNK-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; VLENUNK-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; VLENUNK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLENUNK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -71,8 +71,6 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) { ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 ; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -82,6 +80,8 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) { ; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; VLEN128-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; VLEN128-NEXT: store [[TMP7]], ptr [[TMP6]], align 8 +; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; VLEN128-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLEN128-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -137,8 +137,6 @@ define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) { ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement 
poison, i32 [[V:%.*]], i64 0 ; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -148,6 +146,8 @@ define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) { ; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 ; VLENUNK-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; VLENUNK-NEXT: store [[TMP7]], ptr [[TMP6]], align 4 +; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; VLENUNK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLENUNK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -182,8 +182,6 @@ define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) { ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0 ; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -193,6 +191,8 @@ define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) { ; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 ; VLEN128-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; VLEN128-NEXT: store [[TMP7]], ptr [[TMP6]], align 4 +; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; VLEN128-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLEN128-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -303,8 +303,6 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 ; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -314,6 +312,8 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; VLENUNK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] ; VLENUNK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[TMP7]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; VLENUNK-NEXT: [[INDEX_NEXT]] 
= add nuw i64 [[INDEX]], [[TMP9]] ; VLENUNK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLENUNK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -348,8 +348,6 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 ; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -359,6 +357,8 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; VLEN128-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] ; VLEN128-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[TMP7]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 ; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; VLEN128-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLEN128-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -409,8 +409,6 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 ; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLENUNK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -422,6 +420,8 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; VLENUNK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] ; VLENUNK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP7]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) ; VLENUNK-NEXT: [[TMP8]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] +; VLENUNK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 ; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] ; VLENUNK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLENUNK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -459,8 +459,6 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 ; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; VLEN128-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -472,6 +470,8 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; VLEN128-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_LOAD]] ; VLEN128-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP7]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) ; VLEN128-NEXT: [[TMP8]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] +; VLEN128-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 ; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] ; VLEN128-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLEN128-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -531,8 +531,6 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) { ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 ; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -540,6 +538,8 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) { ; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] ; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 ; VLENUNK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 ; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; VLENUNK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLENUNK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -572,8 +572,6 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) { ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 ; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; VLEN128-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -581,6 +579,8 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) { ; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] ; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0 ; VLEN128-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; VLEN128-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 ; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; VLEN128-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLEN128-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -629,8 +629,6 @@ define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) { ; 
VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[V:%.*]], i64 0 ; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] ; VLENUNK: vector.body: ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -638,6 +636,8 @@ define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) { ; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] ; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0 ; VLENUNK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; VLENUNK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 ; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; VLENUNK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLENUNK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -670,8 +670,6 @@ define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) { ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[V:%.*]], i64 0 ; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; VLEN128-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; VLEN128-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]] ; VLEN128: vector.body: ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -679,6 +677,8 @@ define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) { ; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] ; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0 ; VLEN128-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 +; VLEN128-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; VLEN128-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 ; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; VLEN128-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; VLEN128-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll index e482e0a7bd84a..0a8c7cfda9efb 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll @@ -21,19 +21,19 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call 
@llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP7]], i64 1025) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP10:%.*]] = add [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP10]], ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1025) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = add [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -87,19 +87,19 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP7]], i64 1025) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1025) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: call void 
@llvm.masked.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT]], [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: @@ -149,26 +149,26 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]] ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP7]], i64 1025) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP11]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] -; CHECK-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1025) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i64.nxv2p0( [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP9]] = add [[VEC_PHI]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP10]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], 
label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
@@ -223,17 +223,17 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP7]], i64 1025)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1025)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
@@ -281,18 +281,18 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP7]], i64 1025)
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1025)
 ; CHECK-NEXT: store i64 [[V]], ptr [[B:%.*]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
@@ -375,19 +375,19 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP7]], i64 1024)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]], poison)
-; CHECK-NEXT: [[TMP10:%.*]] = add [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP10]], ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP5]], i64 1024)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison)
+; CHECK-NEXT: [[TMP8:%.*]] = add [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
index 4d5cd145c2914..c7bb4ca4b6c6c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll
@@ -42,8 +42,6 @@ define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 {
 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y:%.*]], i64 0
 ; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer
-; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -54,6 +52,8 @@ define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 {
 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
 ; SCALABLE-NEXT: [[TMP8:%.*]] = icmp slt [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; SCALABLE-NEXT: [[TMP9]] = select [[TMP8]], [[VEC_PHI]], [[BROADCAST_SPLAT2]]
+; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -117,8 +117,6 @@ define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0
 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y:%.*]], i64 0
 ; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer
-; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -129,6 +127,8 @@ define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0
 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
 ; SCALABLE-NEXT: [[TMP8:%.*]] = fcmp fast olt [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; SCALABLE-NEXT: [[TMP9]] = select [[TMP8]], [[VEC_PHI]], [[BROADCAST_SPLAT2]]
+; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -184,8 +184,6 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
 ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
 ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
-; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -196,6 +194,8 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
 ; SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)
 ; SCALABLE-NEXT: [[TMP9]] = select [[TMP8]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 7, i64 0), poison, zeroinitializer)
+; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -261,8 +261,6 @@ define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64
 ; SCALABLE-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer
 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[B:%.*]], i64 0
 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -273,6 +271,8 @@ define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64
 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
 ; SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer)
 ; SCALABLE-NEXT: [[TMP9]] = select [[TMP8]], [[VEC_PHI]], [[BROADCAST_SPLAT]]
+; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -330,8 +330,6 @@ define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 {
 ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
 ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
-; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -342,6 +340,8 @@ define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 {
 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
 ; SCALABLE-NEXT: [[TMP8:%.*]] = fcmp fast ueq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer)
 ; SCALABLE-NEXT: [[TMP9]] = select [[TMP8]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)
+; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -428,8 +428,6 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
 ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
 ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
-; SCALABLE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -446,6 +444,8 @@ define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1
 ; SCALABLE-NEXT: [[TMP13:%.*]] = select [[TMP12]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), [[VEC_PHI]]
 ; SCALABLE-NEXT: [[TMP14:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
 ; SCALABLE-NEXT: [[PREDPHI]] = select [[TMP8]], [[TMP13]], [[VEC_PHI]]
+; SCALABLE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
 ; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index 331a8d2eb66e2..38ef0537e4318 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -14,8 +14,6 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 4, [[TMP4]]
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -26,6 +24,8 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison)
 ; CHECK-NEXT: [[TMP8:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)
 ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP8]], ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP10]]
 ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
@@ -75,8 +75,6 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 4, [[TMP4]]
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -87,6 +85,8 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison)
 ; CHECK-NEXT: [[TMP8:%.*]] = add nsw [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)
 ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP8]], ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP10]]
 ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index ad3eddec42b1f..65a2cb51b88ff 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -26,8 +26,6 @@ define void @single_constant_stride_int_scaled(ptr %p) {
 ; CHECK-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP10]]
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -37,6 +35,8 @@ define void @single_constant_stride_int_scaled(ptr %p) {
 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison)
 ; CHECK-NEXT: [[TMP14:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)
 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP14]], [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer))
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -99,8 +99,6 @@ define void @single_constant_stride_int_iv(ptr %p) {
 ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 64, [[TMP8]]
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -109,6 +107,8 @@ define void @single_constant_stride_int_iv(ptr %p) {
 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison)
 ; CHECK-NEXT: [[TMP11:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)
 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP11]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer))
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -236,8 +236,6 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
 ; NOSTRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; NOSTRIDED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
 ; NOSTRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; NOSTRIDED: vector.body:
 ; NOSTRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -247,6 +245,8 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 4
 ; NOSTRIDED-NEXT: [[TMP8:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)
 ; NOSTRIDED-NEXT: store [[TMP8]], ptr [[TMP7]], align 4
+; NOSTRIDED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NOSTRIDED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
 ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
 ; NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -320,8 +320,6 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; NOSTRIDED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
-; NOSTRIDED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
 ; NOSTRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; NOSTRIDED: vector.body:
 ; NOSTRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -331,6 +329,8 @@ define void @single_stride_int_iv(ptr %p, i64 %stride) {
 ; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 4
 ; NOSTRIDED-NEXT: [[TMP8:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)
 ; NOSTRIDED-NEXT: store [[TMP8]], ptr [[TMP7]], align 4
+; NOSTRIDED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NOSTRIDED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
 ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
 ; NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -452,8 +452,6 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
 ; NOSTRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP8]]
 ; NOSTRIDED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; NOSTRIDED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
 ; NOSTRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; NOSTRIDED: vector.body:
 ; NOSTRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -465,6 +463,8 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[P2]], i64 [[TMP9]]
 ; NOSTRIDED-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
 ; NOSTRIDED-NEXT: store [[TMP12]], ptr [[TMP14]], align 4
+; NOSTRIDED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; NOSTRIDED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
 ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
 ; NOSTRIDED-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; NOSTRIDED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -529,8 +529,6 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
 ; STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[STRIDE]], i64 0
 ; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; STRIDED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
 ; STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; STRIDED: vector.body:
 ; STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -541,6 +539,8 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT: [[TMP18:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)
 ; STRIDED-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[P2]], [[TMP16]]
 ; STRIDED-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP18]], [[TMP19]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)), !alias.scope !11, !noalias !8
+; STRIDED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; STRIDED-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
 ; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
 ; STRIDED-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
 ; STRIDED-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -601,8 +601,6 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP4]]
 ; NOSTRIDED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; NOSTRIDED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
-; NOSTRIDED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
 ; NOSTRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; NOSTRIDED: vector.body:
 ; NOSTRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -612,6 +610,8 @@ define void @double_stride_int_iv(ptr %p, ptr %p2, i64 %stride) {
 ; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 4
 ; NOSTRIDED-NEXT: [[TMP8:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)
 ; NOSTRIDED-NEXT: store [[TMP8]], ptr [[TMP7]], align 4
+; NOSTRIDED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NOSTRIDED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
 ; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
 ; NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -729,8 +729,6 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP10]]
 ; STRIDED-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], [[STRIDE]]
 ; STRIDED-NEXT: [[IND_END7:%.*]] = getelementptr i8, ptr [[P2]], i64 [[TMP11]]
-; STRIDED-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
-; STRIDED-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 4
 ; STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; STRIDED: vector.body:
 ; STRIDED-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
@@ -763,6 +761,8 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP19]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison), !alias.scope !15
 ; STRIDED-NEXT: [[TMP28:%.*]] = add [[WIDE_MASKED_GATHER]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)
 ; STRIDED-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP28]], [[TMP27]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)), !alias.scope !18, !noalias !15
+; STRIDED-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64()
+; STRIDED-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 4
 ; STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP30]]
 ; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP15]]
 ; STRIDED-NEXT: [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP23]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 88b659be7beb5..89c3219c4b6d0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -21,8 +21,6 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
 ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
-; SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -33,6 +31,8 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
 ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
 ; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
 ; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
+; SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
 ; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -102,8 +102,6 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
 ; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
 ; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
 ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TF-SCALABLE: vector.body:
 ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -115,6 +113,8 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
 ; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
 ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
 ; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]])
+; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
 ; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
 ; TF-SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TF-SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -196,8 +196,6 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
 ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
-; SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -208,6 +206,8 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap
 ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
 ; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
 ; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
+; SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
 ; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -337,8 +337,6 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0
 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; SCALABLE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -351,6 +349,8 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]]
 ; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i32 0
 ; SCALABLE-NEXT: store [[PREDPHI]], ptr [[TMP14]], align 8
+; SCALABLE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
 ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
 ; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -454,8 +454,6 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0
 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; TF-SCALABLE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
 ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TF-SCALABLE: vector.body:
 ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -472,6 +470,8 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-SCALABLE-NEXT: [[TMP17:%.*]] = or [[TMP13]], [[TMP15]]
 ; TF-SCALABLE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0
 ; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP18]], i32 8, [[TMP17]])
+; TF-SCALABLE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
 ; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]]
 ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
 ; TF-SCALABLE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -581,8 +581,6 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
 ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
 ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
-; SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -593,6 +591,8 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
 ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
 ; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
 ; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
+; SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
 ; SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -662,8 +662,6 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
 ; TF-SCALABLE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]]
 ; TF-SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
 ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
 ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TF-SCALABLE: vector.body:
 ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -675,6 +673,8 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
 ; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
 ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
 ; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]])
+; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
 ; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
 ; TF-SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TF-SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -758,8 +758,6 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -768,6 +766,8 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
 ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
 ; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
+; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
@@ -839,8 +839,6 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TF-SCALABLE: vector.body:
 ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -850,6 +848,8 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
 ; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
 ; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]])
+; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
 ; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TF-SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -933,8 +933,6 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; SCALABLE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -954,6 +952,8 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; SCALABLE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP8]]
 ; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0
 ; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP15]], align 8
+; SCALABLE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]]
 ; SCALABLE-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
@@ -1042,8 +1042,6 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer
-; TF-SCALABLE-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
 ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TF-SCALABLE: vector.body:
 ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1054,6 +1052,8 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP11]]
 ; TF-SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
 ; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT2]], ptr [[TMP13]], i32 8, [[ACTIVE_LANE_MASK]])
+; TF-SCALABLE-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
 ; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]]
 ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
 ; TF-SCALABLE-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1175,8 +1175,6 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0
 ; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer
-; SCALABLE-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1187,6 +1185,8 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP10]]
 ; SCALABLE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i32 0
 ; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP13]], align 8
+; SCALABLE-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
 ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
 ; SCALABLE-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1288,8 +1288,6 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[B:%.*]], i64 0
 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer
-; TF-SCALABLE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
 ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TF-SCALABLE: vector.body:
 ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1305,6 +1303,8 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; TF-SCALABLE-NEXT: [[TMP17:%.*]] = or [[TMP13]], [[TMP16]]
 ; TF-SCALABLE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP14]], i32 0
 ; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP18]], i32 8, [[TMP17]])
+; TF-SCALABLE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
 ; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP20]]
 ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
 ; TF-SCALABLE-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -1414,8 +1414,6 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
 ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
 ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
 ; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1424,6 +1422,8 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
 ; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
 ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
 ; SCALABLE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
+; SCALABLE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
@@ -1495,8 +1495,6 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
 ; TF-SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[V:%.*]], i64 0
 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; TF-SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; TF-SCALABLE: vector.body:
 ; TF-SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1506,6 +1504,8 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
 ; TF-SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP5]]
 ; TF-SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
 ; TF-SCALABLE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]])
+; TF-SCALABLE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; TF-SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
 ; TF-SCALABLE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
 ; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TF-SCALABLE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll b/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll
index b42bf38b84f0b..c7d7ac3cb9ede 100644
--- a/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll
@@ -33,8 +33,6 @@ define void @foo() {
 ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]]
 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[OUTER_LOOP_LATCH4:%.*]] ]
@@ -57,6 +55,8 @@ define void @foo() {
 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0( [[VEC_PHI5]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer))
 ; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq [[TMP16]], shufflevector ( insertelement ( poison, i64 1024, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
index d8d0131b80f35..034400ca2b0fb 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll
@@ -19,33 +19,33 @@ define void @add_ind64_unrolled(ptr noalias nocapture %a, ptr noalias nocapture
 ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv2i64()
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 1
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 1
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 8
-; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[TMP10]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 [[TMP11]]
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 8
-; CHECK-NEXT: [[TMP13:%.*]] = add nsw [[WIDE_LOAD]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP14:%.*]] = add nsw [[WIDE_LOAD2]], [[STEP_ADD]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: store [[TMP13]], ptr [[TMP15]], align 8
-; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP17:%.*]] = shl i64 [[TMP16]], 1
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i64 [[TMP17]]
-; CHECK-NEXT: store [[TMP14]], ptr [[TMP18]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i64 [[TMP9]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP10]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = add nsw [[WIDE_LOAD]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[WIDE_LOAD2]], [[STEP_ADD]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = shl i64 [[TMP14]], 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i64 [[TMP15]]
+; CHECK-NEXT: store [[TMP12]], ptr [[TMP16]], align 8
+; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP18:%.*]] = shl i64 [[TMP17]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -101,30 +101,30 @@ define void @add_ind64_unrolled_nxv1i64(ptr noalias nocapture %a, ptr noalias no
 ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv1i64()
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv1i64()
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 1
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 [[TMP9]]
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP10]], align 8
-; CHECK-NEXT: [[TMP11:%.*]] = add nsw [[WIDE_LOAD]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[WIDE_LOAD2]], [[STEP_ADD]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i64 [[TMP7]]
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP8]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = add nsw [[WIDE_LOAD]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[WIDE_LOAD2]], [[STEP_ADD]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: store [[TMP9]], ptr [[TMP11]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i64 [[TMP12]]
+; CHECK-NEXT: store [[TMP10]], ptr [[TMP13]], align 8
 ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i64 [[TMP14]]
-; CHECK-NEXT: store [[TMP12]], ptr [[TMP15]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP15:%.*]] = shl i64 [[TMP14]], 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -189,21 +189,21 @@ define void @add_unique_ind32(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
 ; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[DOTCAST]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i32()
-; CHECK-NEXT: [[TMP7:%.*]] = shl [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 3
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT: [[TMP5:%.*]] = shl [[TMP4]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP7:%.*]] = shl i32 [[TMP6]], 3
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: store [[VEC_IND]], ptr [[TMP10]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: store [[VEC_IND]], ptr [[TMP8]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -267,25 +267,25 @@ define void @add_unique_indf32(ptr noalias nocapture %a, i64 %n) {
 ; CHECK-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float
 ; CHECK-NEXT: [[TMP4:%.*]] = fmul float [[DOTCAST]], 2.000000e+00
 ; CHECK-NEXT: [[IND_END:%.*]] = fadd float [[TMP4]], 0.000000e+00
-; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv4i32()
-; CHECK-NEXT: [[TMP8:%.*]] = uitofp [[TMP7]] to
-; CHECK-NEXT: [[TMP9:%.*]] = fmul [[TMP8]], shufflevector ( insertelement ( poison, float 2.000000e+00, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[INDUCTION:%.*]] = fadd [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP11:%.*]] = shl i32 [[TMP10]], 2
-; CHECK-NEXT: [[TMP12:%.*]] = uitofp i32 [[TMP11]] to float
-; CHECK-NEXT: [[TMP13:%.*]] = fmul float [[TMP12]], 2.000000e+00
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[TMP13]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT: [[TMP6:%.*]] = uitofp [[TMP5]] to
+; CHECK-NEXT: [[TMP7:%.*]] = fmul [[TMP6]], shufflevector ( insertelement ( poison, float 2.000000e+00, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = fadd [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP9:%.*]] = shl i32 [[TMP8]], 2
+; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
+; CHECK-NEXT: [[TMP11:%.*]] = fmul float [[TMP10]], 2.000000e+00
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, float [[TMP11]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: store [[VEC_IND]], ptr [[TMP14]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: store [[VEC_IND]], ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP14:%.*]] = shl i64 [[TMP13]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = fadd [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll b/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
index b14d5a9f4bb57..375d2e0ac8af5 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
@@ -20,18 +20,18 @@ define void @test(ptr %d) {
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 128, [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 128, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4096, ptr [[ARR]])
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer), ptr [[TMP8]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: store shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer), ptr [[TMP6]], align 8
 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4096, ptr [[ARR]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
@@ -92,18 +92,18 @@ define void @testloopvariant(ptr %d) {
 ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 128, [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 128, [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4096, ptr [[ARR]])
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; CHECK-NEXT: store shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer), ptr [[TMP8]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: store shufflevector ( insertelement ( poison, i32 100, i64 0), poison, zeroinitializer), ptr [[TMP6]], align 8
 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4096, ptr [[ARR]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
index 33248e13320bf..6de52c09665c1 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll
@@ -12,8 +12,6 @@
 ; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
 ; CHECKUF1-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]]
 ; CHECKUF1: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
-; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
 ; CHECKUF1: vector.body:
 ; CHECKUF1: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
@@ -22,6 +20,8 @@
 ; CHECKUF1: %[[FADD:.*]] = fadd %wide.load, shufflevector ( insertelement ( poison, double 1.000000e+00, i64 0), poison, zeroinitializer)
 ; CHECKUF1: %[[IDXA:.*]] = getelementptr inbounds double, ptr %a, i64 %index
 ; CHECKUF1: store %[[FADD]], ptr %[[IDXA]], align 8
+; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF1: %[[VSCALEX4:.*]] = shl i64 %[[VSCALE]], 2
 ; CHECKUF1: %index.next = add nuw i64 %index, %[[VSCALEX4]]
 ; CHECKUF1: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
 ; CHECKUF1: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !0
@@ -41,8 +41,6 @@
 ; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
 ; CHECKUF2-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX8]]
 ; CHECKUF2: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf
-; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
-; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
 ; CHECKUF2: vector.body:
 ; CHECKUF2: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
@@ -60,6 +58,8 @@
 ; CHECKUF2: %[[VSCALE2:.*]] = shl i64 %[[VSCALE]], 2
 ; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, ptr %[[IDXA]], i64 %[[VSCALE2]]
 ; CHECKUF2: store %[[FADD_NEXT]], ptr %[[IDXA_NEXT]], align 8
+; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3
 ; CHECKUF2: %index.next = add nuw i64 %index, %[[VSCALEX8]]
 ; CHECKUF2: %[[CMP:.*]] = icmp eq i64 %index.next, %n.vec
 ; CHECKUF2: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !0
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
index e68e658f0e879..afe16c71f7f9c 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-reduction-inloop.ll
@@ -5,10 +5,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define i8 @reduction_add_trunc(ptr noalias nocapture %A) {
 ; CHECK-LABEL: @reduction_add_trunc(
-; CHECK: call i32 @llvm.vscale.i32()
-; CHECK: call i32 @llvm.vscale.i32()
-; CHECK: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 16
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, i32 255, i32 0), %vector.ph ], [ [[TMP34:%.*]], %vector.body ]
@@ -25,6 +21,8 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) {
 ; CHECK-NEXT: [[TMP35:%.*]] = trunc [[TMP29]] to
 ; CHECK-NEXT: [[TMP34]] = zext [[TMP33]] to
 ; CHECK-NEXT: [[TMP36]] = zext [[TMP35]] to
+; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 16
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP31]]
 ; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], {{%.*}}
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll b/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
index c7ad80f937570..8c92074abfd82 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-trunc-min-bitwidth.ll
@@ -15,14 +15,14 @@ define void @trunc_minimal_bitwidth(ptr %bptr, ptr noalias %hptr, i32 %val, i64
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP4:%.*]] = trunc [[BROADCAST_SPLAT]] to
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[HPTR:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT: store [[TMP4]], ptr [[TMP5]], align 2
+; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -76,8 +76,6 @@ define void @trunc_minimal_bitwidths_shufflevector (ptr %p, i32 %arg1, i64 %len)
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[ARG1:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
 ; CHECK-NEXT: [[TMP4:%.*]] = trunc [[BROADCAST_SPLAT]] to
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -86,6 +84,8 @@ define void
@trunc_minimal_bitwidths_shufflevector (ptr %p, i32 %arg1, i64 %len) ; CHECK-NEXT: [[TMP6:%.*]] = xor [[WIDE_LOAD]], [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = mul [[TMP6]], [[WIDE_LOAD]] ; CHECK-NEXT: store [[TMP7]], ptr [[TMP5]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 4e786cff79a47..f129cf3e2a1aa 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -809,22 +809,23 @@ exit: define void @zext_nneg(ptr noalias %p, ptr noalias %p1) { ; CHECK-LABEL: LV: Checking a loop in 'zext_nneg' ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { -; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count -; CHECK-NEXT: Live-in ir<1000> = original trip-count +; CHECK-NEXT: Live-in vp<%0> = vector-trip-count +; CHECK-NEXT: Live-in ir<0> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%p>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN ir<%l> = load ir<%idx> -; CHECK-NEXT: WIDEN-CAST ir<%zext> = zext nneg ir<%l> -; CHECK-NEXT: REPLICATE store ir<%zext>, ir<%p1> -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = VF * UF + nuw vp<[[CAN_IV]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT vp<%1> = CANONICAL-INDUCTION ir<0>, vp<%8> +; CHECK-NEXT: vp<%2> = DERIVED-IV ir<0> + vp<%1> * ir<1> (truncated to i32) +; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<1> +; CHECK-NEXT: CLONE ir<%zext> = zext nneg vp<%3> +; CHECK-NEXT: CLONE ir<%idx2> = getelementptr ir<%p>, ir<%zext> +; CHECK-NEXT: WIDEN ir<%1> = load ir<%idx2> +; CHECK-NEXT: REPLICATE store ir<%1>, ir<%p1> +; CHECK-NEXT: EMIT vp<%8> = VF * UF + nuw vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%8>, vp<%0> ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; @@ -833,12 +834,13 @@ entry: body: %iv = phi i64 [ %next, %body ], [ 0, %entry ] - %idx = getelementptr i32, ptr %p, i64 %iv - %l = load i32, ptr %idx, align 8 - %zext = zext nneg i32 %l to i64 - store i64 %zext, ptr %p1, align 8 + %0 = trunc i64 %iv to i32 + %zext = zext nneg i32 %0 to i64 + %idx2 = getelementptr double, ptr %p, i64 %zext + %1 = load double, ptr %idx2, align 8 + store double %1, ptr %p1, align 8 %next = add i64 %iv, 1 - %cmp = icmp eq i64 %next, 1000 + %cmp = icmp eq i64 %next, 0 br i1 %cmp, label %exit, label %body exit: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll index 2510aeaebcf0c..e5582548447ae 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll @@ -64,31 +64,31 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER9]] ; CHECK: for.body.preheader9: ; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], 
[ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[INDVARS_IV_PH]], -1 +; CHECK-NEXT: [[TMP19:%.*]] = add nsw i64 [[TMP18]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 7 ; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 ; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL_PREHEADER:%.*]] ; CHECK: for.body.prol.preheader: -; CHECK-NEXT: [[TMP18:%.*]] = fdiv fast double 1.000000e+00, [[A]] +; CHECK-NEXT: [[TMP20:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: br label [[FOR_BODY_PROL:%.*]] ; CHECK: for.body.prol: ; CHECK-NEXT: [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_BODY_PROL]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PROL_PREHEADER]] ] ; CHECK-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_NEXT:%.*]], [[FOR_BODY_PROL]] ], [ 0, [[FOR_BODY_PROL_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_PROL]] ; CHECK-NEXT: [[T0_PROL:%.*]] = load double, ptr [[ARRAYIDX_PROL]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP19:%.*]] = fmul fast double [[T0_PROL]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = fmul fast double [[T0_PROL]], [[TMP20]] ; CHECK-NEXT: [[ARRAYIDX2_PROL:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_PROL]] -; CHECK-NEXT: store double [[TMP19]], ptr [[ARRAYIDX2_PROL]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP21]], ptr [[ARRAYIDX2_PROL]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 1 ; CHECK-NEXT: [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1 ; CHECK-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER]] ; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.body.prol.loopexit: ; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER9]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ] -; CHECK-NEXT: [[TMP20:%.*]] = sub nsw i64 [[INDVARS_IV_PH]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: [[TMP21:%.*]] = icmp ugt i64 [[TMP20]], -8 -; CHECK-NEXT: br i1 [[TMP21]], label [[FOR_END]], label [[FOR_BODY_PREHEADER9_NEW:%.*]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp ult i64 [[TMP19]], 7 +; CHECK-NEXT: br i1 [[TMP22]], label [[FOR_END]], label [[FOR_BODY_PREHEADER9_NEW:%.*]] ; CHECK: for.body.preheader9.new: -; CHECK-NEXT: [[TMP22:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP23:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP24:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP25:%.*]] = fdiv fast double 1.000000e+00, [[A]] @@ -96,56 +96,57 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: [[TMP27:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP28:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: [[TMP29:%.*]] = fdiv fast double 1.000000e+00, [[A]] +; CHECK-NEXT: [[TMP30:%.*]] = fdiv fast double 1.000000e+00, [[A]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER9_NEW]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP30:%.*]] = fmul fast double [[T0]], [[TMP22]] +; 
CHECK-NEXT: [[TMP31:%.*]] = fmul fast double [[T0]], [[TMP23]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store double [[TMP30]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP31]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[T0_1:%.*]] = load double, ptr [[ARRAYIDX_1]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP31:%.*]] = fmul fast double [[T0_1]], [[TMP23]] +; CHECK-NEXT: [[TMP32:%.*]] = fmul fast double [[T0_1]], [[TMP24]] ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: store double [[TMP31]], ptr [[ARRAYIDX2_1]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP32]], ptr [[ARRAYIDX2_1]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_1]] ; CHECK-NEXT: [[T0_2:%.*]] = load double, ptr [[ARRAYIDX_2]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP32:%.*]] = fmul fast double [[T0_2]], [[TMP24]] +; CHECK-NEXT: [[TMP33:%.*]] = fmul fast double [[T0_2]], [[TMP25]] ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_1]] -; CHECK-NEXT: store double [[TMP32]], ptr [[ARRAYIDX2_2]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP33]], ptr [[ARRAYIDX2_2]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_2]] ; CHECK-NEXT: [[T0_3:%.*]] = load double, ptr [[ARRAYIDX_3]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP33:%.*]] = fmul fast double [[T0_3]], [[TMP25]] +; CHECK-NEXT: [[TMP34:%.*]] = fmul fast double [[T0_3]], [[TMP26]] ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_2]] -; CHECK-NEXT: store double [[TMP33]], ptr [[ARRAYIDX2_3]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP34]], ptr [[ARRAYIDX2_3]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4 ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_3]] ; CHECK-NEXT: [[T0_4:%.*]] = load double, ptr [[ARRAYIDX_4]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP34:%.*]] = fmul fast double [[T0_4]], [[TMP26]] +; CHECK-NEXT: [[TMP35:%.*]] = fmul fast double [[T0_4]], [[TMP27]] ; CHECK-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_3]] -; CHECK-NEXT: store double [[TMP34]], ptr [[ARRAYIDX2_4]], align 8, !tbaa [[TBAA3]] +; CHECK-NEXT: store double [[TMP35]], ptr [[ARRAYIDX2_4]], align 8, !tbaa [[TBAA3]] ; CHECK-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 5 ; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_4]] ; CHECK-NEXT: [[T0_5:%.*]] = load double, ptr [[ARRAYIDX_5]], align 8, !tbaa [[TBAA3]] -; CHECK-NEXT: [[TMP35:%.*]] = fmul fast double [[T0_5]], [[TMP27]] +; CHECK-NEXT: [[TMP36:%.*]] = fmul fast double [[T0_5]], [[TMP28]] ; CHECK-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_4]] -; CHECK-NEXT: store 
double [[TMP35]], ptr [[ARRAYIDX2_5]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT: store double [[TMP36]], ptr [[ARRAYIDX2_5]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 6
; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_5]]
; CHECK-NEXT: [[T0_6:%.*]] = load double, ptr [[ARRAYIDX_6]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT: [[TMP36:%.*]] = fmul fast double [[T0_6]], [[TMP28]]
+; CHECK-NEXT: [[TMP37:%.*]] = fmul fast double [[T0_6]], [[TMP29]]
; CHECK-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_5]]
-; CHECK-NEXT: store double [[TMP36]], ptr [[ARRAYIDX2_6]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT: store double [[TMP37]], ptr [[ARRAYIDX2_6]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 7
; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDVARS_IV_NEXT_6]]
; CHECK-NEXT: [[T0_7:%.*]] = load double, ptr [[ARRAYIDX_7]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT: [[TMP37:%.*]] = fmul fast double [[T0_7]], [[TMP29]]
+; CHECK-NEXT: [[TMP38:%.*]] = fmul fast double [[T0_7]], [[TMP30]]
; CHECK-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDVARS_IV_NEXT_6]]
-; CHECK-NEXT: store double [[TMP37]], ptr [[ARRAYIDX2_7]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT: store double [[TMP38]], ptr [[ARRAYIDX2_7]], align 8, !tbaa [[TBAA3]]
; CHECK-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8
; CHECK-NEXT: [[EXITCOND_NOT_7:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_7]], [[WIDE_TRIP_COUNT]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT_7]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 6c34446e8d663..8e242cdd1d905 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -572,7 +572,7 @@ BenchmarkRunner::createFunctionExecutor(
llvm_unreachable("ExecutionMode is outside expected range");
}
-std::pair<Error, Benchmark> BenchmarkRunner::runConfiguration(
+Expected<Benchmark> BenchmarkRunner::runConfiguration(
RunnableConfiguration &&RC,
const std::optional<StringRef> &DumpFile) const {
Benchmark &InstrBenchmark = RC.InstrBenchmark;
@@ -583,7 +583,8 @@ std::pair<Error, Benchmark> BenchmarkRunner::runConfiguration(
auto ObjectFilePath =
writeObjectFile(ObjectFile.getBinary()->getData(), *DumpFile);
if (Error E = ObjectFilePath.takeError()) {
- return {std::move(E), std::move(InstrBenchmark)};
+ InstrBenchmark.Error = toString(std::move(E));
+ return std::move(InstrBenchmark);
}
outs() << "Check generated assembly with: /usr/bin/objdump -d "
<< *ObjectFilePath << "\n";
@@ -591,17 +592,20 @@ std::pair<Error, Benchmark> BenchmarkRunner::runConfiguration(
if (BenchmarkPhaseSelector < BenchmarkPhaseSelectorE::Measure) {
InstrBenchmark.Error = "actual measurements skipped.";
- return {Error::success(), std::move(InstrBenchmark)};
+ return std::move(InstrBenchmark);
}
Expected<std::unique_ptr<FunctionExecutor>> Executor =
createFunctionExecutor(std::move(ObjectFile), RC.InstrBenchmark.Key);
if (!Executor)
- return {Executor.takeError(), std::move(InstrBenchmark)};
+ return Executor.takeError();
auto NewMeasurements = runMeasurements(**Executor);
if (Error E = NewMeasurements.takeError()) {
- return {std::move(E), std::move(InstrBenchmark)};
+ if (!E.isA<SnippetCrash>())
+ return std::move(E);
+ InstrBenchmark.Error = toString(std::move(E));
+ return std::move(InstrBenchmark);
}
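// --- Editorial sketch, not part of this patch: the rewrite above trades the
// std::pair<Error, Benchmark> convention for llvm::Expected<Benchmark>, so
// benign snippet failures are folded into Benchmark::Error while framework
// errors propagate. MiniBenchmark and doMeasure below are hypothetical
// stand-ins for the exegesis types; StringError stands in for SnippetCrash.
#include "llvm/Support/Error.h"
#include <string>

struct MiniBenchmark {
  std::string Error; // mirrors exegesis::Benchmark::Error
};

static llvm::Error doMeasure(MiniBenchmark &) {
  // Simulate a benign snippet failure.
  return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                 "snippet crashed");
}

static llvm::Expected<MiniBenchmark> runOnce(MiniBenchmark B) {
  if (llvm::Error E = doMeasure(B)) {
    // Record benign failures on the result and report success; this is the
    // same shape runConfiguration uses above.
    if (!E.isA<llvm::StringError>())
      return std::move(E); // framework error: propagate to the caller
    B.Error = llvm::toString(std::move(E));
  }
  return std::move(B); // callers unwrap with ExitOnError, as below
}
// --- End of sketch.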
assert(InstrBenchmark.NumRepetitions > 0 && "invalid NumRepetitions");
for (BenchmarkMeasure &BM : *NewMeasurements) {
@@ -614,7 +618,7 @@ std::pair<Error, Benchmark> BenchmarkRunner::runConfiguration(
}
InstrBenchmark.Measurements = std::move(*NewMeasurements);
- return {Error::success(), std::move(InstrBenchmark)};
+ return std::move(InstrBenchmark);
}
Expected<std::string>
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
index 2c48d07e37ca9..24f2086289408 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h
@@ -65,7 +65,7 @@ class BenchmarkRunner {
unsigned NumRepetitions, unsigned LoopUnrollFactor,
const SnippetRepetitor &Repetitor) const;
- std::pair<Error, Benchmark>
+ Expected<Benchmark>
runConfiguration(RunnableConfiguration &&RC,
const std::optional<StringRef> &DumpFile) const;
diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
index 148891a18246f..4e466303c9dbf 100644
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -410,18 +410,8 @@ static void runBenchmarkConfigurations(
std::optional<StringRef> DumpFile;
if (DumpObjectToDisk.getNumOccurrences())
DumpFile = DumpObjectToDisk;
- auto [Err, InstrBenchmark] =
- Runner.runConfiguration(std::move(RC), DumpFile);
- if (Err) {
- // Errors from executing the snippets are fine.
- // All other errors are a framework issue and should fail.
- if (!Err.isA<SnippetCrash>()) {
- llvm::errs() << "llvm-exegesis error: " << toString(std::move(Err));
- exit(1);
- }
- InstrBenchmark.Error = toString(std::move(Err));
- }
- AllResults.push_back(std::move(InstrBenchmark));
+ AllResults.emplace_back(
+ ExitOnErr(Runner.runConfiguration(std::move(RC), DumpFile)));
}
Benchmark &Result = AllResults.front();
diff --git a/llvm/unittests/Analysis/ValueLatticeTest.cpp b/llvm/unittests/Analysis/ValueLatticeTest.cpp
index b456f286e0d96..ae221811f3fb5 100644
--- a/llvm/unittests/Analysis/ValueLatticeTest.cpp
+++ b/llvm/unittests/Analysis/ValueLatticeTest.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/ValueLattice.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
diff --git a/llvm/unittests/Analysis/VectorUtilsTest.cpp b/llvm/unittests/Analysis/VectorUtilsTest.cpp
index 14958aa646a04..1b3a8b0259f01 100644
--- a/llvm/unittests/Analysis/VectorUtilsTest.cpp
+++ b/llvm/unittests/Analysis/VectorUtilsTest.cpp
@@ -17,6 +17,7 @@
#include "llvm/IR/NoFolder.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/KnownBits.h"
#include "gtest/gtest.h"
using namespace llvm;
diff --git a/llvm/unittests/TableGen/AutomataTest.cpp b/llvm/unittests/TableGen/AutomataTest.cpp
index 833cf51b19a10..53f66127292f8 100644
--- a/llvm/unittests/TableGen/AutomataTest.cpp
+++ b/llvm/unittests/TableGen/AutomataTest.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Automaton.h"
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index acd0b9b9d65cb..abd87b455aceb 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -469,19 +469,13 @@ INSTANTIATE_TEST_SUITE_P(
ARM::AEK_FP | ARM::AEK_RAS | ARM::AEK_LOB | ARM::AEK_FP16 | ARM::AEK_PACBTI, "8.1-M.Mainline"), - ARMCPUTestParams("cortex-m52", "armv8.1-m.main", - "fp-armv8-fullfp16-d16", - ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_SIMD | - ARM::AEK_FP | ARM::AEK_RAS | ARM::AEK_LOB | - ARM::AEK_FP16 | ARM::AEK_PACBTI, - "8.1-M.Mainline"), ARMCPUTestParams("iwmmxt", "iwmmxt", "none", ARM::AEK_NONE, "iwmmxt"), ARMCPUTestParams("xscale", "xscale", "none", ARM::AEK_NONE, "xscale"), ARMCPUTestParams("swift", "armv7s", "neon-vfpv4", ARM::AEK_HWDIVARM | ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP, "7-S"))); -static constexpr unsigned NumARMCPUArchs = 90; +static constexpr unsigned NumARMCPUArchs = 89; TEST(TargetParserTest, testARMCPUArchList) { SmallVector List; diff --git a/llvm/utils/TableGen/GlobalISelMatchTableExecutorEmitter.h b/llvm/utils/TableGen/GlobalISelMatchTableExecutorEmitter.h index 7e952d6df3093..c30198f11195c 100644 --- a/llvm/utils/TableGen/GlobalISelMatchTableExecutorEmitter.h +++ b/llvm/utils/TableGen/GlobalISelMatchTableExecutorEmitter.h @@ -20,6 +20,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include +#include namespace llvm { class CodeGenTarget; diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp index 28604c5600bf4..3402809c4828c 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp @@ -12,6 +12,7 @@ #include "CodeGenIntrinsics.h" #include "SequenceToOffsetTable.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" diff --git a/mlir/cmake/modules/MLIRConfig.cmake.in b/mlir/cmake/modules/MLIRConfig.cmake.in index d4da3cd98cce9..cd8cec5dedf3c 100644 --- a/mlir/cmake/modules/MLIRConfig.cmake.in +++ b/mlir/cmake/modules/MLIRConfig.cmake.in @@ -13,7 +13,6 @@ set(MLIR_TABLEGEN_EXE "@MLIR_CONFIG_TABLEGEN_EXE@") set(MLIR_PDLL_TABLEGEN_EXE "@MLIR_CONFIG_PDLL_TABLEGEN_EXE@") set(MLIR_INSTALL_AGGREGATE_OBJECTS "@MLIR_INSTALL_AGGREGATE_OBJECTS@") set(MLIR_ENABLE_BINDINGS_PYTHON "@MLIR_ENABLE_BINDINGS_PYTHON@") -set(MLIR_ENABLE_EXECUTION_ENGINE "@MLIR_ENABLE_EXECUTION_ENGINE@") # For mlir_tablegen() set(MLIR_INCLUDE_DIR "@MLIR_INCLUDE_DIR@") diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index d7690b84807f6..88d9cd2c71c0c 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -284,10 +284,10 @@ def LLVM_GEPOp : LLVM_Op<"getelementptr", [Pure, }]; let builders = [ - OpBuilder<(ins "Type":$resultType, "Type":$elementType, "Value":$basePtr, + OpBuilder<(ins "Type":$resultType, "Type":$basePtrType, "Value":$basePtr, "ValueRange":$indices, CArg<"bool", "false">:$inbounds, CArg<"ArrayRef", "{}">:$attributes)>, - OpBuilder<(ins "Type":$resultType, "Type":$elementType, "Value":$basePtr, + OpBuilder<(ins "Type":$resultType, "Type":$basePtrType, "Value":$basePtr, "ArrayRef":$indices, CArg<"bool", "false">:$inbounds, CArg<"ArrayRef", "{}">:$attributes)>, ]; diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index a848d12fbbb50..3f4dfe42b71fd 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -493,8 +493,6 @@ LogicalResult dropUnitDims(RewriterBase &rewriter, GenericOp genericOp, struct ElementwiseOpFusionResult { Operation *fusedOp; llvm::DenseMap 
replacements; - static llvm::SmallDenseSet - getPreservedProducerResults(GenericOp producer, GenericOp consumer); }; FailureOr fuseElementwiseOps(RewriterBase &rewriter, OpOperand *fusedOperand); diff --git a/mlir/include/mlir/Dialect/Math/IR/MathOps.td b/mlir/include/mlir/Dialect/Math/IR/MathOps.td index 9742d3d936dff..f8e9fd601304b 100644 --- a/mlir/include/mlir/Dialect/Math/IR/MathOps.td +++ b/mlir/include/mlir/Dialect/Math/IR/MathOps.td @@ -300,35 +300,6 @@ def Math_CosOp : Math_FloatUnaryOp<"cos"> { let hasFolder = 1; } -//===----------------------------------------------------------------------===// -// AcosOp -//===----------------------------------------------------------------------===// - -def Math_AcosOp : Math_FloatUnaryOp<"acos"> { - let summary = "arcus cosine of the specified value"; - let description = [{ - Syntax: - - ``` - operation ::= ssa-id `=` `math.acos` ssa-use `:` type - ``` - - The `acos` operation computes the arcus cosine of a given value. It takes one - operand of floating point type (i.e., scalar, tensor or vector) and returns one - result of the same type. It has no standard attributes. - - Example: - - ```mlir - // Scalar arcus cosine value. - %a = math.acos %b : f64 - ``` - }]; - let hasFolder = 1; -} - - - //===----------------------------------------------------------------------===// // SinOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index b68baffb5a856..75dee09d2f64f 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -1340,18 +1340,18 @@ LogicalResult ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern::matchAndRewrite( } template -static Value genConstInt32From(OpBuilder &builder, Location loc, T tValue) { +static Value genConstInt32From(OpBuilder &builder, Location loc, T TValue) { Type llvmInt32Type = builder.getIntegerType(32); return builder.create(loc, llvmInt32Type, - static_cast(tValue)); + static_cast(TValue)); } template -static Value genConstFloat32From(OpBuilder &builder, Location loc, T tValue) { +static Value genConstFloat32From(OpBuilder &builder, Location loc, T TValue) { Type llvmFloat32Type = builder.getF32Type(); return builder.create( loc, llvmFloat32Type, - builder.getF32FloatAttr(static_cast(tValue))); + builder.getF32FloatAttr(static_cast(TValue))); } LogicalResult ConvertCreateDnTensorOpToGpuRuntimeCallPattern::matchAndRewrite( @@ -1629,7 +1629,7 @@ LogicalResult ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite( auto stream = adaptor.getAsyncDependencies().front(); Value bufferSize; if (is2To4Sparsity(op.getSpmatA())) { - auto pruneFlag = + auto prune_flag = genConstInt32From(rewriter, loc, get2To4PruneFlag(op.getSpmatA())); auto computeType = genConstInt32From( rewriter, loc, getCuSparseLtDataTypeFrom(adaptor.getComputeType())); @@ -1641,7 +1641,7 @@ LogicalResult ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite( .create(loc, rewriter, {bufferSize, modeA, modeB, adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, - pruneFlag, stream}) + prune_flag, stream}) .getResult(); auto bufferSizePtr1 = rewriter.create( diff --git a/mlir/lib/Conversion/MathToLibm/MathToLibm.cpp b/mlir/lib/Conversion/MathToLibm/MathToLibm.cpp index 27c2cb9352071..103c1fb8c3822 100644 --- a/mlir/lib/Conversion/MathToLibm/MathToLibm.cpp +++ 
b/mlir/lib/Conversion/MathToLibm/MathToLibm.cpp @@ -162,7 +162,6 @@ ScalarOpToLibmCall::matchAndRewrite(Op op, void mlir::populateMathToLibmConversionPatterns(RewritePatternSet &patterns) { MLIRContext *ctx = patterns.getContext(); - populatePatternsForOp(patterns, ctx, "acosf", "acos"); populatePatternsForOp(patterns, ctx, "atan2f", "atan2"); populatePatternsForOp(patterns, ctx, "atanf", "atan"); populatePatternsForOp(patterns, ctx, "cbrtf", "cbrt"); diff --git a/mlir/lib/Conversion/MemRefToSPIRV/MapMemRefStorageClassPass.cpp b/mlir/lib/Conversion/MemRefToSPIRV/MapMemRefStorageClassPass.cpp index c6ef5be2494ad..35a3af07940a8 100644 --- a/mlir/lib/Conversion/MemRefToSPIRV/MapMemRefStorageClassPass.cpp +++ b/mlir/lib/Conversion/MemRefToSPIRV/MapMemRefStorageClassPass.cpp @@ -205,7 +205,7 @@ spirv::MemorySpaceToStorageClassConverter::MemorySpaceToStorageClassConverter( static bool isLegalType(Type type) { if (auto memRefType = dyn_cast(type)) { Attribute spaceAttr = memRefType.getMemorySpace(); - return isa_and_nonnull(spaceAttr); + return spaceAttr && isa(spaceAttr); } return true; } diff --git a/mlir/lib/Conversion/PDLToPDLInterp/PredicateTree.cpp b/mlir/lib/Conversion/PDLToPDLInterp/PredicateTree.cpp index a9c3b0a71ef0d..baab40020ac2e 100644 --- a/mlir/lib/Conversion/PDLToPDLInterp/PredicateTree.cpp +++ b/mlir/lib/Conversion/PDLToPDLInterp/PredicateTree.cpp @@ -856,7 +856,7 @@ static void foldSwitchToBool(std::unique_ptr &node) { // If the node only contains one child, collapse it into a boolean predicate // node. if (children.size() == 1) { - auto *childIt = children.begin(); + auto childIt = children.begin(); node = std::make_unique( node->getPosition(), node->getQuestion(), childIt->first, std::move(childIt->second), std::move(node->getFailureNode())); diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp index 56fcf25c9ddcc..f151011ee48af 100644 --- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp +++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp @@ -303,9 +303,8 @@ static bool supportsMMaMatrixType(Operation *op, bool useNvGpu) { /// `getSlice`. In scf.for we only want to include as part of the slice elements /// that are part of the use/def chain. static SetVector -getSliceContract(Operation *op, - const BackwardSliceOptions &backwardSliceOptions, - const ForwardSliceOptions &forwardSliceOptions) { +getSliceContract(Operation *op, BackwardSliceOptions backwardSliceOptions, + ForwardSliceOptions forwardSliceOptions) { SetVector slice; slice.insert(op); unsigned currentIndex = 0; @@ -558,7 +557,7 @@ convertTransferReadOp(RewriterBase &rewriter, vector::TransferReadOp op, auto elType = op.getVectorType().getElementType(); const char *fragType = inferFragType(op); if (op->hasOneUse()) { - auto *user = *op->user_begin(); + auto user = *op->user_begin(); // Infer the signedness of the mma type from the integer extend. bool isSignedExtend = isa(user); if (isSignedExtend || isa(user)) { diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp index 3eb91190751ef..dc5ea28b67cdc 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @@ -71,25 +71,6 @@ static AffineMap getIndexingMapOfProducerOperandsInCoordinatesOfFusedOp( return t1.compose(fusedConsumerArgIndexMap); } -/// Returns a set of indices of the producer's results which would -/// be preserved after the fusion. 
-llvm::SmallDenseSet<int>
-ElementwiseOpFusionResult::getPreservedProducerResults(GenericOp producer,
- GenericOp consumer) {
- llvm::SmallDenseSet<int> preservedProducerResults;
- for (const auto &producerResult : llvm::enumerate(producer->getResults())) {
- auto *outputOperand = producer.getDpsInitOperand(producerResult.index());
- if (producer.payloadUsesValueFromOperand(outputOperand) ||
- !producer.canOpOperandsBeDropped(outputOperand) ||
- llvm::any_of(producerResult.value().getUsers(), [&](Operation *user) {
- return user != consumer.getOperation();
- })) {
- preservedProducerResults.insert(producerResult.index());
- }
- }
- return preservedProducerResults;
-}
-
/// Conditions for elementwise fusion of generic operations.
bool mlir::linalg::areElementwiseOpsFusable(OpOperand *fusedOperand) {
if (!fusedOperand)
@@ -304,9 +285,17 @@ mlir::linalg::fuseElementwiseOps(RewriterBase &rewriter,
assert(consumer.isDpsInput(fusedOperand) &&
"expected producer of input operand");
/// Find the results of the producer that have uses outside of the consumer.
- llvm::SmallDenseSet<int> preservedProducerResults =
- ElementwiseOpFusionResult::getPreservedProducerResults(producer,
- consumer);
+ llvm::SmallDenseSet<int> preservedProducerResults;
+ for (const auto &producerResult : llvm::enumerate(producer->getResults())) {
+ auto *outputOperand = producer.getDpsInitOperand(producerResult.index());
+ if (producer.payloadUsesValueFromOperand(outputOperand) ||
+ !producer.canOpOperandsBeDropped(outputOperand) ||
+ llvm::any_of(producerResult.value().getUsers(), [&](Operation *user) {
+ return user != consumer.getOperation();
+ })) {
+ preservedProducerResults.insert(producerResult.index());
+ }
+ }
// Compute the fused operands list and indexing maps.
SmallVector<Value> fusedInputOperands, fusedOutputOperands;
diff --git a/mlir/lib/Dialect/Math/IR/MathOps.cpp b/mlir/lib/Dialect/Math/IR/MathOps.cpp
index 066a21c76f7d1..28d1c062f235e 100644
--- a/mlir/lib/Dialect/Math/IR/MathOps.cpp
+++ b/mlir/lib/Dialect/Math/IR/MathOps.cpp
@@ -41,24 +41,6 @@ OpFoldResult math::AbsIOp::fold(FoldAdaptor adaptor) {
[](const APInt &a) { return a.abs(); });
}
-//===----------------------------------------------------------------------===//
-// AcosOp folder
-//===----------------------------------------------------------------------===//
-
-OpFoldResult math::AcosOp::fold(FoldAdaptor adaptor) {
- return constFoldUnaryOpConditional<FloatAttr>(
- adaptor.getOperands(), [](const APFloat &a) -> std::optional<APFloat> {
- switch (a.getSizeInBits(a.getSemantics())) {
- case 64:
- return APFloat(acos(a.convertToDouble()));
- case 32:
- return APFloat(acosf(a.convertToFloat()));
- default:
- return {};
- }
- });
-}
-
//===----------------------------------------------------------------------===//
// AtanOp folder
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
index 08d37b6a9656f..75121b5e3ce2e 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
@@ -147,30 +147,40 @@ static Value genSparseReducedAffineCond(OpBuilder &builder, Location loc,
// Helper functions that load/store into the position buffer for slice-driven
// loops.
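// --- Editorial sketch, not part of this patch: a sanity check of the buffer
// layout handled below, i.e. a [size, curPtr] header followed by the three
// pLo/pHi/pNx rows of tupleCnt entries each (so kSliceIterWidth == 3). Plain
// int64_t values stand in for the emitted IR values; the index helpers
// mirror getSlicePosIdx.
#include <cstdint>

constexpr int64_t kWidth = 3; // pLo, pHi, pNx rows
constexpr int64_t loIdx(int64_t t) { return 2 + t; }
constexpr int64_t hiIdx(int64_t cnt, int64_t t) { return 2 + cnt + t; }
constexpr int64_t nxIdx(int64_t cnt, int64_t t) { return 2 + 2 * cnt + t; }

constexpr int64_t tupleCnt = 4;
constexpr int64_t bufSz = tupleCnt * kWidth + 2; // as in allocSlicePosBuf
static_assert((bufSz - 2) / kWidth == tupleCnt,
              "getSlicePosIdx recovers the tuple count");
static_assert(loIdx(0) == 2 && hiIdx(tupleCnt, 0) == loIdx(tupleCnt),
              "rows are laid out back to back after the two metadata slots");
static_assert(nxIdx(tupleCnt, tupleCnt - 1) == bufSz - 1,
              "the last pNx entry occupies the last slot");
// --- End of sketch.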
-// The sliced pointer buffer is organized as:
-// [[pLo0, pLo1, pLo2, ...],
-// [pHi0, pHi1, pHi2, ...],
-// [pNx0, pNx1, pNx2, ...]]
+// The sliced pointer buffer is organized as:
+// [size, curPtr] (two metadata) + [[pLo0, pLo1, pLo2, ...],
+// [pHi0, pHi1, pHi2, ...],
+// [pNx0, pNx1, pNx2, ...]]
static Value allocSlicePosBuf(OpBuilder &builder, Location loc,
Value tupleCnt) {
Value bufSz = MULI(tupleCnt, C_IDX(kSliceIterWidth));
// Additional two metadata {memSize, idx} at head.
+ bufSz = ADDI(bufSz, C_IDX(2));
return genAlloca(builder, loc, bufSz, builder.getIndexType());
}
+// TODO: We should use an SSA value for it.
+// Gets and sets metadata.
+static Value loadSlicePosPtr(OpBuilder &builder, Location loc, Value sPosBuf) {
+ return genIndexLoad(builder, loc, sPosBuf, C_IDX(1));
+}
+static void updateSlicePosPtr(OpBuilder &builder, Location loc, Value sPosBuf,
+ Value pPtr) {
+ builder.create<memref::StoreOp>(loc, pPtr, sPosBuf, C_IDX(1));
+}
// Gets and sets position values for slice-driven loops.
enum class SlicePosKind { kLo, kHi, kNext };
static Value getSlicePosIdx(OpBuilder &builder, Location loc, Value posBuf,
Value tupleIdx, SlicePosKind posKind) {
Value dim = builder.create<memref::DimOp>(loc, posBuf, C_IDX(0));
- Value tupleCnt = DIVUI(dim, C_IDX(kSliceIterWidth));
+ Value tupleCnt = DIVUI(SUBI(dim, C_IDX(2)), C_IDX(kSliceIterWidth));
switch (posKind) {
case SlicePosKind::kLo:
- return tupleIdx;
+ return ADDI(tupleIdx, C_IDX(2));
case SlicePosKind::kHi:
- return ADDI(tupleIdx, tupleCnt);
+ return ADDI(tupleIdx, ADDI(tupleCnt, C_IDX(2)));
case SlicePosKind::kNext:
- return ADDI(tupleIdx, MULI(tupleCnt, C_IDX(2)));
+ return ADDI(tupleIdx, ADDI(tupleCnt, ADDI(tupleCnt, C_IDX(2))));
}
llvm_unreachable("unexpected kind");
}
@@ -334,9 +344,6 @@ void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
this->dependentLvlMap.assign(
numTensors, std::vector<std::vector<std::pair<TensorLevel, unsigned>>>());
this->slicePosBuffer.assign(numTensors, std::vector<std::vector<Value>>());
- this->sliceTupleNxStartIdx.assign(numTensors, std::vector<Value>());
- this->sliceTupleFwdCnt.assign(numTensors, std::vector<Value>());
- this->trivialSlice.assign(numTensors, std::vector<bool>());
this->sliceMeta.assign(
numTensors, std::vector<std::vector<std::pair<Value, unsigned>>>());
this->sliceStack.assign(numTensors, std::vector<SliceInfo>());
@@ -387,9 +394,6 @@ void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
dependentLvlMap[tid].assign(
lvlRank, std::vector<std::pair<TensorLevel, unsigned>>());
slicePosBuffer[tid].assign(lvlRank, std::vector<Value>());
- sliceTupleNxStartIdx[tid].assign(lvlRank, Value());
- sliceTupleFwdCnt[tid].assign(lvlRank, Value());
- trivialSlice[tid].assign(lvlRank, false);
sliceMeta[tid].assign(lvlRank, std::vector<std::pair<Value, unsigned>>());
sliceStack[tid].emplace_back(/*minCrd=*/Value(), /*offset=*/Value(), /*isNonEmpty*/ Value(),
@@ -802,7 +806,6 @@ std::optional<Value> LoopEmitter::genWhileLoopBody(OpBuilder &builder,
assert(ivs.size() == 1);
// Coord is the relative offset related to its parents.
assert(sliceStack[tid].back().depth == 1 && "TODO: not yet implement");
- sliceTupleFwdCnt[tid][lvl] = SUBI(ivs[0], posits[tid][lvl]);
// Update c = absOffset[lvl][depth] - absOffset[lvl][depth - 1]
Value posit = ivs[0];
Value crdBuf = coordinatesBuffers[tid][lvl];
@@ -1321,12 +1324,6 @@ void LoopEmitter::enterTensorsAtDenseLvls(
} else {
posits[tid][lvl] =
genAddress(builder, loc, tid, lvl, ADDI(info.offset, iv));
- Value fwdCnt = lvl == 0 || trivialSlice[tid][lvl]
- ? C_IDX(0)
- : sliceTupleFwdCnt[tid][lvl - 1];
- Value sz = sliceMeta[tid][lvl].back().first;
- Value mul = MULI(fwdCnt, sz);
- sliceTupleFwdCnt[tid][lvl] = ADDI(mul, iv);
}
levelReducedDep[tid][lvl]++;
} else {
@@ -1360,7 +1357,13 @@ void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
assert(isDenseLT(lvlTypes[tid][lvl]));
assert(*info.slicedOnLvl == lvl);
(void)reduced;
+ // Resets slice pointers, as the resolved slices are invalidated after we
+ // move forward to the next slice.
+ invalidateSliceIterIdx(rewriter, loc, tid, lvl);
info.minCrd = info.offset = info.isNonEmpty = Value();
+ } else {
+ forwardsReducedSliceLevelTreeIt(rewriter, loc, tid, lvl,
+ constantIndex(rewriter, loc, 1));
}
levelReducedDep[tid][lvl]--;
}
@@ -1440,6 +1443,54 @@ void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
}
}
+void LoopEmitter::forwardsReducedSliceLevelTreeIt(OpBuilder &builder,
+ Location loc, TensorId tid,
+ Level rootLvl, Value fcnt) {
+
+ auto stt = getSparseTensorType(tensors[tid]);
+
+ // Finds a [Lvl, leafLvl) range where all levels in between are fully
+ // reduced sparse levels (but not resolved). Since we forward an iterator
+ // at a higher level of the tree, the subtree needs to be pruned.
+ Level leafLvl = rootLvl + 1;
+ while (leafLvl < stt.getLvlRank() && depFullyReduced(tid, leafLvl) &&
+ !stt.isDenseLvl(leafLvl)) {
+ leafLvl++;
+ }
+
+ Level curLvl = rootLvl + 1;
+ Value nxPosPtr = nullptr;
+ if (curLvl < leafLvl) {
+ assert(!isDenseLT(lvlTypes[tid][curLvl]));
+ // The first compressed level: set up the position pointer for it.
+ Value sPosBuf = slicePosBuffer[tid][curLvl].back();
+ // One step forward in the parent level results in forwarding one `segment`
+ // in the child sparse level.
+ Value pPosPtr = loadSlicePosPtr(builder, loc, sPosBuf); // previous ptr
+ Value cPosPtr = ADDI(fcnt, pPosPtr); // current ptr
+ updateSlicePosPtr(builder, loc, sPosBuf, cPosPtr);
+ // Loads the start of the position pointer for the next level.
+ nxPosPtr =
+ loadSlicePos(builder, loc, sPosBuf, cPosPtr, SlicePosKind::kNext);
+ curLvl++;
+ }
+
+ // TODO: This is not always needed, but we do it unconditionally for now for
+ // simplicity.
+ // It is only needed when `curLvl` is forwarded without traversing its child
+ // level (e.g., the level is in a conjunctive lattice and got pruned), such
+ // that the position pointer is not forwarded inside the loop.
+ for (; curLvl < leafLvl; curLvl++) {
+ assert(nxPosPtr);
+ if (!isDenseLT(lvlTypes[tid][curLvl])) {
+ Value sPosBuf = slicePosBuffer[tid][curLvl].back();
+ updateSlicePosPtr(builder, loc, sPosBuf, nxPosPtr);
+ nxPosPtr =
+ loadSlicePos(builder, loc, sPosBuf, nxPosPtr, SlicePosKind::kNext);
+ }
+ }
+}
+
void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc,
MutableArrayRef<Value> reduc) {
const LoopInfo &loopInfo = loopStack.back();
@@ -1489,6 +1540,13 @@ void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc,
forwarded = CMPI(eq, coords[tid][lvl], iv);
operands.push_back(SELECT(forwarded, nxPos, pos));
}
+ {
+ OpBuilder::InsertionGuard guard(builder);
+ auto ifOp = builder.create<scf::IfOp>(loc, TypeRange{}, forwarded,
+ /*else=*/false);
+ builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+ forwardsReducedSliceLevelTreeIt(builder, loc, tid, lvl, one);
+ }
// The coordinate is invalid now.
coords[tid][lvl] = nullptr;
@@ -1858,7 +1916,8 @@ void LoopEmitter::genResolvedSliceBegin(OpBuilder &builder, Location loc,
pHi = genIndexLoad(builder, loc, positionsBuffers[tid][lvl],
ADDI(posits[tid][lvl - 1], c1));
}
- // Fills out pIdxBuffer[tid][lvl][0] with [pLo, pHi]
+ // Fills out pIdxBuffer[tid][lvl][0] with [0, pLo, pHi]
+ updateSlicePosPtr(builder, loc, sPtrBuf, c0);
updateSlicePos(builder, loc, sPtrBuf, pLo, c0, SlicePosKind::kLo);
updateSlicePos(builder, loc, sPtrBuf, pHi, c0, SlicePosKind::kHi);
// Slice over a resolved parent, we only need one pair of pos hi and lo to
@@ -1997,6 +2056,8 @@ void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc,
Value isNonEmpty = result[0];
Value minCrd = result[1];
// Two metadata [memSize, idx].
+ // TODO: Could use an SSA value for these two metadata values.
+ updateSlicePosPtr(builder, loc, sPtrBuf, c0);
// FIXME: we need the relative offset related to the base slice.
Value absOffset = offsetFromMinCoord(builder, loc, minCrd, remSz, isNonEmpty);
sliceStack[tid].emplace_back(minCrd, absOffset, isNonEmpty, result[2], lvl,
@@ -2005,30 +2066,16 @@ bool LoopEmitter::genSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
Level lvl) {
- Value curLvlIdx = C_IDX(0);
if (depFullyReduced(tid, lvl)) {
- if (lvl == 0 || trivialSlice[tid][lvl]) {
- sliceTupleNxStartIdx[tid][lvl] = C_IDX(0);
- } else {
- if (isDenseLT(lvlTypes[tid][lvl])) {
- sliceTupleNxStartIdx[tid][lvl] = sliceTupleNxStartIdx[tid][lvl - 1];
- } else {
- assert(isCompressedLT(lvlTypes[tid][lvl]));
- curLvlIdx = ADDI(sliceTupleNxStartIdx[tid][lvl - 1],
- sliceTupleFwdCnt[0][lvl - 1]);
- sliceTupleNxStartIdx[tid][lvl] =
- loadSlicePos(builder, loc, slicePosBuffer[tid][lvl].back(),
- curLvlIdx, SlicePosKind::kNext);
- }
- }
+ // No need to prepare for the slice-driven loop on a dense level after it
+ // is fully reduced.
if (isDenseLT(lvlTypes[tid][lvl]))
return true;
-
- Value sPosBuf = slicePosBuffer[tid][lvl].back();
// If constraints on the tensor is fully resolved. We do not need to
// generates slice begin any more, instead we fall back to TACO-based
// algorithm to (co)iterates over the slice.
- Value tupleIdx = curLvlIdx;
+ Value sPosBuf = slicePosBuffer[tid][lvl].back();
+ Value tupleIdx = loadSlicePosPtr(builder, loc, sPosBuf);
posits[tid][lvl] =
loadSlicePos(builder, loc, sPosBuf, tupleIdx, SlicePosKind::kLo);
highs[tid][lvl] =
@@ -2087,16 +2134,23 @@ bool LoopEmitter::genSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
if (sliceInfo.isInitialTensor() ||
(lvl >= 1 && lvlFullyResolved(tid, lvl - 1))) {
// First level or previous level has been full resolved.
- trivialSlice[tid][lvl] = true;
genResolvedSliceBegin(builder, loc, tid, lvl);
} else {
// The previous level has not been full resolved.
- trivialSlice[tid][lvl] = false;
genUnResolvedSliceBegin(builder, loc, tid, lvl);
}
return false;
}
+void LoopEmitter::invalidateSliceIterIdx(OpBuilder &builder, Location loc,
+ TensorId tid, Level lvl) {
+ for (unsigned i = 0; i <= lvl; i++) {
+ if (!isDenseLT(lvlTypes[tid][i]) && !dependentLvlMap[tid][i].empty()) {
+ updateSlicePosPtr(builder, loc, slicePosBuffer[tid][i].back(), C_IDX(0));
+ }
+ }
+}
+
std::tuple<Value, Value, Value>
LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc,
TensorId tid, Level lvl) {
@@ -2121,6 +2175,10 @@ LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc,
// isNonEmpty = false;
//
Value absOffset = info.offset;
+ // Resets slice pointers, as the resolved slices are invalidated after we
+ // move forward to the next slice.
+ invalidateSliceIterIdx(builder, loc, tid, lvl);
+
SmallVector<Value> reduc = {info.minCrd, info.isNonEmpty, absOffset};
Value sPtrBuf = slicePosBuffer[tid][lvl][info.depth - 1];
Value fastPathP = CMPI(ugt, info.minCrd, absOffset);
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
index fa8b0076f733b..5e51cb2110fa1 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
@@ -453,6 +453,11 @@ class LoopEmitter {
return tid < lvlTypes.size() && lvl < lvlTypes[tid].size();
}
+ /// Forwards the (conceptual) "tree iterator" when iterating over a fully
+ /// reduced slice created by index-reduction.
+ void forwardsReducedSliceLevelTreeIt(OpBuilder &builder, Location loc,
+ TensorId tid, Level lvl, Value fcnt);
+
/// Prepares loop for iterating over `tensor[lvl]`, under the assumption
/// that `tensor[0...lvl-1]` loops have already been set up.
void prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
@@ -605,6 +610,11 @@ class LoopEmitter {
void genUnResolvedSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
Level lvl);
+ /// Invalidates the index kept in slice position buffers (by setting it to
+ /// zero).
+ /// TODO: We should instead use an SSA value for the index.
+ void invalidateSliceIterIdx(OpBuilder &builder, Location loc, TensorId tid,
+ Level lvl);
/// Generates code to get the first non-empty slice of tid on lvl.
/// return true if has already been resolved.
bool genSliceBegin(OpBuilder &builder, Location loc, TensorId tid, Level lvl);
@@ -673,9 +683,6 @@ class LoopEmitter {
// But they always starts with the first pidx pointing to coord > slice.offset
// to avoid iteration from the beginning.
std::vector<std::vector<std::vector<Value>>> slicePosBuffer;
- std::vector<std::vector<Value>> sliceTupleNxStartIdx;
- std::vector<std::vector<Value>> sliceTupleFwdCnt;
- std::vector<std::vector<bool>> trivialSlice;
// The (size, stride) for each conceptual slice used for index reduction
// loops.
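Taken together, the LoopEmitter changes above reintroduce a per-level curPtr and a forwarding protocol for the slice "tree iterator": advancing a parent slice by fcnt segments bumps the first sparse child's curPtr by fcnt, and every deeper sparse level then re-seeds its curPtr from its parent's pNx entry. The following is a host-side model of that protocol over plain vectors, a sketch under the [size, curPtr] + pLo/pHi/pNx layout above; forwardTreeIt is illustrative only (it assumes every deeper level is sparse and in bounds) and is not the emitted IR:

#include <cstdint>
#include <vector>

// One position buffer per level: [size, curPtr, pLo..., pHi..., pNx...].
using PosBuf = std::vector<int64_t>;

static int64_t nxIdx(const PosBuf &b, int64_t tupleIdx) {
  int64_t tupleCnt = (b[0] - 2) / 3; // kSliceIterWidth == 3
  return 2 + 2 * tupleCnt + tupleIdx;
}

// Model of forwardsReducedSliceLevelTreeIt: advance the first sparse child
// by fcnt segments, then propagate the resulting pNx pointer downwards.
static void forwardTreeIt(std::vector<PosBuf> &levels, int64_t fcnt) {
  if (levels.empty())
    return;
  levels[0][1] += fcnt; // curPtr += fcnt
  int64_t nx = levels[0][nxIdx(levels[0], levels[0][1])];
  for (size_t l = 1; l < levels.size(); ++l) {
    levels[l][1] = nx; // re-seed the child's curPtr
    nx = levels[l][nxIdx(levels[l], levels[l][1])];
  }
}

int main() {
  // Two sparse levels, two tuples each; pNx rows hold example pointers.
  //            size curPtr pLo   pHi   pNx
  PosBuf l0 = {  8,   0,    0, 3, 3, 6, 1, 0 };
  PosBuf l1 = {  8,   0,    0, 2, 2, 5, 1, 0 };
  std::vector<PosBuf> levels = {l0, l1};
  forwardTreeIt(levels, 1); // one segment forward in the parent
  return (levels[0][1] == 1 && levels[1][1] == 0) ? 0 : 1;
}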
diff --git a/mlir/lib/Dialect/Utils/IndexingUtils.cpp b/mlir/lib/Dialect/Utils/IndexingUtils.cpp index bb8a0d5912d7c..f4e29539214b4 100644 --- a/mlir/lib/Dialect/Utils/IndexingUtils.cpp +++ b/mlir/lib/Dialect/Utils/IndexingUtils.cpp @@ -70,7 +70,7 @@ SmallVector delinearizeImpl(ExprType linearIndex, //===----------------------------------------------------------------------===// SmallVector mlir::computeSuffixProduct(ArrayRef sizes) { - assert(llvm::all_of(sizes, [](int64_t s) { return s >= 0; }) && + assert(llvm::all_of(sizes, [](int64_t s) { return s > 0; }) && "sizes must be nonnegative"); int64_t unit = 1; return ::computeSuffixProductImpl(sizes, unit); diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp index 7decbce018a87..8e2bfe557c555 100644 --- a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp +++ b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp @@ -581,6 +581,9 @@ void GreedyPatternRewriteDriver::notifyOperationReplaced( }); if (config.listener) config.listener->notifyOperationReplaced(op, replacement); + for (auto result : op->getResults()) + for (auto *user : result.getUsers()) + addToWorklist(user); } LogicalResult GreedyPatternRewriteDriver::notifyMatchFailure( diff --git a/mlir/test/Conversion/MathToLibm/convert-to-libm.mlir b/mlir/test/Conversion/MathToLibm/convert-to-libm.mlir index f0c4512cbfdcc..4837a0cce634a 100644 --- a/mlir/test/Conversion/MathToLibm/convert-to-libm.mlir +++ b/mlir/test/Conversion/MathToLibm/convert-to-libm.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s -convert-math-to-libm -canonicalize | FileCheck %s -// CHECK-DAG: @acos(f64) -> f64 attributes {llvm.readnone} -// CHECK-DAG: @acosf(f32) -> f32 attributes {llvm.readnone} // CHECK-DAG: @atan(f64) -> f64 attributes {llvm.readnone} // CHECK-DAG: @atanf(f32) -> f32 attributes {llvm.readnone} // CHECK-DAG: @erf(f64) -> f64 attributes {llvm.readnone} @@ -31,43 +29,6 @@ // CHECK-DAG: @ceil(f64) -> f64 attributes {llvm.readnone} // CHECK-DAG: @ceilf(f32) -> f32 attributes {llvm.readnone} -// CHECK-LABEL: func @acos_caller -// CHECK-SAME: %[[FLOAT:.*]]: f32 -// CHECK-SAME: %[[DOUBLE:.*]]: f64 -func.func @acos_caller(%float: f32, %double: f64) -> (f32, f64) { - // CHECK-DAG: %[[FLOAT_RESULT:.*]] = call @acosf(%[[FLOAT]]) : (f32) -> f32 - %float_result = math.acos %float : f32 - // CHECK-DAG: %[[DOUBLE_RESULT:.*]] = call @acos(%[[DOUBLE]]) : (f64) -> f64 - %double_result = math.acos %double : f64 - // CHECK: return %[[FLOAT_RESULT]], %[[DOUBLE_RESULT]] - return %float_result, %double_result : f32, f64 -} - -// CHECK-LABEL: func @acos_vec_caller( -// CHECK-SAME: %[[VAL_0:.*]]: vector<2xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: vector<2xf64>) -> (vector<2xf32>, vector<2xf64>) { -// CHECK-DAG: %[[CVF:.*]] = arith.constant dense<0.000000e+00> : vector<2xf32> -// CHECK-DAG: %[[CVD:.*]] = arith.constant dense<0.000000e+00> : vector<2xf64> -// CHECK: %[[IN0_F32:.*]] = vector.extract %[[VAL_0]][0] : f32 from vector<2xf32> -// CHECK: %[[OUT0_F32:.*]] = call @acosf(%[[IN0_F32]]) : (f32) -> f32 -// CHECK: %[[VAL_8:.*]] = vector.insert %[[OUT0_F32]], %[[CVF]] [0] : f32 into vector<2xf32> -// CHECK: %[[IN1_F32:.*]] = vector.extract %[[VAL_0]][1] : f32 from vector<2xf32> -// CHECK: %[[OUT1_F32:.*]] = call @acosf(%[[IN1_F32]]) : (f32) -> f32 -// CHECK: %[[VAL_11:.*]] = vector.insert %[[OUT1_F32]], %[[VAL_8]] [1] : f32 into vector<2xf32> -// CHECK: %[[IN0_F64:.*]] = vector.extract %[[VAL_1]][0] : f64 from vector<2xf64> -// CHECK: 
%[[OUT0_F64:.*]] = call @acos(%[[IN0_F64]]) : (f64) -> f64
-// CHECK: %[[VAL_14:.*]] = vector.insert %[[OUT0_F64]], %[[CVD]] [0] : f64 into vector<2xf64>
-// CHECK: %[[IN1_F64:.*]] = vector.extract %[[VAL_1]][1] : f64 from vector<2xf64>
-// CHECK: %[[OUT1_F64:.*]] = call @acos(%[[IN1_F64]]) : (f64) -> f64
-// CHECK: %[[VAL_17:.*]] = vector.insert %[[OUT1_F64]], %[[VAL_14]] [1] : f64 into vector<2xf64>
-// CHECK: return %[[VAL_11]], %[[VAL_17]] : vector<2xf32>, vector<2xf64>
-// CHECK: }
-func.func @acos_vec_caller(%float: vector<2xf32>, %double: vector<2xf64>) -> (vector<2xf32>, vector<2xf64>) {
-  %float_result = math.acos %float : vector<2xf32>
-  %double_result = math.acos %double : vector<2xf64>
-  return %float_result, %double_result : vector<2xf32>, vector<2xf64>
-}
-
 // CHECK-LABEL: func @atan_caller
 // CHECK-SAME: %[[FLOAT:.*]]: f32
 // CHECK-SAME: %[[DOUBLE:.*]]: f64
diff --git a/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir b/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir
index 28b7004300594..ab0c78a8ba766 100644
--- a/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir
+++ b/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir
@@ -1494,23 +1494,3 @@ func.func @extract_strided_metadata_of_cast_unranked(
     index, index, index, index
 }
-
-
-// -----
-memref.global "private" @dynamicShmem : memref<0xf16,3>
-
-// CHECK-LABEL: func @zero_sized_memred
-func.func @zero_sized_memred(%arg0: f32) -> (memref<f16,3>, index,index,index) {
-  %c0 = arith.constant 0 : index
-  %dynamicMem = memref.get_global @dynamicShmem : memref<0xf16, 3>
-
-  // CHECK: %[[BASE:.*]] = memref.get_global @dynamicShmem : memref<0xf16, 3>
-  // CHECK: %[[CAST:.*]] = memref.reinterpret_cast %[[BASE]] to offset: [0], sizes: [], strides: [] : memref<0xf16, 3> to memref<f16, 3>
-  // CHECK: return %[[CAST]]
-
-  %base_buffer, %offset, %sizes, %strides = memref.extract_strided_metadata %dynamicMem : memref<0xf16, 3> -> memref<f16, 3>, index, index, index
-  return %base_buffer, %offset,
-         %sizes, %strides :
-      memref<f16, 3>, index,
-      index, index
-}
\ No newline at end of file
diff --git a/mlir/test/Dialect/SparseTensor/sparse_conv_2d_slice_based.mlir b/mlir/test/Dialect/SparseTensor/sparse_conv_2d_slice_based.mlir
index a3c1e76a3d09a..02cc5d1e2ef34 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_conv_2d_slice_based.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_conv_2d_slice_based.mlir
@@ -12,226 +12,241 @@
 // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3xi32>) -> tensor<6x6xi32, #sparse> {
 // CHECK-DAG: %[[VAL_2:.*]] = arith.constant true
 // CHECK-DAG: %[[VAL_3:.*]] = arith.constant -2 : index
-// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 4 : index
 // CHECK-DAG: %[[VAL_5:.*]] = arith.constant 8 : index
 // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 3 : index
 // CHECK-DAG: %[[VAL_7:.*]] = arith.constant 1 : index
-// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 0 : i32
-// CHECK-DAG: %[[VAL_10:.*]] = arith.constant false
-// CHECK-DAG: %[[VAL_11:.*]] = tensor.empty() : tensor<6x6xi32, #sparse>
-// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<8x8xi32, #sparse> to memref<?xindex>
-// CHECK-DAG: %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<8x8xi32, #sparse> to memref<?xindex>
-// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<8x8xi32, #sparse> to memref<?xindex>
-// CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<8x8xi32, #sparse> to memref<?xindex>
-// CHECK-DAG: %[[VAL_16:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8x8xi32, #sparse> to memref<?xi32>
-// CHECK-DAG: %[[VAL_17:.*]] = memref.alloca() : memref<9xindex>
-// CHECK-DAG: %[[VAL_18:.*]] = memref.alloca() : memref<3xindex>
-// CHECK-DAG: %[[VAL_19:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_7]]] : memref<?xindex>
-// CHECK: memref.store %[[VAL_8]], %[[VAL_18]]{{\[}}%[[VAL_8]]] : memref<3xindex>
-// CHECK: memref.store %[[VAL_19]], %[[VAL_18]]{{\[}}%[[VAL_7]]] : memref<3xindex>
-// CHECK: %[[VAL_20:.*]] = arith.cmpi ugt, %[[VAL_19]], %[[VAL_8]] : index
-// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_8]]] : memref<?xindex>
-// CHECK: %[[VAL_22:.*]] = arith.cmpi uge, %[[VAL_21]], %[[VAL_6]] : index
-// CHECK: %[[VAL_23:.*]] = arith.andi %[[VAL_20]], %[[VAL_22]] : i1
-// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_21]], %[[VAL_3]] : index
-// CHECK: %[[VAL_25:.*]] = arith.select %[[VAL_23]], %[[VAL_24]], %[[VAL_8]] : index
-// CHECK: %[[VAL_26:.*]]:3 = scf.while (%[[VAL_27:.*]] = %[[VAL_20]], %[[VAL_28:.*]] = %[[VAL_21]], %[[VAL_29:.*]] = %[[VAL_25]], %[[VAL_30:.*]] = %[[VAL_11]]) : (i1, index, index, tensor<6x6xi32, #sparse>) -> (index, index, tensor<6x6xi32, #sparse>) {
-// CHECK: scf.condition(%[[VAL_27]]) %[[VAL_28]], %[[VAL_29]], %[[VAL_30]] : index, index, tensor<6x6xi32, #sparse>
+// CHECK-DAG: %[[VAL_8:.*]] = arith.constant 5 : index
+// CHECK-DAG: %[[VAL_9:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[VAL_10:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_11:.*]] = arith.constant 0 : i32
+// CHECK-DAG: %[[VAL_12:.*]] = arith.constant false
+// CHECK-DAG: %[[VAL_13:.*]] = tensor.empty() : tensor<6x6xi32, #sparse>
+// CHECK-DAG: %[[VAL_14:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<8x8xi32, #sparse> to memref<?xindex>
+// CHECK-DAG: %[[VAL_15:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<8x8xi32, #sparse> to memref<?xindex>
+// CHECK-DAG: %[[VAL_16:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<8x8xi32, #sparse> to memref<?xindex>
+// CHECK-DAG: %[[VAL_17:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<8x8xi32, #sparse> to memref<?xindex>
+// CHECK-DAG: %[[VAL_18:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8x8xi32, #sparse> to memref<?xi32>
+// CHECK: %[[VAL_19:.*]] = memref.alloca() : memref<11xindex>
+// CHECK: %[[VAL_20:.*]] = memref.alloca() : memref<5xindex>
+// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_7]]] : memref<?xindex>
+// CHECK: memref.store %[[VAL_10]], %[[VAL_20]]{{\[}}%[[VAL_7]]] : memref<5xindex>
+// CHECK: memref.store %[[VAL_10]], %[[VAL_20]]{{\[}}%[[VAL_9]]] : memref<5xindex>
+// CHECK: memref.store %[[VAL_21]], %[[VAL_20]]{{\[}}%[[VAL_6]]] : memref<5xindex>
+// CHECK: %[[VAL_22:.*]] = arith.cmpi ugt, %[[VAL_21]], %[[VAL_10]] : index
+// CHECK: %[[VAL_23:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_10]]] : memref<?xindex>
+// CHECK: %[[VAL_24:.*]] = arith.cmpi uge, %[[VAL_23]], %[[VAL_6]] : index
+// CHECK: %[[VAL_25:.*]] = arith.andi %[[VAL_22]], %[[VAL_24]] : i1
+// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_23]], %[[VAL_3]] : index
+// CHECK: %[[VAL_27:.*]] = arith.select %[[VAL_25]], %[[VAL_26]], %[[VAL_10]] : index
+// CHECK: %[[VAL_28:.*]]:3 = scf.while (%[[VAL_29:.*]] = %[[VAL_22]], %[[VAL_30:.*]] = %[[VAL_23]], %[[VAL_31:.*]] = %[[VAL_27]], %[[VAL_32:.*]] = %[[VAL_13]]) : (i1, index, index, tensor<6x6xi32, #sparse>) -> (index, index, tensor<6x6xi32, #sparse>) {
+// CHECK: scf.condition(%[[VAL_29]]) %[[VAL_30]], %[[VAL_31]], %[[VAL_32]] : index, index, tensor<6x6xi32, #sparse>
 // CHECK: } do {
-// CHECK: ^bb0(%[[VAL_31:.*]]: index, %[[VAL_32:.*]]: index, %[[VAL_33:.*]]: tensor<6x6xi32, #sparse>):
-// CHECK: %[[VAL_34:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_8]]] : memref<3xindex>
-// CHECK: %[[VAL_35:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_7]]] : memref<3xindex>
-// CHECK: memref.store %[[VAL_8]], %[[VAL_18]]{{\[}}%[[VAL_4]]] : memref<3xindex>
-// CHECK: %[[VAL_36:.*]] = arith.addi %[[VAL_32]], %[[VAL_6]] : index
-// CHECK: %[[VAL_37:.*]]:5 = scf.while (%[[VAL_38:.*]] = %[[VAL_34]], %[[VAL_39:.*]] = %[[VAL_10]], %[[VAL_40:.*]] = %[[VAL_5]], %[[VAL_41:.*]] = %[[VAL_8]], %[[VAL_42:.*]] = %[[VAL_8]]) : (index, i1, index, index, index) -> (index, i1, index, index, index) {
-// CHECK: %[[VAL_43:.*]] = arith.cmpi ult, %[[VAL_38]], %[[VAL_35]] : index
-// CHECK: %[[VAL_44:.*]] = scf.if %[[VAL_43]] -> (i1) {
-// CHECK: %[[VAL_45:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_38]]] : memref<?xindex>
-// CHECK: %[[VAL_46:.*]] = arith.cmpi ult, %[[VAL_45]], %[[VAL_36]] : index
-// CHECK: scf.yield %[[VAL_46]] : i1
+// CHECK: ^bb0(%[[VAL_33:.*]]: index, %[[VAL_34:.*]]: index, %[[VAL_35:.*]]: tensor<6x6xi32, #sparse>):
+// CHECK: %[[VAL_36:.*]] = memref.load %[[VAL_20]]{{\[}}%[[VAL_9]]] : memref<5xindex>
+// CHECK: %[[VAL_37:.*]] = memref.load %[[VAL_20]]{{\[}}%[[VAL_6]]] : memref<5xindex>
+// CHECK: memref.store %[[VAL_10]], %[[VAL_20]]{{\[}}%[[VAL_4]]] : memref<5xindex>
+// CHECK: %[[VAL_38:.*]] = arith.addi %[[VAL_34]], %[[VAL_6]] : index
+// CHECK: %[[VAL_39:.*]]:5 = scf.while (%[[VAL_40:.*]] = %[[VAL_36]], %[[VAL_41:.*]] = %[[VAL_12]], %[[VAL_42:.*]] = %[[VAL_5]], %[[VAL_43:.*]] = %[[VAL_10]], %[[VAL_44:.*]] = %[[VAL_10]]) : (index, i1, index, index, index) -> (index, i1, index, index, index) {
+// CHECK: %[[VAL_45:.*]] = arith.cmpi ult, %[[VAL_40]], %[[VAL_37]] : index
+// CHECK: %[[VAL_46:.*]] = scf.if %[[VAL_45]] -> (i1) {
+// CHECK: %[[VAL_47:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_40]]] : memref<?xindex>
+// CHECK: %[[VAL_48:.*]] = arith.cmpi ult, %[[VAL_47]], %[[VAL_38]] : index
+// CHECK: scf.yield %[[VAL_48]] : i1
 // CHECK: } else {
-// CHECK: scf.yield %[[VAL_10]] : i1
+// CHECK: scf.yield %[[VAL_12]] : i1
 // CHECK: }
-// CHECK: scf.condition(%[[VAL_44]]) %[[VAL_38]], %[[VAL_39]], %[[VAL_40]], %[[VAL_41]], %[[VAL_42]] : index, i1, index, index, index
+// CHECK: scf.condition(%[[VAL_46]]) %[[VAL_40]], %[[VAL_41]], %[[VAL_42]], %[[VAL_43]], %[[VAL_44]] : index, i1, index, index, index
 // CHECK: } do {
-// CHECK: ^bb0(%[[VAL_47:.*]]: index, %[[VAL_48:.*]]: i1, %[[VAL_49:.*]]: index, %[[VAL_50:.*]]: index, %[[VAL_51:.*]]: index):
-// CHECK: %[[VAL_52:.*]] = arith.addi %[[VAL_47]], %[[VAL_7]] : index
-// CHECK: %[[VAL_53:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_47]]] : memref<?xindex>
-// CHECK: %[[VAL_54:.*]] = memref.load %[[VAL_14]]{{\[}}%[[VAL_52]]] : memref<?xindex>
-// CHECK: %[[VAL_55:.*]] = arith.cmpi ult, %[[VAL_53]], %[[VAL_54]] : index
-// CHECK: %[[VAL_56:.*]] = arith.ori %[[VAL_55]], %[[VAL_48]] : i1
-// CHECK: %[[VAL_57:.*]] = scf.if %[[VAL_55]] -> (index) {
-// CHECK: %[[VAL_58:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_53]]] : memref<?xindex>
-// CHECK: %[[VAL_59:.*]] = arith.cmpi ult, %[[VAL_58]], %[[VAL_49]] : index
-// CHECK: %[[VAL_60:.*]] = arith.select %[[VAL_59]], %[[VAL_58]], %[[VAL_49]] : index
-// CHECK: scf.yield %[[VAL_60]] : index
+// CHECK: ^bb0(%[[VAL_49:.*]]: index, %[[VAL_50:.*]]: i1, %[[VAL_51:.*]]: index, %[[VAL_52:.*]]: index, %[[VAL_53:.*]]: index):
+// CHECK: %[[VAL_54:.*]] = arith.addi %[[VAL_49]], %[[VAL_7]] : index
+// CHECK: %[[VAL_55:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_49]]] : memref<?xindex>
+// CHECK: %[[VAL_56:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_54]]] : memref<?xindex>
+// CHECK: %[[VAL_57:.*]] = arith.cmpi ult, %[[VAL_55]], %[[VAL_56]] : index
+// CHECK: %[[VAL_58:.*]] = arith.ori %[[VAL_57]], %[[VAL_50]] : i1
+// CHECK: %[[VAL_59:.*]] = scf.if %[[VAL_57]] -> (index) {
+// CHECK: %[[VAL_60:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_55]]] : memref<?xindex>
+// CHECK: %[[VAL_61:.*]] = arith.cmpi ult, %[[VAL_60]], %[[VAL_51]] : index
+// CHECK: %[[VAL_62:.*]] = arith.select %[[VAL_61]], %[[VAL_60]], %[[VAL_51]] : index
+// CHECK: scf.yield %[[VAL_62]] : index
 // CHECK: } else {
-// CHECK: scf.yield %[[VAL_49]] : index
+// CHECK: scf.yield %[[VAL_51]] : index
 // CHECK: }
-// CHECK: memref.store %[[VAL_53]], %[[VAL_17]]{{\[}}%[[VAL_50]]] : memref<9xindex>
-// CHECK: %[[VAL_61:.*]] = arith.addi %[[VAL_50]], %[[VAL_6]] : index
-// CHECK: memref.store %[[VAL_54]], %[[VAL_17]]{{\[}}%[[VAL_61]]] : memref<9xindex>
-// CHECK: %[[VAL_62:.*]] = arith.addi %[[VAL_50]], %[[VAL_7]] : index
-// CHECK: %[[VAL_63:.*]] = arith.addi %[[VAL_51]], %[[VAL_7]] : index
-// CHECK: scf.yield %[[VAL_52]], %[[VAL_56]], %[[VAL_57]], %[[VAL_62]], %[[VAL_63]] : index, i1, index, index, index
+// CHECK: %[[VAL_63:.*]] = arith.addi %[[VAL_52]], %[[VAL_9]] : index
+// CHECK: memref.store %[[VAL_55]], %[[VAL_19]]{{\[}}%[[VAL_63]]] : memref<11xindex>
+// CHECK: %[[VAL_64:.*]] = arith.addi %[[VAL_52]], %[[VAL_8]] : index
+// CHECK: memref.store %[[VAL_56]], %[[VAL_19]]{{\[}}%[[VAL_64]]] : memref<11xindex>
+// CHECK: %[[VAL_65:.*]] = arith.addi %[[VAL_52]], %[[VAL_7]] : index
+// CHECK: %[[VAL_66:.*]] = arith.addi %[[VAL_53]], %[[VAL_7]] : index
+// CHECK: scf.yield %[[VAL_54]], %[[VAL_58]], %[[VAL_59]], %[[VAL_65]], %[[VAL_66]] : index, i1, index, index, index
 // CHECK: }
-// CHECK: %[[VAL_64:.*]] = arith.cmpi uge, %[[VAL_65:.*]]#2, %[[VAL_6]] : index
-// CHECK: %[[VAL_66:.*]] = arith.andi %[[VAL_65]]#1, %[[VAL_64]] : i1
-// CHECK: %[[VAL_67:.*]] = arith.addi %[[VAL_65]]#2, %[[VAL_3]] : index
-// CHECK: %[[VAL_68:.*]] = arith.select %[[VAL_66]], %[[VAL_67]], %[[VAL_8]] : index
-// CHECK: %[[VAL_69:.*]]:3 = scf.while (%[[VAL_70:.*]] = %[[VAL_65]]#1, %[[VAL_71:.*]] = %[[VAL_65]]#2, %[[VAL_72:.*]] = %[[VAL_68]], %[[VAL_73:.*]] = %[[VAL_33]]) : (i1, index, index, tensor<6x6xi32, #sparse>) -> (index, index, tensor<6x6xi32, #sparse>) {
-// CHECK: scf.condition(%[[VAL_70]]) %[[VAL_71]], %[[VAL_72]], %[[VAL_73]] : index, index, tensor<6x6xi32, #sparse>
+// CHECK: memref.store %[[VAL_10]], %[[VAL_19]]{{\[}}%[[VAL_7]]] : memref<11xindex>
+// CHECK: %[[VAL_67:.*]] = arith.cmpi uge, %[[VAL_68:.*]]#2, %[[VAL_6]] : index
+// CHECK: %[[VAL_69:.*]] = arith.andi %[[VAL_68]]#1, %[[VAL_67]] : i1
+// CHECK: %[[VAL_70:.*]] = arith.addi %[[VAL_68]]#2, %[[VAL_3]] : index
+// CHECK: %[[VAL_71:.*]] = arith.select %[[VAL_69]], %[[VAL_70]], %[[VAL_10]] : index
+// CHECK: %[[VAL_72:.*]]:3 = scf.while (%[[VAL_73:.*]] = %[[VAL_68]]#1, %[[VAL_74:.*]] = %[[VAL_68]]#2, %[[VAL_75:.*]] = %[[VAL_71]], %[[VAL_76:.*]] = %[[VAL_35]]) : (i1, index, index, tensor<6x6xi32, #sparse>) -> (index, index, tensor<6x6xi32, #sparse>) {
+// CHECK: scf.condition(%[[VAL_73]]) %[[VAL_74]], %[[VAL_75]], %[[VAL_76]] : index, index, tensor<6x6xi32, #sparse>
 // CHECK: } do {
-// CHECK: ^bb0(%[[VAL_74:.*]]: index, %[[VAL_75:.*]]: index, %[[VAL_76:.*]]: tensor<6x6xi32, #sparse>):
-// CHECK: %[[VAL_77:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_8]]] : memref<3xindex>
-// CHECK: %[[VAL_78:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_7]]] : memref<3xindex>
-// CHECK: %[[VAL_79:.*]]:3 = scf.while (%[[VAL_80:.*]] = %[[VAL_77]], %[[VAL_81:.*]] = %[[VAL_9]], %[[VAL_82:.*]] = %[[VAL_10]]) : (index, i32, i1) -> (index, i32, i1) {
-// CHECK: %[[VAL_83:.*]] = arith.cmpi ult, %[[VAL_80]], %[[VAL_78]] : index
-// CHECK: %[[VAL_84:.*]] = scf.if %[[VAL_83]] -> (i1) {
-// CHECK: %[[VAL_85:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_80]]] : memref<?xindex>
-// CHECK: %[[VAL_86:.*]] = arith.cmpi ult, %[[VAL_85]], %[[VAL_36]] : index
-// CHECK: scf.yield %[[VAL_86]] : i1
+// CHECK: ^bb0(%[[VAL_77:.*]]: index, %[[VAL_78:.*]]: index, %[[VAL_79:.*]]: tensor<6x6xi32, #sparse>):
+// CHECK: %[[VAL_80:.*]] = memref.load %[[VAL_20]]{{\[}}%[[VAL_7]]] : memref<5xindex>
+// CHECK: %[[VAL_81:.*]] = arith.addi %[[VAL_80]], %[[VAL_9]] : index
+// CHECK: %[[VAL_82:.*]] = memref.load %[[VAL_20]]{{\[}}%[[VAL_81]]] : memref<5xindex>
+// CHECK: %[[VAL_83:.*]] = arith.addi %[[VAL_80]], %[[VAL_6]] : index
+// CHECK: %[[VAL_84:.*]] = memref.load %[[VAL_20]]{{\[}}%[[VAL_83]]] : memref<5xindex>
+// CHECK: %[[VAL_85:.*]]:3 = scf.while (%[[VAL_86:.*]] = %[[VAL_82]], %[[VAL_87:.*]] = %[[VAL_11]], %[[VAL_88:.*]] = %[[VAL_12]]) : (index, i32, i1) -> (index, i32, i1) {
+// CHECK: %[[VAL_89:.*]] = arith.cmpi ult, %[[VAL_86]], %[[VAL_84]] : index
+// CHECK: %[[VAL_90:.*]] = scf.if %[[VAL_89]] -> (i1) {
+// CHECK: %[[VAL_91:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_86]]] : memref<?xindex>
+// CHECK: %[[VAL_92:.*]] = arith.cmpi ult, %[[VAL_91]], %[[VAL_38]] : index
+// CHECK: scf.yield %[[VAL_92]] : i1
 // CHECK: } else {
-// CHECK: scf.yield %[[VAL_10]] : i1
+// CHECK: scf.yield %[[VAL_12]] : i1
 // CHECK: }
-// CHECK: scf.condition(%[[VAL_84]]) %[[VAL_80]], %[[VAL_81]], %[[VAL_82]] : index, i32, i1
+// CHECK: scf.condition(%[[VAL_90]]) %[[VAL_86]], %[[VAL_87]], %[[VAL_88]] : index, i32, i1
 // CHECK: } do {
-// CHECK: ^bb0(%[[VAL_87:.*]]: index, %[[VAL_88:.*]]: i32, %[[VAL_89:.*]]: i1):
-// CHECK: %[[VAL_90:.*]] = arith.subi %[[VAL_87]], %[[VAL_77]] : index
-// CHECK: %[[VAL_91:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_87]]] : memref<?xindex>
-// CHECK: %[[VAL_92:.*]] = arith.subi %[[VAL_91]], %[[VAL_32]] : index
-// CHECK: %[[VAL_93:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_90]]] : memref<9xindex>
-// CHECK: %[[VAL_94:.*]] = arith.addi %[[VAL_90]], %[[VAL_6]] : index
-// CHECK: %[[VAL_95:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_94]]] : memref<9xindex>
-// CHECK: %[[VAL_96:.*]] = arith.addi %[[VAL_75]], %[[VAL_6]] : index
-// CHECK: %[[VAL_97:.*]]:2 = scf.while (%[[VAL_98:.*]] = %[[VAL_93]], %[[VAL_99:.*]] = %[[VAL_88]]) : (index, i32) -> (index, i32) {
-// CHECK: %[[VAL_100:.*]] = arith.cmpi ult, %[[VAL_98]], %[[VAL_95]] : index
-// CHECK: %[[VAL_101:.*]] = scf.if %[[VAL_100]] -> (i1) {
-// CHECK: %[[VAL_102:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_98]]] : memref<?xindex>
-// CHECK: %[[VAL_103:.*]] = arith.cmpi ult, %[[VAL_102]], %[[VAL_96]] : index
-// CHECK: scf.yield %[[VAL_103]] : i1
+// CHECK: ^bb0(%[[VAL_93:.*]]: index, %[[VAL_94:.*]]: i32, %[[VAL_95:.*]]: i1):
+// CHECK: %[[VAL_96:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_93]]] : memref<?xindex>
+// CHECK: %[[VAL_97:.*]] = arith.subi %[[VAL_96]], %[[VAL_34]] : index
+// CHECK: %[[VAL_98:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_7]]] : memref<11xindex>
+// CHECK: %[[VAL_99:.*]] = arith.addi %[[VAL_98]], %[[VAL_9]] : index
+// CHECK: %[[VAL_100:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_99]]] : memref<11xindex>
+// CHECK: %[[VAL_101:.*]] = arith.addi %[[VAL_98]], %[[VAL_8]] : index
+// CHECK: %[[VAL_102:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_101]]] : memref<11xindex>
+// CHECK: %[[VAL_103:.*]] = arith.addi %[[VAL_78]], %[[VAL_6]] : index
+// CHECK: %[[VAL_104:.*]]:2 = scf.while (%[[VAL_105:.*]] = %[[VAL_100]], %[[VAL_106:.*]] = %[[VAL_94]]) : (index, i32) -> (index, i32) {
+// CHECK: %[[VAL_107:.*]] = arith.cmpi ult, %[[VAL_105]], %[[VAL_102]] : index
+// CHECK: %[[VAL_108:.*]] = scf.if %[[VAL_107]] -> (i1) {
+// CHECK: %[[VAL_109:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_105]]] : memref<?xindex>
+// CHECK: %[[VAL_110:.*]] = arith.cmpi ult, %[[VAL_109]], %[[VAL_103]] : index
+// CHECK: scf.yield %[[VAL_110]] : i1
 // CHECK: } else {
-// CHECK: scf.yield %[[VAL_10]] : i1
+// CHECK: scf.yield %[[VAL_12]] : i1
 // CHECK: }
-// CHECK: scf.condition(%[[VAL_101]]) %[[VAL_98]], %[[VAL_99]] : index, i32
+// CHECK: scf.condition(%[[VAL_108]]) %[[VAL_105]], %[[VAL_106]] : index, i32
 // CHECK: } do {
-// CHECK: ^bb0(%[[VAL_104:.*]]: index, %[[VAL_105:.*]]: i32):
-// CHECK: %[[VAL_106:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_104]]] : memref<?xindex>
-// CHECK: %[[VAL_107:.*]] = arith.subi %[[VAL_106]], %[[VAL_75]] : index
-// CHECK: %[[VAL_108:.*]] = memref.load %[[VAL_16]]{{\[}}%[[VAL_104]]] : memref<?xi32>
-// CHECK: %[[VAL_109:.*]] = tensor.extract %[[VAL_1]]{{\[}}%[[VAL_92]], %[[VAL_107]]] : tensor<3x3xi32>
-// CHECK: %[[VAL_110:.*]] = arith.muli %[[VAL_108]], %[[VAL_109]] : i32
-// CHECK: %[[VAL_111:.*]] = arith.addi %[[VAL_105]], %[[VAL_110]] : i32
-// CHECK: %[[VAL_112:.*]] = arith.addi %[[VAL_104]], %[[VAL_7]] : index
-// CHECK: scf.yield %[[VAL_112]], %[[VAL_111]] : index, i32
+// CHECK: ^bb0(%[[VAL_111:.*]]: index, %[[VAL_112:.*]]: i32):
+// CHECK: %[[VAL_113:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_111]]] : memref<?xindex>
+// CHECK: %[[VAL_114:.*]] = arith.subi %[[VAL_113]], %[[VAL_78]] : index
+// CHECK: %[[VAL_115:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_111]]] : memref<?xi32>
+// CHECK: %[[VAL_116:.*]] = tensor.extract %[[VAL_1]]{{\[}}%[[VAL_97]], %[[VAL_114]]] : tensor<3x3xi32>
+// CHECK: %[[VAL_117:.*]] = arith.muli %[[VAL_115]], %[[VAL_116]] : i32
+// CHECK: %[[VAL_118:.*]] = arith.addi %[[VAL_112]], %[[VAL_117]] : i32
+// CHECK: %[[VAL_119:.*]] = arith.addi %[[VAL_111]], %[[VAL_7]] : index
+// CHECK: scf.yield %[[VAL_119]], %[[VAL_118]] : index, i32
 // CHECK: }
-// CHECK: %[[VAL_113:.*]] = arith.addi %[[VAL_87]], %[[VAL_7]] : index
-// CHECK: scf.yield %[[VAL_113]], %[[VAL_114:.*]]#1, %[[VAL_2]] : index, i32, i1
+// CHECK: %[[VAL_120:.*]] = arith.addi %[[VAL_93]], %[[VAL_7]] : index
+// CHECK: %[[VAL_121:.*]] = arith.addi %[[VAL_98]], %[[VAL_7]] : index
+// CHECK: memref.store %[[VAL_121]], %[[VAL_19]]{{\[}}%[[VAL_7]]] : memref<11xindex>
+// CHECK: scf.yield %[[VAL_120]], %[[VAL_122:.*]]#1, %[[VAL_2]] : index, i32, i1
 // CHECK: }
-// CHECK: %[[VAL_115:.*]] = scf.if %[[VAL_116:.*]]#2 -> (tensor<6x6xi32, #sparse>) {
-// CHECK: %[[VAL_117:.*]] = sparse_tensor.insert %[[VAL_116]]#1 into %[[VAL_76]]{{\[}}%[[VAL_32]], %[[VAL_75]]] : tensor<6x6xi32, #sparse>
-// CHECK: scf.yield %[[VAL_117]] : tensor<6x6xi32, #sparse>
+// CHECK: %[[VAL_123:.*]] = scf.if %[[VAL_124:.*]]#2 -> (tensor<6x6xi32, #sparse>) {
+// CHECK: %[[VAL_125:.*]] = sparse_tensor.insert %[[VAL_124]]#1 into %[[VAL_79]]{{\[}}%[[VAL_34]], %[[VAL_78]]] : tensor<6x6xi32, #sparse>
+// CHECK: scf.yield %[[VAL_125]] : tensor<6x6xi32, #sparse>
 // CHECK: } else {
-// CHECK: scf.yield %[[VAL_76]] : tensor<6x6xi32, #sparse>
+// CHECK: scf.yield %[[VAL_79]] : tensor<6x6xi32, #sparse>
 // CHECK: }
-// CHECK: %[[VAL_118:.*]] = arith.cmpi ugt, %[[VAL_74]], %[[VAL_75]] : index
-// CHECK: %[[VAL_119:.*]]:3 = scf.if %[[VAL_118]] -> (index, i1, index) {
-// CHECK: %[[VAL_120:.*]] = arith.addi %[[VAL_75]], %[[VAL_7]] : index
-// CHECK: scf.yield %[[VAL_74]], %[[VAL_2]], %[[VAL_120]] : index, i1, index
+// CHECK: memref.store %[[VAL_10]], %[[VAL_20]]{{\[}}%[[VAL_7]]] : memref<5xindex>
+// CHECK: memref.store %[[VAL_10]], %[[VAL_19]]{{\[}}%[[VAL_7]]] : memref<11xindex>
+// CHECK: %[[VAL_126:.*]] = arith.cmpi ugt, %[[VAL_77]], %[[VAL_78]] : index
+// CHECK: %[[VAL_127:.*]]:3 = scf.if %[[VAL_126]] -> (index, i1, index) {
+// CHECK: %[[VAL_128:.*]] = arith.addi %[[VAL_78]], %[[VAL_7]] : index
+// CHECK: scf.yield %[[VAL_77]], %[[VAL_2]], %[[VAL_128]] : index, i1, index
 // CHECK: } else {
-// CHECK: %[[VAL_121:.*]]:2 = scf.for %[[VAL_122:.*]] = %[[VAL_8]] to %[[VAL_65]]#3 step %[[VAL_7]] iter_args(%[[VAL_123:.*]] = %[[VAL_5]], %[[VAL_124:.*]] = %[[VAL_10]]) -> (index, i1) {
-// CHECK: %[[VAL_125:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_122]]] : memref<9xindex>
-// CHECK: %[[VAL_126:.*]] = arith.addi %[[VAL_122]], %[[VAL_6]] : index
-// CHECK: %[[VAL_127:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_126]]] : memref<9xindex>
-// CHECK: %[[VAL_128:.*]] = arith.cmpi ult, %[[VAL_125]], %[[VAL_127]] : index
-// CHECK: %[[VAL_129:.*]] = scf.if %[[VAL_128]] -> (index) {
-// CHECK: %[[VAL_130:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_125]]] : memref<?xindex>
-// CHECK: %[[VAL_131:.*]] = arith.cmpi eq, %[[VAL_130]], %[[VAL_74]] : index
-// CHECK: %[[VAL_132:.*]] = scf.if %[[VAL_131]] -> (index) {
-// CHECK: %[[VAL_133:.*]] = arith.addi %[[VAL_125]], %[[VAL_7]] : index
-// CHECK: memref.store %[[VAL_133]], %[[VAL_17]]{{\[}}%[[VAL_122]]] : memref<9xindex>
-// CHECK: scf.yield %[[VAL_133]] : index
+// CHECK: %[[VAL_129:.*]]:2 = scf.for %[[VAL_130:.*]] = %[[VAL_10]] to %[[VAL_68]]#3 step %[[VAL_7]] iter_args(%[[VAL_131:.*]] = %[[VAL_5]], %[[VAL_132:.*]] = %[[VAL_12]]) -> (index, i1) {
+// CHECK: %[[VAL_133:.*]] = arith.addi %[[VAL_130]], %[[VAL_9]] : index
+// CHECK: %[[VAL_134:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_133]]] : memref<11xindex>
+// CHECK: %[[VAL_135:.*]] = arith.addi %[[VAL_130]], %[[VAL_8]] : index
+// CHECK: %[[VAL_136:.*]] = memref.load %[[VAL_19]]{{\[}}%[[VAL_135]]] : memref<11xindex>
+// CHECK: %[[VAL_137:.*]] = arith.cmpi ult, %[[VAL_134]], %[[VAL_136]] : index
+// CHECK: %[[VAL_138:.*]] = scf.if %[[VAL_137]] -> (index) {
+// CHECK: %[[VAL_139:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_134]]] : memref<?xindex>
+// CHECK: %[[VAL_140:.*]] = arith.cmpi eq, %[[VAL_139]], %[[VAL_77]] : index
+// CHECK: %[[VAL_141:.*]] = scf.if %[[VAL_140]] -> (index) {
+// CHECK: %[[VAL_142:.*]] = arith.addi %[[VAL_134]], %[[VAL_7]] : index
+// CHECK: memref.store %[[VAL_142]], %[[VAL_19]]{{\[}}%[[VAL_133]]] : memref<11xindex>
+// CHECK: scf.yield %[[VAL_142]] : index
 // CHECK: } else {
-// CHECK: scf.yield %[[VAL_125]] : index
+// CHECK: scf.yield %[[VAL_134]] : index
 // CHECK: }
-// CHECK: scf.yield %[[VAL_132]] : index
+// CHECK: scf.yield %[[VAL_141]] : index
 // CHECK: } else {
-// CHECK: scf.yield %[[VAL_125]] : index
+// CHECK: scf.yield %[[VAL_134]] : index
 // CHECK: }
-// CHECK: %[[VAL_134:.*]] = arith.cmpi ult, %[[VAL_129]], %[[VAL_127]] : index
-// CHECK: %[[VAL_135:.*]] = scf.if %[[VAL_134]] -> (index) {
-// CHECK: %[[VAL_136:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_129]]] : memref<?xindex>
-// CHECK: scf.yield %[[VAL_136]] : index
+// CHECK: %[[VAL_143:.*]] = arith.cmpi ult, %[[VAL_138]], %[[VAL_136]] : index
+// CHECK: %[[VAL_144:.*]] = scf.if %[[VAL_143]] -> (index) {
+// CHECK: %[[VAL_145:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_138]]] : memref<?xindex>
+// CHECK: scf.yield %[[VAL_145]] : index
 // CHECK: } else {
-// CHECK: scf.yield %[[VAL_123]] : index
+// CHECK: scf.yield %[[VAL_131]] : index
 // CHECK: }
-// CHECK: %[[VAL_137:.*]] = arith.ori %[[VAL_134]], %[[VAL_124]] : i1
-// CHECK: %[[VAL_138:.*]] = arith.cmpi ult, %[[VAL_135]], %[[VAL_123]] : index
-// CHECK: %[[VAL_139:.*]] = arith.select %[[VAL_138]], %[[VAL_135]], %[[VAL_123]] : index
-// CHECK: scf.yield %[[VAL_139]], %[[VAL_137]] : index, i1
+// CHECK: %[[VAL_146:.*]] = arith.ori %[[VAL_143]], %[[VAL_132]] : i1
+// CHECK: %[[VAL_147:.*]] = arith.cmpi ult, %[[VAL_144]], %[[VAL_131]] : index
+// CHECK: %[[VAL_148:.*]] = arith.select %[[VAL_147]], %[[VAL_144]], %[[VAL_131]] : index
+// CHECK: scf.yield %[[VAL_148]], %[[VAL_146]] : index, i1
 // CHECK: }
-// CHECK: %[[VAL_140:.*]] = arith.addi %[[VAL_141:.*]]#0, %[[VAL_7]] : index
-// CHECK: %[[VAL_142:.*]] = arith.addi %[[VAL_141]]#0, %[[VAL_3]] : index
-// CHECK: %[[VAL_143:.*]] = arith.cmpi uge, %[[VAL_140]], %[[VAL_6]] : index
-// CHECK: %[[VAL_144:.*]] = arith.select %[[VAL_143]], %[[VAL_142]], %[[VAL_8]] : index
-// CHECK: scf.yield %[[VAL_141]]#0, %[[VAL_141]]#1, %[[VAL_144]] : index, i1, index
+// CHECK: %[[VAL_149:.*]] = arith.addi %[[VAL_150:.*]]#0, %[[VAL_7]] : index
+// CHECK: %[[VAL_151:.*]] = arith.addi %[[VAL_150]]#0, %[[VAL_3]] : index
+// CHECK: %[[VAL_152:.*]] = arith.cmpi uge, %[[VAL_149]], %[[VAL_6]] : index
+// CHECK: %[[VAL_153:.*]] = arith.select %[[VAL_152]], %[[VAL_151]], %[[VAL_10]] : index
+// CHECK: scf.yield %[[VAL_150]]#0, %[[VAL_150]]#1, %[[VAL_153]] : index, i1, index
 // CHECK: }
-// CHECK: %[[VAL_145:.*]] = arith.addi %[[VAL_75]], %[[VAL_7]] : index
-// CHECK: %[[VAL_146:.*]] = arith.cmpi ugt, %[[VAL_147:.*]]#2, %[[VAL_145]] : index
-// CHECK: %[[VAL_148:.*]] = arith.select %[[VAL_146]], %[[VAL_147]]#2, %[[VAL_145]] : index
-// CHECK: %[[VAL_149:.*]] = arith.addi %[[VAL_148]], %[[VAL_6]] : index
-// CHECK: %[[VAL_150:.*]] = arith.cmpi ule, %[[VAL_149]], %[[VAL_5]] : index
-// CHECK: %[[VAL_151:.*]] = arith.andi %[[VAL_147]]#1, %[[VAL_150]] : i1
-// CHECK: scf.yield %[[VAL_151]], %[[VAL_147]]#0, %[[VAL_148]], %[[VAL_115]] : i1, index, index, tensor<6x6xi32, #sparse>
+// CHECK: %[[VAL_154:.*]] = arith.addi %[[VAL_78]], %[[VAL_7]] : index
+// CHECK: %[[VAL_155:.*]] = arith.cmpi ugt, %[[VAL_156:.*]]#2, %[[VAL_154]] : index
+// CHECK: %[[VAL_157:.*]] = arith.select %[[VAL_155]], %[[VAL_156]]#2, %[[VAL_154]] : index
+// CHECK: %[[VAL_158:.*]] = arith.addi %[[VAL_157]], %[[VAL_6]] : index
+// CHECK: %[[VAL_159:.*]] = arith.cmpi ule, %[[VAL_158]], %[[VAL_5]] : index
+// CHECK: %[[VAL_160:.*]] = arith.andi %[[VAL_156]]#1, %[[VAL_159]] : i1
+// CHECK: scf.yield %[[VAL_160]], %[[VAL_156]]#0, %[[VAL_157]], %[[VAL_123]] : i1, index, index, tensor<6x6xi32, #sparse>
 // CHECK: }
-// CHECK: %[[VAL_152:.*]] = arith.cmpi ugt, %[[VAL_31]], %[[VAL_32]] : index
-// CHECK: %[[VAL_153:.*]]:3 = scf.if %[[VAL_152]] -> (index, i1, index) {
-// CHECK: %[[VAL_154:.*]] = arith.addi %[[VAL_32]], %[[VAL_7]] : index
-// CHECK: scf.yield %[[VAL_31]], %[[VAL_2]], %[[VAL_154]] : index, i1, index
+// CHECK: memref.store %[[VAL_10]], %[[VAL_20]]{{\[}}%[[VAL_7]]] : memref<5xindex>
+// CHECK: %[[VAL_161:.*]] = arith.cmpi ugt, %[[VAL_33]], %[[VAL_34]] : index
+// CHECK: %[[VAL_162:.*]]:3 = scf.if %[[VAL_161]] -> (index, i1, index) {
+// CHECK: %[[VAL_163:.*]] = arith.addi %[[VAL_34]], %[[VAL_7]] : index
+// CHECK: scf.yield %[[VAL_33]], %[[VAL_2]], %[[VAL_163]] : index, i1, index
 // CHECK: } else {
-// CHECK: %[[VAL_155:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_8]]] : memref<3xindex>
-// CHECK: %[[VAL_156:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_7]]] : memref<3xindex>
-// CHECK: %[[VAL_157:.*]] = arith.cmpi ult, %[[VAL_155]], %[[VAL_156]] : index
-// CHECK: %[[VAL_158:.*]] = scf.if %[[VAL_157]] -> (index) {
-// CHECK: %[[VAL_159:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_155]]] : memref<?xindex>
-// CHECK: %[[VAL_160:.*]] = arith.cmpi eq, %[[VAL_159]], %[[VAL_31]] : index
-// CHECK: %[[VAL_161:.*]] = scf.if %[[VAL_160]] -> (index) {
-// CHECK: %[[VAL_162:.*]] = arith.addi %[[VAL_155]], %[[VAL_7]] : index
-// CHECK: memref.store %[[VAL_162]], %[[VAL_18]]{{\[}}%[[VAL_8]]] : memref<3xindex>
-// CHECK: scf.yield %[[VAL_162]] : index
+// CHECK: %[[VAL_164:.*]] = memref.load %[[VAL_20]]{{\[}}%[[VAL_9]]] : memref<5xindex>
+// CHECK: %[[VAL_165:.*]] = memref.load %[[VAL_20]]{{\[}}%[[VAL_6]]] : memref<5xindex>
+// CHECK: %[[VAL_166:.*]] = arith.cmpi ult, %[[VAL_164]], %[[VAL_165]] : index
+// CHECK: %[[VAL_167:.*]] = scf.if %[[VAL_166]] -> (index) {
+// CHECK: %[[VAL_168:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_164]]] : memref<?xindex>
+// CHECK: %[[VAL_169:.*]] = arith.cmpi eq, %[[VAL_168]], %[[VAL_33]] : index
+// CHECK: %[[VAL_170:.*]] = scf.if %[[VAL_169]] -> (index) {
+// CHECK: %[[VAL_171:.*]] = arith.addi %[[VAL_164]], %[[VAL_7]] : index
+// CHECK: memref.store %[[VAL_171]], %[[VAL_20]]{{\[}}%[[VAL_9]]] : memref<5xindex>
+// CHECK: scf.yield %[[VAL_171]] : index
 // CHECK: } else {
-// CHECK: scf.yield %[[VAL_155]] : index
+// CHECK: scf.yield %[[VAL_164]] : index
 // CHECK: }
-// CHECK: scf.yield %[[VAL_161]] : index
+// CHECK: scf.yield %[[VAL_170]] : index
 // CHECK: } else {
-// CHECK: scf.yield %[[VAL_155]] : index
+// CHECK: scf.yield %[[VAL_164]] : index
 // CHECK: }
-// CHECK: %[[VAL_163:.*]] = arith.cmpi ult, %[[VAL_158]], %[[VAL_156]] : index
-// CHECK: %[[VAL_164:.*]] = scf.if %[[VAL_163]] -> (index) {
-// CHECK: %[[VAL_165:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_158]]] : memref<?xindex>
-// CHECK: scf.yield %[[VAL_165]] : index
+// CHECK: %[[VAL_172:.*]] = arith.cmpi ult, %[[VAL_167]], %[[VAL_165]] : index
+// CHECK: %[[VAL_173:.*]] = scf.if %[[VAL_172]] -> (index) {
+// CHECK: %[[VAL_174:.*]] = memref.load %[[VAL_15]]{{\[}}%[[VAL_167]]] : memref<?xindex>
+// CHECK: scf.yield %[[VAL_174]] : index
 // CHECK: } else {
 // CHECK: scf.yield %[[VAL_5]] : index
 // CHECK: }
-// CHECK: %[[VAL_166:.*]] = arith.cmpi ult, %[[VAL_164]], %[[VAL_5]] : index
-// CHECK: %[[VAL_167:.*]] = arith.select %[[VAL_166]], %[[VAL_164]], %[[VAL_5]] : index
-// CHECK: %[[VAL_168:.*]] = arith.addi %[[VAL_167]], %[[VAL_7]] : index
-// CHECK: %[[VAL_169:.*]] = arith.addi %[[VAL_167]], %[[VAL_3]] : index
-// CHECK: %[[VAL_170:.*]] = arith.cmpi uge, %[[VAL_168]], %[[VAL_6]] : index
-// CHECK: %[[VAL_171:.*]] = arith.select %[[VAL_170]], %[[VAL_169]], %[[VAL_8]] : index
-// CHECK: scf.yield %[[VAL_167]], %[[VAL_163]], %[[VAL_171]] : index, i1, index
+// CHECK: %[[VAL_175:.*]] = arith.cmpi ult, %[[VAL_173]], %[[VAL_5]] : index
+// CHECK: %[[VAL_176:.*]] = arith.select %[[VAL_175]], %[[VAL_173]], %[[VAL_5]] : index
+// CHECK: %[[VAL_177:.*]] = arith.addi %[[VAL_176]], %[[VAL_7]] : index
+// CHECK: %[[VAL_178:.*]] = arith.addi %[[VAL_176]], %[[VAL_3]] : index
+// CHECK: %[[VAL_179:.*]] = arith.cmpi uge, %[[VAL_177]], %[[VAL_6]] : index
+// CHECK: %[[VAL_180:.*]] = arith.select %[[VAL_179]], %[[VAL_178]], %[[VAL_10]] : index
+// CHECK: scf.yield %[[VAL_176]], %[[VAL_172]], %[[VAL_180]] : index, i1, index
 // CHECK: }
-// CHECK: %[[VAL_172:.*]] = arith.addi %[[VAL_32]], %[[VAL_7]] : index
-// CHECK: %[[VAL_173:.*]] = arith.cmpi ugt, %[[VAL_174:.*]]#2, %[[VAL_172]] : index
-// CHECK: %[[VAL_175:.*]] = arith.select %[[VAL_173]], %[[VAL_174]]#2, %[[VAL_172]] : index
-// CHECK: %[[VAL_176:.*]] = arith.addi %[[VAL_175]], %[[VAL_6]] : index
-// CHECK: %[[VAL_177:.*]] = arith.cmpi ule, %[[VAL_176]], %[[VAL_5]] : index
-// CHECK: %[[VAL_178:.*]] = arith.andi %[[VAL_174]]#1, %[[VAL_177]] : i1
-// CHECK: scf.yield %[[VAL_178]], %[[VAL_174]]#0, %[[VAL_175]], %[[VAL_179:.*]]#2 : i1, index, index, tensor<6x6xi32, #sparse>
+// CHECK: %[[VAL_181:.*]] = arith.addi %[[VAL_34]], %[[VAL_7]] : index
+// CHECK: %[[VAL_182:.*]] = arith.cmpi ugt, %[[VAL_183:.*]]#2, %[[VAL_181]] : index
+// CHECK: %[[VAL_184:.*]] = arith.select %[[VAL_182]], %[[VAL_183]]#2, %[[VAL_181]] : index
+// CHECK: %[[VAL_185:.*]] = arith.addi %[[VAL_184]], %[[VAL_6]] : index
+// CHECK: %[[VAL_186:.*]] = arith.cmpi ule, %[[VAL_185]], %[[VAL_5]] : index
+// CHECK: %[[VAL_187:.*]] = arith.andi %[[VAL_183]]#1, %[[VAL_186]] : i1
+// CHECK: scf.yield %[[VAL_187]], %[[VAL_183]]#0, %[[VAL_184]], %[[VAL_188:.*]]#2 : i1, index, index, tensor<6x6xi32, #sparse>
 // CHECK: }
-// CHECK: %[[VAL_180:.*]] = sparse_tensor.load %[[VAL_181:.*]]#2 hasInserts : tensor<6x6xi32, #sparse>
-// CHECK: return %[[VAL_180]] : tensor<6x6xi32, #sparse>
+// CHECK: %[[VAL_189:.*]] = sparse_tensor.load %[[VAL_190:.*]]#2 hasInserts : tensor<6x6xi32, #sparse>
+// CHECK: return %[[VAL_189]] : tensor<6x6xi32, #sparse>
 // CHECK: }
 func.func @conv2d_all_sparse_CSR(%arg0: tensor<8x8xi32, #DCSR>,
                                  %arg1: tensor<3x3xi32>) -> tensor<6x6xi32, #DCSR> {
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel
index f775183296636..a973e6541da01 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel
@@ -74,10 +74,7 @@ libc_test(
     name = "uint_test",
     srcs = ["uint_test.cpp"],
     deps = [
-        "//libc:__support_cpp_bit",
         "//libc:__support_cpp_optional",
-        "//libc:__support_cpp_type_traits",
-        "//libc:__support_macros_properties_float",
         "//libc:__support_uint",
     ],
 )